# STUMPY
# Copyright 2019 TD Ameritrade. Released under the terms of the 3-Clause BSD license.
# STUMPY is a trademark of TD Ameritrade IP Company, Inc. All rights reserved.

import math

import numpy as np

from . import config, core
from .maamped import maamped
from .mmparray import mparray
from .mstump import _get_first_mstump_profile, _get_multi_QT, _mstump


def _dask_mstumped(
    dask_client,
    T_A,
    T_B,
    m,
    excl_zone,
    M_T,
    Σ_T,
    μ_Q,
    σ_Q,
    T_subseq_isconstant,
    Q_subseq_isconstant,
    include,
    discords,
):
    """
    Compute the multi-dimensional z-normalized matrix profile with a `dask` cluster

    This is a highly distributed implementation around the Numba JIT-compiled
    parallelized `_mstump` function which computes the multi-dimensional matrix
    profile according to STOMP. Note that only self-joins are supported.

    Parameters
    ----------
    dask_client : client
        A ``dask`` client. Setting up a ``dask`` cluster is beyond the scope of this
        library. Please refer to the ``dask`` documentation.

    T_A : numpy.ndarray
        The time series or sequence for which to compute the multi-dimensional
        matrix profile. Each row in `T_A` represents data from the same
        dimension while each column in `T_A` represents data from a different
        dimension.

    T_B : numpy.ndarray
        The time series or sequence that will be used to annotate T_A. For every
        subsequence in T_A, its nearest neighbor in T_B will be recorded.

    m : int
        Window size

    excl_zone : int
        The half width for the exclusion zone relative to the current
        sliding window

    M_T : numpy.ndarray
        Sliding mean of time series, `T`

    Σ_T : numpy.ndarray
        Sliding standard deviation of time series, `T`

    μ_Q : numpy.ndarray
        Mean of the query sequence, `Q`, relative to the current sliding window

    σ_Q : numpy.ndarray
        Standard deviation of the query sequence, `Q`, relative to the current
        sliding window

    T_subseq_isconstant : numpy.ndarray
        A boolean array representing Rolling isconstant for `T`

    Q_subseq_isconstant : numpy.ndarray
        A boolean array representing Rolling isconstant for `Q`

    include : numpy.ndarray
        A list of (zero-based) indices corresponding to the dimensions in `T` that
        must be included in the constrained multidimensional motif search.
        For more information, see Section IV D in:

        `DOI: 10.1109/ICDM.2017.66 \
        <https://www.cs.ucr.edu/~eamonn/Motif_Discovery_ICDM.pdf>`__

    discords : bool
        When set to `True`, this reverses the distance profile to favor discords
        rather than motifs. Note that indices in `include` are still maintained
        and respected.

    Returns
    -------
    P : numpy.ndarray
        The multi-dimensional matrix profile. Each row of the array corresponds
        to each matrix profile for a given dimension (i.e., the first row is the
        1-D matrix profile and the second row is the 2-D matrix profile).

    I : numpy.ndarray
        The multi-dimensional matrix profile index where each row of the array
        corresponds to each matrix profile index for a given dimension.
    """
    d, n = T_B.shape
    l = n - m + 1
    P = np.empty((d, l), dtype=np.float64)
    I = np.empty((d, l), dtype=np.int64)

    hosts = list(dask_client.ncores().keys())
    nworkers = len(hosts)

    step = int(math.ceil(l / nworkers))
    # One chunk per worker. Because `step = ceil(l / nworkers)`, the number of
    # chunks never exceeds `nworkers`, so `hosts[i]` below is always in range.
    chunk_starts = range(0, l, step)

    # The first column of every chunk is computed locally; the workers are only
    # asked for the columns `start + 1 : stop` (see the result assembly below).
    for start in chunk_starts:
        P[:, start], I[:, start] = _get_first_mstump_profile(
            start,
            T_A,
            T_B,
            m,
            excl_zone,
            M_T,
            Σ_T,
            μ_Q,
            σ_Q,
            T_subseq_isconstant,
            Q_subseq_isconstant,
            include,
            discords,
        )

    # Scatter data to Dask cluster
    T_A_future = dask_client.scatter(T_A, broadcast=True, hash=False)
    M_T_future = dask_client.scatter(M_T, broadcast=True, hash=False)
    Σ_T_future = dask_client.scatter(Σ_T, broadcast=True, hash=False)
    μ_Q_future = dask_client.scatter(μ_Q, broadcast=True, hash=False)
    σ_Q_future = dask_client.scatter(σ_Q, broadcast=True, hash=False)
    T_subseq_isconstant_future = dask_client.scatter(
        T_subseq_isconstant, broadcast=True, hash=False
    )
    Q_subseq_isconstant_future = dask_client.scatter(
        Q_subseq_isconstant, broadcast=True, hash=False
    )

    # The per-chunk `QT` arrays are only sent to the worker paired with that chunk
    QT_futures = []
    QT_first_futures = []
    for i, start in enumerate(chunk_starts):
        QT, QT_first = _get_multi_QT(start, T_A, m)

        QT_future = dask_client.scatter(QT, workers=[hosts[i]], hash=False)
        QT_first_future = dask_client.scatter(
            QT_first, workers=[hosts[i]], hash=False
        )

        QT_futures.append(QT_future)
        QT_first_futures.append(QT_first_future)

    futures = []
    for i, start in enumerate(chunk_starts):
        stop = min(l, start + step)

        futures.append(
            dask_client.submit(
                _mstump,
                T_A_future,
                m,
                stop,
                excl_zone,
                M_T_future,
                Σ_T_future,
                QT_futures[i],
                QT_first_futures[i],
                μ_Q_future,
                σ_Q_future,
                T_subseq_isconstant_future,
                Q_subseq_isconstant_future,
                l,
                start + 1,
                include,
                discords,
            )
        )

    results = dask_client.gather(futures)
    for i, start in enumerate(chunk_starts):
        stop = min(l, start + step)
        P[:, start + 1 : stop], I[:, start + 1 : stop] = results[i]

    return P, I


def _ray_mstumped(
    ray_client,
    T_A,
    T_B,
    m,
    excl_zone,
    M_T,
    Σ_T,
    μ_Q,
    σ_Q,
    T_subseq_isconstant,
    Q_subseq_isconstant,
    include,
    discords,
):
    """
    Compute the multi-dimensional z-normalized matrix profile with a `ray` cluster

    This is a highly distributed implementation around the Numba JIT-compiled
    parallelized `_mstump` function which computes the multi-dimensional matrix
    profile according to STOMP. Note that only self-joins are supported.

    Parameters
    ----------
    ray_client : client
        A `ray` client. Setting up a cluster is beyond the scope of this library.
        Please refer to the `ray` documentation.

    T_A : numpy.ndarray
        The time series or sequence for which to compute the multi-dimensional
        matrix profile. Each row in `T_A` represents data from the same
        dimension while each column in `T_A` represents data from a different
        dimension.

    T_B : numpy.ndarray
        The time series or sequence that will be used to annotate T_A. For every
        subsequence in T_A, its nearest neighbor in T_B will be recorded.

    m : int
        Window size

    excl_zone : int
        The half width for the exclusion zone relative to the current
        sliding window

    M_T : numpy.ndarray
        Sliding mean of time series, `T`

    Σ_T : numpy.ndarray
        Sliding standard deviation of time series, `T`

    μ_Q : numpy.ndarray
        Mean of the query sequence, `Q`, relative to the current sliding window

    σ_Q : numpy.ndarray
        Standard deviation of the query sequence, `Q`, relative to the current
        sliding window

    T_subseq_isconstant : numpy.ndarray
        A boolean array representing Rolling isconstant for `T`

    Q_subseq_isconstant : numpy.ndarray
        A boolean array representing Rolling isconstant for `Q`

    include : numpy.ndarray
        A list of (zero-based) indices corresponding to the dimensions in `T` that
        must be included in the constrained multidimensional motif search.
        For more information, see Section IV D in:

        `DOI: 10.1109/ICDM.2017.66 \
        <https://www.cs.ucr.edu/~eamonn/Motif_Discovery_ICDM.pdf>`__

    discords : bool
        When set to `True`, this reverses the distance profile to favor discords
        rather than motifs. Note that indices in `include` are still maintained
        and respected.

    Returns
    -------
    P : numpy.ndarray
        The multi-dimensional matrix profile. Each row of the array corresponds
        to each matrix profile for a given dimension (i.e., the first row is the
        1-D matrix profile and the second row is the 2-D matrix profile).

    I : numpy.ndarray
        The multi-dimensional matrix profile index where each row of the array
        corresponds to each matrix profile index for a given dimension.
    """
    core.check_ray(ray_client)

    d, n = T_B.shape
    l = n - m + 1
    P = np.empty((d, l), dtype=np.float64)
    I = np.empty((d, l), dtype=np.int64)

    nworkers = core.get_ray_nworkers(ray_client)

    step = int(math.ceil(l / nworkers))
    # One chunk of at most `step` columns per remote task
    chunk_starts = range(0, l, step)

    # The first column of every chunk is computed locally; the remote tasks are
    # only asked for the columns `start + 1 : stop` (see the result assembly below).
    for start in chunk_starts:
        P[:, start], I[:, start] = _get_first_mstump_profile(
            start,
            T_A,
            T_B,
            m,
            excl_zone,
            M_T,
            Σ_T,
            μ_Q,
            σ_Q,
            T_subseq_isconstant,
            Q_subseq_isconstant,
            include,
            discords,
        )

    # Put data into Ray object storage
    T_A_ref = ray_client.put(T_A)
    M_T_ref = ray_client.put(M_T)
    Σ_T_ref = ray_client.put(Σ_T)
    μ_Q_ref = ray_client.put(μ_Q)
    σ_Q_ref = ray_client.put(σ_Q)
    T_subseq_isconstant_ref = ray_client.put(T_subseq_isconstant)
    Q_subseq_isconstant_ref = ray_client.put(Q_subseq_isconstant)

    QT_refs = []
    QT_first_refs = []
    for start in chunk_starts:
        QT, QT_first = _get_multi_QT(start, T_A, m)

        QT_ref = ray_client.put(QT)
        QT_first_ref = ray_client.put(QT_first)

        QT_refs.append(QT_ref)
        QT_first_refs.append(QT_first_ref)

    ray_mstump_func = ray_client.remote(core.deco_ray_tor(_mstump))

    refs = []
    for i, start in enumerate(chunk_starts):
        stop = min(l, start + step)

        refs.append(
            ray_mstump_func.remote(
                T_A_ref,
                m,
                stop,
                excl_zone,
                M_T_ref,
                Σ_T_ref,
                QT_refs[i],
                QT_first_refs[i],
                μ_Q_ref,
                σ_Q_ref,
                T_subseq_isconstant_ref,
                Q_subseq_isconstant_ref,
                l,
                start + 1,
                include,
                discords,
            )
        )

    results = ray_client.get(refs)
    for i, start in enumerate(chunk_starts):
        stop = min(l, start + step)
        P[:, start + 1 : stop], I[:, start + 1 : stop] = results[i]

    return P, I


@core.non_normalized(
    maamped,
    exclude=["normalize", "T_subseq_isconstant"],
)
def mstumped(
    client,
    T,
    m,
    include=None,
    discords=False,
    p=2.0,
    normalize=True,
    T_subseq_isconstant=None,
):
    """
    Compute the multi-dimensional z-normalized matrix profile with a
    ``dask``/``ray`` cluster

    This is a highly distributed implementation around the Numba JIT-compiled
    parallelized ``_mstump`` function which computes the multi-dimensional matrix
    profile according to STOMP. Note that only self-joins are supported.

    Parameters
    ----------
    client : client
        A ``dask``/``ray`` client. Setting up a cluster is beyond the scope of this
        library. Please refer to the ``dask``/``ray`` documentation.

    T : numpy.ndarray
        The time series or sequence for which to compute the multi-dimensional
        matrix profile. Each row in ``T`` represents data from the same
        dimension while each column in ``T`` represents data from a different
        dimension.

    m : int
        Window size.

    include : list, numpy.ndarray, default None
        A list of (zero-based) indices corresponding to the dimensions in ``T`` that
        must be included in the constrained multidimensional motif search.
        For more information, see Section IV D in:

        `DOI: 10.1109/ICDM.2017.66 \
        <https://www.cs.ucr.edu/~eamonn/Motif_Discovery_ICDM.pdf>`__

    discords : bool, default False
        When set to ``True``, this reverses the distance matrix which results in a
        multi-dimensional matrix profile that favors larger matrix profile values
        (i.e., discords) rather than smaller values (i.e., motifs). Note that
        indices in `include` are still maintained and respected.

    p : float, default 2.0
        The p-norm to apply for computing the Minkowski distance. Minkowski distance
        is typically used with ``p`` being ``1`` or ``2``, which correspond to the
        Manhattan distance and the Euclidean distance, respectively.

    normalize : bool, default True
        When set to ``True``, this z-normalizes subsequences prior to computing
        distances. Otherwise, this function gets re-routed to its complementary
        non-normalized equivalent set in the ``@core.non_normalized`` function
        decorator.

    T_subseq_isconstant : numpy.ndarray, function, or list, default None
        A parameter that is used to show whether a subsequence of a time series in
        ``T`` is constant (``True``) or not. ``T_subseq_isconstant`` can be a 2D
        boolean ``numpy.ndarray`` or a function that can be applied to each time
        series in ``T``. Alternatively, for maximum flexibility, a list (with length
        equal to the total number of time series) may also be used. In this case,
        ``T_subseq_isconstant[i]`` corresponds to the ``i``-th time series ``T[i]``
        and each element in the list can either be a 1D boolean ``numpy.ndarray``,
        a function, or ``None``.

    Returns
    -------
    P : numpy.ndarray
        The multi-dimensional matrix profile. Each row of the array corresponds
        to each matrix profile for a given dimension (i.e., the first row is the
        1-D matrix profile and the second row is the 2-D matrix profile).

    I : numpy.ndarray
        The multi-dimensional matrix profile index where each row of the array
        corresponds to each matrix profile index for a given dimension.

    See Also
    --------
    stumpy.mstump : Compute the multi-dimensional z-normalized matrix profile
    stumpy.subspace : Compute the k-dimensional matrix profile subspace for a given
        subsequence index and its nearest neighbor index
    stumpy.mdl : Compute the number of bits needed to compress one array with another
        using the minimum description length (MDL)

    Notes
    -----
    `DOI: 10.1109/ICDM.2017.66 \
    <https://www.cs.ucr.edu/~eamonn/Motif_Discovery_ICDM.pdf>`__

    See mSTAMP Algorithm

    Examples
    --------
    >>> import stumpy
    >>> import numpy as np
    >>> from dask.distributed import Client
    >>> if __name__ == "__main__":
    ...     with Client() as dask_client:
    ...         stumpy.mstumped(
    ...             dask_client,
    ...             np.array([[584., -11., 23., 79., 1001., 0., -19.],
    ...                       [  1.,   2.,  4.,  8.,   16., 0.,  32.]]),
    ...             m=3)
    (array([[0.        , 1.43947142, 0.        , 2.69407392, 0.11633857],
            [0.777905  , 2.36179922, 1.50004632, 2.92246722, 0.777905  ]]),
     array([[2, 4, 0, 1, 0],
            [4, 4, 0, 1, 0]]))

    Alternatively, you can also use `ray`

    >>> import ray
    >>> if __name__ == "__main__":
    ...     ray.init()
    ...     stumpy.mstumped(
    ...         ray,
    ...         np.array([[584., -11., 23., 79., 1001., 0., -19.],
    ...                   [  1.,   2.,  4.,  8.,   16., 0.,  32.]]),
    ...         m=3)
    ...     ray.shutdown()
    """
    # Only self-joins are supported, so `T` plays both roles. Note that `p` and
    # `normalize` are not referenced in this body.
    T_A = T
    T_B = T_A

    T_A = core._preprocess(T_A)
    T_B = core._preprocess(T_B)

    T_A_subseq_isconstant = T_subseq_isconstant
    T_A_subseq_isconstant = core.process_isconstant(T_A, m, T_A_subseq_isconstant)
    T_B_subseq_isconstant = T_A_subseq_isconstant

    T_A, M_T, Σ_T, T_subseq_isconstant = core.preprocess(
        T_A, m, T_subseq_isconstant=T_A_subseq_isconstant
    )
    T_B, μ_Q, σ_Q, Q_subseq_isconstant = core.preprocess(
        T_B, m, T_subseq_isconstant=T_B_subseq_isconstant
    )

    if T_A.ndim <= 1:  # pragma: no cover
        err = f"T is {T_A.ndim}-dimensional and must be at least 2-dimensional"
        raise ValueError(err)

    # mstump currently only supports self-join. Therefore, the argument
    # `n=T_A.shape[1]` must be passed to the function `core.check_window_size`.
    core.check_window_size(
        m, max_size=min(T_A.shape[1], T_B.shape[1]), n=T_A.shape[1]
    )

    if include is not None:
        include = core._preprocess_include(include)

    excl_zone = int(
        np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM)
    )  # See Definition 3 and Figure 3

    # Select the `dask`- or `ray`-specific implementation for this client
    _mstumped = core._client_to_func(client)

    P, I = _mstumped(
        client,
        T_A,
        T_B,
        m,
        excl_zone,
        M_T,
        Σ_T,
        μ_Q,
        σ_Q,
        T_subseq_isconstant,
        Q_subseq_isconstant,
        include,
        discords,
    )

    return mparray(P_=P, I_=I)