
R package for detecting data leakages in time series forecasting competitions.
You can install the development version from GitHub with:
```r
# install.packages("devtools")
devtools::install_github("thiyangt/tsdataleaks")
library(tsdataleaks)
```

To demonstrate the package functions, I created a small data set with 5 time series.
```r
set.seed(2020)
a<-rnorm(15)
d<-rnorm(10)
lst<-list(a = a,
b =c(a[10:15]+rep(8,6),rnorm(10), a[1:5], a[1:5]),
c =c(rnorm(10),-a[1:5]),
d = d,
e = d)
```

find_dataleaks: Exploit data leaks

```r
library(tsdataleaks)
library(magrittr)
library(tidyverse)
library(viridis)
# h - I assume test period length is 5 and took that as window size, h.
f1<-find_dataleaks(lstx = lst,h=5,cutoff=1)
f1
```

```
$a
  .id start end
2   b     2   6

$b
  .id start end
1   a     1   5
2   b    17  21
4   c    11  15

$c
  .id start end
1   a     1   5
2   b    17  21
3   b    22  26

$d
  .id start end
5   e     6  10

$e
  .id start end
4   d     6  10
```

Interpretation: The first element in the list means that the last 5 observations of time series a correlate with observations 2 to 6 of time series b.
viz_dataleaks: Visualise the data leaks

```r
viz_dataleaks(f1)
```

```
[[1]]

[[2]]
[[2]]$a
  .id start end
2   b     2   6

[[2]]$b
  .id start end
1   a     1   5
2   b    17  21
4   c    11  15

[[2]]$c
  .id start end
1   a     1   5
2   b    17  21
3   b    22  26

[[2]]$d
  .id start end
5   e     6  10

[[2]]$e
  .id start end
4   d     6  10
```

reason_dataleaks: Display the reasons for data leaks and evaluate the usefulness of the data leaks towards winning the competition
```r
r1<-reason_dataleaks(lstx = lst,finddataleaksout = f1,h=5)
r1
```

```
[[1]]
  series1 .id start end dist_mean dist_sd is.useful.leak dist_cor
1       a   b     2   6      -8.0     0.0         useful        1
2       b   a     1   5       0.0     0.0         useful        1
3       b   b    17  21       0.0     0.0         useful        1
4       b   c    11  15      -1.7     2.6     not useful       -1
5       c   a     1   5       1.7     2.6         useful       -1
6       c   b    17  21       1.7     2.6         useful       -1
7       c   b    22  26       1.7     2.6     not useful       -1
8       d   e     6  10       0.0     0.0     not useful        1
9       e   d     6  10       0.0     0.0     not useful        1
                                     reason
1                              add constant
2                               exact match
3                               exact match
4 multiply by -1 or negative constant value
5 multiply by -1 or negative constant value
6 multiply by -1 or negative constant value
7 multiply by -1 or negative constant value
8                               exact match
9                               exact match

[[2]]
```
```r
a=rnorm(15)
lst<-list( a,c(a[10:15],rnorm(10), a[1:5], a[1:5]),c(rnorm(10), a[1:5]))
f1<-find_dataleaks(lst,h=5)
viz_dataleaks(f1)
#> [[1]]
#>
#> [[2]]
#> [[2]]$`1`
#>   .id start end
#> 2   2     2   6
#>
#> [[2]]$`2`
#>   .id start end
#> 1   1     1   5
#> 2   2    17  21
#> 4   3    11  15
#>
#> [[2]]$`3`
#>   .id start end
#> 1   1     1   5
#> 2   2    17  21
#> 3   2    22  26
reason_dataleaks(lst, f1,h=5)
#> [[1]]
#>   series1 .id start end dist_mean dist_sd is.useful.leak dist_cor      reason
#> 1       1   2     2   6         0       0         useful        1 exact match
#> 2       2   1     1   5         0       0         useful        1 exact match
#> 3       2   2    17  21         0       0         useful        1 exact match
#> 4       2   3    11  15         0       0     not useful        1 exact match
#> 5       3   1     1   5         0       0         useful        1 exact match
#> 6       3   2    17  21         0       0         useful        1 exact match
#> 7       3   2    22  26         0       0     not useful        1 exact match
#>
#> [[2]]
```
```r
library(Mcomp)
data("M1")
M1Y<-subset(M1,"yearly")
M1Y_x<-lapply(M1Y,function(temp){temp$x})
m1y_f1<-find_dataleaks(M1Y_x,h=6,cutoff =1)
m1y_f1
#> $YAF17
#>     .id start end
#> 22 YAM6     9  14
#>
#> $YAM6
#>      .id start end
#> 16 YAF17    16  21
#>
#> $YAM28
#>      .id start end
#> 78 YAI21    16  21
#>
#> $YAB3
#>     .id start end
#> 18 YAM2    14  19
#>
#> $YAB4
#>     .id start end
#> 17 YAM1    15  20
#>
#> $YAI21
#>      .id start end
#> 43 YAM28    16  21
#>
#> $YAG29
#>       .id start end
#> 137 YAC15     6  11
viz_dataleaks(m1y_f1)
#> [[1]]
#>
#> [[2]]
#> [[2]]$YAF17
#>     .id start end
#> 22 YAM6     9  14
#>
#> [[2]]$YAM6
#>      .id start end
#> 16 YAF17    16  21
#>
#> [[2]]$YAM28
#>      .id start end
#> 78 YAI21    16  21
#>
#> [[2]]$YAB3
#>     .id start end
#> 18 YAM2    14  19
#>
#> [[2]]$YAB4
#>     .id start end
#> 17 YAM1    15  20
#>
#> [[2]]$YAI21
#>      .id start end
#> 43 YAM28    16  21
#>
#> [[2]]$YAG29
#>       .id start end
#> 137 YAC15     6  11
reason_dataleaks(M1Y_x, m1y_f1,h=6,ang=90)
#> [[1]]
#>   series1   .id start end dist_mean dist_sd is.useful.leak dist_cor
#> 1   YAF17  YAM6     9  14       5.4     0.4     not useful        1
#> 2    YAM6 YAF17    16  21      -5.4     0.4     not useful        1
#> 3   YAM28 YAI21    16  21       0.0     0.0     not useful        1
#> 4    YAB3  YAM2    14  19       0.0     0.0         useful        1
#> 5    YAB4  YAM1    15  20       0.0     0.0         useful        1
#> 6   YAI21 YAM28    16  21       0.0     0.0     not useful        1
#> 7   YAG29 YAC15     6  11  -36815.7  6159.2         useful        1
#>                 reason
#> 1 other transformation
#> 2 other transformation
#> 3          exact match
#> 4          exact match
#> 5          exact match
#> 6          exact match
#> 7 other transformation
#>
#> [[2]]
```