
scimo provides extra recipes steps for dealing withomics data, while also being adaptable to other data types.
You can installscimo from GitHub with:
# install.packages("remotes")remotes::install_github("abichat/scimo")Thecheese_abundance dataset describes fungal communityabundance of 74 Amplicon Sequences Variants (ASVs) sampled from thesurface of three different French cheeses.
library(scimo)data("cheese_abundance","cheese_taxonomy")cheese_abundance#> # A tibble: 9 × 77#> sample cheese rind_type asv_01 asv_02 asv_03 asv_04 asv_05 asv_06 asv_07 asv_08#> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>#> 1 sample… Saint… Natural 1 0 38 40 1 2 31 8#> 2 sample… Saint… Natural 3 4 38 61 4 4 48 14#> 3 sample… Saint… Natural 28 16 33 23 31 29 21 1#> 4 sample… Livar… Washed 0 2 1 0 5 1 0 0#> 5 sample… Livar… Washed 0 0 4 0 1 1 2 0#> 6 sample… Livar… Washed 0 1 2 0 2 1 0 0#> 7 sample… Epois… Washed 4 2 3 0 2 5 0 0#> 8 sample… Epois… Washed 0 0 0 0 0 0 0 0#> 9 sample… Epois… Washed 0 0 1 0 0 0 2 0#> # ℹ 66 more variables: asv_09 <dbl>, asv_10 <dbl>, asv_11 <dbl>, asv_12 <dbl>,#> # asv_13 <dbl>, asv_14 <dbl>, asv_15 <dbl>, asv_16 <dbl>, asv_17 <dbl>,#> # asv_18 <dbl>, asv_19 <dbl>, asv_20 <dbl>, asv_21 <dbl>, asv_22 <dbl>,#> # asv_23 <dbl>, asv_24 <dbl>, asv_25 <dbl>, asv_26 <dbl>, asv_27 <dbl>,#> # asv_28 <dbl>, asv_29 <dbl>, asv_30 <dbl>, asv_31 <dbl>, asv_32 <dbl>,#> # asv_33 <dbl>, asv_34 <dbl>, asv_35 <dbl>, asv_36 <dbl>, asv_37 <dbl>,#> # asv_38 <dbl>, asv_39 <dbl>, asv_40 <dbl>, asv_41 <dbl>, asv_42 <dbl>, …glimpse(cheese_taxonomy)#> Rows: 74#> Columns: 9#> $ asv <chr> "asv_01", "asv_02", "asv_03", "asv_04", "asv_05", "asv_06", "asv…#> $ lineage <chr> "k__Fungi|p__Ascomycota|c__Dothideomycetes|o__Dothideales|f__Dot…#> $ kingdom <chr> "Fungi", "Fungi", "Fungi", "Fungi", "Fungi", "Fungi", "Fungi", "…#> $ phylum <chr> "Ascomycota", "Ascomycota", "Ascomycota", "Ascomycota", "Ascomyc…#> $ class <chr> "Dothideomycetes", "Eurotiomycetes", "Eurotiomycetes", "Eurotiom…#> $ order <chr> "Dothideales", "Eurotiales", "Eurotiales", "Eurotiales", "Euroti…#> $ family <chr> "Dothioraceae", "Aspergillaceae", "Aspergillaceae", "Aspergillac…#> $ genus <chr> "Aureobasidium", "Aspergillus", "Penicillium", "Penicillium", "P…#> $ species <chr> "Aureobasidium Group pullulans", "Aspergillus fumigatus", "Penic…list_family<-split(cheese_taxonomy$asv, cheese_taxonomy$family)head(list_family,2)#> $Aspergillaceae#> [1] "asv_02" "asv_03" "asv_04" "asv_05" "asv_06" "asv_07" "asv_08" "asv_09"#>#> $Debaryomycetaceae#> [1] "asv_10" "asv_11" "asv_12" "asv_13" "asv_14" "asv_15" "asv_16" "asv_17"#> [9] "asv_18" "asv_19" "asv_20" "asv_21" "asv_22"The following recipe will
list_family;cheese.rec<-recipe(cheese~ .,data = cheese_abundance)%>%step_aggregate_list(all_numeric_predictors(),list_agg = list_family,fun_agg = sum )%>%step_rownormalize_tss(all_numeric_predictors())%>%step_select_kruskal(all_numeric_predictors(),outcome ="cheese",cutoff =0.05 )%>%prep()rec#>#> ── Recipe ────────────────────────────────────────────────────────────────────────#>#> ── Inputs#> Number of variables by role#> outcome: 1#> predictor: 76#>#> ── Training information#> Training data contained 9 data points and no incomplete rows.#>#> ── Operations#> • Aggregation of: asv_01, asv_02, asv_03, asv_04, asv_05, ... | Trained#> • TSS normalization on: Aspergillaceae Debaryomycetaceae, ... | Trained#> • Kruskal filtering against cheese on: Aspergillaceae, ... | Trainedbake(rec,new_data =NULL)#> # A tibble: 9 × 8#> sample rind_type cheese Debaryomycetaceae Dipodascaceae Saccharomycetaceae#> <fct> <fct> <fct> <dbl> <dbl> <dbl>#> 1 sample1-1 Natural Saint-Ne… 0.719 0.0684 0.113#> 2 sample1-2 Natural Saint-Ne… 0.715 0.0725 0.119#> 3 sample1-3 Natural Saint-Ne… 0.547 0.277 0.0938#> 4 sample2-1 Washed Livarot 0.153 0.845 0.000854#> 5 sample2-2 Washed Livarot 0.150 0.848 0.00106#> 6 sample2-3 Washed Livarot 0.160 0.837 0.00108#> 7 sample3-1 Washed Epoisses 0.0513 0.944 0.00327#> 8 sample3-2 Washed Epoisses 0.0558 0.941 0.00321#> 9 sample3-3 Washed Epoisses 0.0547 0.942 0.00329#> # ℹ 2 more variables: `Saccharomycetales fam Incertae sedis` <dbl>,#> # Trichosporonaceae <dbl>To see which variables are kept and the associated p-values, you canuse thetidy method on the third step:
tidy(rec,3)#> # A tibble: 13 × 4#> terms pv kept id#> <chr> <dbl> <lgl> <chr>#> 1 Aspergillaceae 0.0608 FALSE select_kruskal_WKayj#> 2 Debaryomycetaceae 0.0273 TRUE select_kruskal_WKayj#> 3 Dipodascaceae 0.0273 TRUE select_kruskal_WKayj#> 4 Dothioraceae 0.101 FALSE select_kruskal_WKayj#> 5 Lichtheimiaceae 0.276 FALSE select_kruskal_WKayj#> 6 Metschnikowiaceae 0.0509 FALSE select_kruskal_WKayj#> 7 Mucoraceae 0.0608 FALSE select_kruskal_WKayj#> 8 Phaffomycetaceae 0.0794 FALSE select_kruskal_WKayj#> 9 Saccharomycetaceae 0.0273 TRUE select_kruskal_WKayj#> 10 Saccharomycetales fam Incertae sedis 0.0221 TRUE select_kruskal_WKayj#> 11 Trichomonascaceae 0.0625 FALSE select_kruskal_WKayj#> 12 Trichosporonaceae 0.0273 TRUE select_kruskal_WKayj#> 13 Wickerhamomyceteae 0.177 FALSE select_kruskal_WKayjLikecolino,scimo proposes 3 arguments for variable selection stepsbased on a statistic:n_kept,prop_kept andcutoff.
n_kept andprop_kept deal with how manyvariables will be kept in the preprocessed dataset, based on an exactcount of variables or a proportion relative to the original dataset.They are mutually exclusive.
cutoff removes variables whose statistic is below(or above, depending on the step) it. It could be used alone or inaddition to the two others.
scimo doesn’t introduce any additional dependenciescompared torecipes.
scimo is simply the reverse ofomics.