R - Error using sapply to remove constant columns in matrix - r

I'm trying to remove all columns that are constant in a matrix, but am receiving this error:
Error in X[, sapply(X, function(x) length(unique(x)) != 1)] :
(subscript) logical subscript too long
I'm not entirely sure why this error is popping up
Example
X <- structure(c(143.3, 152.37, 138.74, 149.87, 103.21, 130.98, 151.21,
103.34, 126.5, 86.87, 561.24, 633.21, 529.73, 621.18, 319.53,
476.16, 620.08, 279.21, 416.97, 184.58, 25.97, 30.05, 17.14,
37.7, 9.7, 15.9, 24.95, -1.84, 7.5, -9.95, 4.74, 14.32, 4.39,
5.1, 5.46, 4.87, 7.21, 4.31, 3.77, 4.32, 22.47, 205.1, 19.29,
25.96, 29.8, 23.74, 52.04, 18.6, 14.18, 18.66, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0), .Dim = c(10L, 8L), .Dimnames = list(c("1", "2",
"3", "4", "5", "6", "7", "8", "9", "10"), c("dday0_10", "dday10_30",
"dday30C", "prec", "prec_sq", "(Intercept)", "statear", "statede"
)))
X[,sapply(X,function(x) length(unique(x))!=1)]
> Error in X[, sapply(X, function(x) length(unique(x)) != 1)] :
(subscript) logical subscript too long
I'd like solutions which keep the data in a matrix format.

If you need to keep your data in matrix format, then try this:
X[,apply(X,2,function(x) length(unique(x))!=1)]
Output:
dday0_10 dday10_30 dday30C prec prec_sq
1 143.30 561.24 25.97 4.74 22.47
2 152.37 633.21 30.05 14.32 205.10
3 138.74 529.73 17.14 4.39 19.29
4 149.87 621.18 37.70 5.10 25.96
5 103.21 319.53 9.70 5.46 29.80
6 130.98 476.16 15.90 4.87 23.74
7 151.21 620.08 24.95 7.21 52.04
8 103.34 279.21 -1.84 4.31 18.60
9 126.50 416.97 7.50 3.77 14.18
10 86.87 184.58 -9.95 4.32 18.66

Related

R insert blank row after every regular spacing (formatting for transferring to excel-style readability)

Here is my dataframe:
structure(list(Dispensary = c("A", "A", "A", "A", "A", "A", "A",
"A", "A", "B", "B", "B", "B", "B", "B", "B", "B", "B", "C", "C",
"C", "C", "C", "C", "C", "C", "C"), cohort = c(1L, 2L, 3L, 4L,
5L, 6L, 7L, 8L, 9L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 1L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L), t0 = c(100, 100, 100, 100, 100,
100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
100, 100, 100, 100, 100, 100, 100, 100, 100), t1 = c(46.1, 41.7,
37.5, 36.2, 35.9, 34.4, 39.8, 38.3, 0, 34.6, 37.4, 31.4, 29.5,
25.5, 33.2, 30.6, 30.8, 0, 28.6, 30.2, 28.1, 28.8, 30.7, 29.2,
33.5, 30.3, 0), t2 = c(41.4, 34.6, 38.6, 27.9, 30, 32.8, 35.2,
0, 0, 35.2, 31.4, 23.9, 24.2, 23.1, 30.1, 24, 0, 0, 26.9, 24.8,
21, 25.3, 25.8, 25.9, 23.3, 0, 0), t3 = c(29.6, 32.3, 31.7, 25.8,
29.5, 23.8, 0, 0, 0, 25.2, 28.9, 23, 23.8, 20.5, 22.6, 0, 0,
0, 20.7, 24, 21.5, 24.9, 23, 23.7, 0, 0, 0), t4 = c(30.9, 28,
30.4, 24.9, 29.5, 0, 0, 0, 0, 23.1, 23.1, 20.7, 21.8, 19.2, 0,
0, 0, 0, 21, 20.8, 21.3, 23.7, 20.5, 0, 0, 0, 0), t5 = c(30.3,
25.6, 23.5, 24.9, 0, 0, 0, 0, 0, 19.4, 22.1, 20, 19.5, 0, 0,
0, 0, 0, 17.2, 18.5, 20.4, 14.7, 0, 0, 0, 0, 0), t6 = c(30.9,
24, 23.2, 0, 0, 0, 0, 0, 0, 19.6, 20.3, 17.2, 0, 0, 0, 0, 0,
0, 20.3, 17.7, 18.2, 0, 0, 0, 0, 0, 0), t7 = c(27.6, 18.5, 0,
0, 0, 0, 0, 0, 0, 18, 16.1, 0, 0, 0, 0, 0, 0, 0, 14.2, 14.8,
0, 0, 0, 0, 0, 0, 0), t8 = c(17.8, 0, 0, 0, 0, 0, 0, 0, 0, 17.2,
0, 0, 0, 0, 0, 0, 0, 0, 11.4, 0, 0, 0, 0, 0, 0, 0, 0)), row.names = c(NA,
-27L), class = c("tbl_df", "tbl", "data.frame"))
I'm trying to insert a blank row after every 9th row, in between where cohort == 9 and cohort == 1
The dataframe is long and I'm hoping to not have to do it one a time
Here is a link to a similar question, but I'm looking for an R method, not a VBA method
Here is an option:
library(tidyverse)
df |>
group_split(Dispensary) |>
map_dfr(~add_row(.x))
#> # A tibble: 30 x 11
#> Dispensary cohort t0 t1 t2 t3 t4 t5 t6 t7 t8
#> <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 A 1 100 46.1 41.4 29.6 30.9 30.3 30.9 27.6 17.8
#> 2 A 2 100 41.7 34.6 32.3 28 25.6 24 18.5 0
#> 3 A 3 100 37.5 38.6 31.7 30.4 23.5 23.2 0 0
#> 4 A 4 100 36.2 27.9 25.8 24.9 24.9 0 0 0
#> 5 A 5 100 35.9 30 29.5 29.5 0 0 0 0
#> 6 A 6 100 34.4 32.8 23.8 0 0 0 0 0
#> 7 A 7 100 39.8 35.2 0 0 0 0 0 0
#> 8 A 8 100 38.3 0 0 0 0 0 0 0
#> 9 A 9 100 0 0 0 0 0 0 0 0
#> 10 <NA> NA NA NA NA NA NA NA NA NA NA
#> # ... with 20 more rows
Assuming you are trying to add a row after every Dispensary you could do:
library(dplyr)
library(purrr)
df %>%
group_split(Dispensary) %>%
map_dfr(add_row)
If you do actually need to insert after every 9th row you can do:
df %>%
group_split(group = rep(1:(nrow(.) / 9), each = 9)) %>%
map_dfr(add_row) %>%
select(-group)

flag variable is incorrectly assigned during data restructuring in R

this data with which i work.
datatrain=structure(list(probeg = c(10000L, 20000L, 30000L, 40000L, 50000L,
60000L, 70000L, 80000L, 90000L, 100000L, 110000L, 120000L, 130000L,
140000L, 150000L, 160000L, 170000L, 180000L, 190000L, 200000L,
210000L, 220000L, 230000L, 240000L, 250000L, 260000L, 270000L,
280000L, 290000L, 300000L, 310000L, 320000L, 330000L, 340000L,
350000L, 360000L, 370000L, 380000L, 390000L, 400000L, 410000L,
420000L, 430000L, 440000L, 450000L, 460000L, 470000L, 480000L,
490000L, 500000L, 510000L, 520000L, 530000L, 540000L, 550000L,
560000L, 570000L, 580000L, 590000L, 600000L, 610000L, 620000L,
630000L, 640000L, 650000L, 660000L, 670000L, 680000L), EP_OBJECTID = c(88679804L,
88679804L, 88679804L, 88679804L, 88679804L, 88679804L, 88679804L,
88679804L, 88679804L, 88679804L, 88679804L, 88679804L, 88679804L,
88679804L, 88679804L, 88679804L, 88679804L, 88679804L, 88679804L,
88679804L, 88679804L, 88679804L, 88679804L, 88679804L, 88679804L,
88679804L, 88679804L, 88679804L, 88679804L, 88679804L, 88679804L,
88679804L, 88679804L, 88679804L, 88679804L, 88679804L, 88679804L,
88679804L, 88679804L, 88679804L, 88679804L, 88679804L, 88679804L,
88679804L, 88679804L, 88679804L, 88679804L, 88679804L, 9000L,
9000L, 9000L, 9000L, 9000L, 9000L, 9000L, 9000L, 9000L, 9000L,
9000L, 9000L, 9000L, 9000L, 9000L, 9000L, 9000L, 9000L, 9000L,
9000L), TU17L4 = c(80, 80.5, 80.5, 80.5, 80.5, 80.5, 80.5, 80.5,
80, 80, 80, 80, 79.5, 79.5, 79.5, 79.5, 79.5, 79.5, 73, 73, 73,
73, 73, 72, 72, 72, 70.5, 70.5, 70.5, 70.5, 70.5, 70, 70.5, 67,
67, 67, 67, 61.5, 61.5, 61.5, 61.5, 61.5, 61.5, 61.5, 61.5, 61.5,
61.5, 57.5, 56, 57.5, 57.5, 56.5, 56.5, 56.5, 56.5, 56.5, 56.5,
56.5, 56.5, 56.5, 56.5, 56.5, 56.5, 56.5, 56.5, 56.5, 56.5, 56.5
), DELTTBL = c(12.5, 12.5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 11.5, 1, 1, 1, 0, 1.5, 1.5, 0, 0, 0, 0, 0.5,
0.5, 0, 0, 0, 0, 0, 0, 0, 11.5, 0, 0, 0, 0, 4, 4, 4, 1.5, 4,
1.5, 1.5, 1.5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
TU17R4 = c(80, 79.5, 79.5, 79.5, 79.5, 79, 78.5, 78.5, 78.5,
78.5, 78.5, 78, 78, 78, 78, 78, 78, 78, 72, 72, 72, 72, 72,
71, 71, 71, 69.5, 69.5, 69.5, 69.5, 69.5, 69.5, 69.5, 66,
66, 66, 66, 61, 61, 61, 61, 60.5, 60.5, 60.5, 60.5, 60.5,
60.5, 57, 56, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57,
57, 57, 57, 56.5, 56.5, 56.5, 56.5, 56.5), DELTTBR = c(12.5,
12.5, 0, 0, 0, 0.5, 0.5, 0, 0, 0.5, 0.5, 0.5, 0, 0, 0, 0,
0, 0, 0, 0, 11, 1, 1, 1, 0, 1.5, 1.5, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 11, 0.5, 0.5, 0.5, 0, 3.5, 3.5, 3.5, 1,
3.5, 1, 1, 1, 0, 0, 0, 0, 0, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5,
0.5, 0.5, 0.5, 0.5), TU17L5 = c(1060L, 1054L, 1054L, 1054L,
1054L, 1054L, 1054L, 1054L, 1053L, 1053L, 1053L, 1053L, 1052L,
1052L, 1052L, 1052L, 1052L, 1052L, 1038L, 1038L, 1038L, 1038L,
1038L, 1036L, 1036L, 1036L, 1033L, 1033L, 1033L, 1041L, 1033L,
1032L, 1033L, 1026L, 1026L, 1026L, 1026L, 1017L, 1017L, 1017L,
1017L, 1017L, 1017L, 1017L, 1017L, 1017L, 1017L, 1009L, 1028L,
1009L, 1009L, 1007L, 1007L, 1007L, 1014L, 1007L, 1007L, 1007L,
1014L, 1007L, 1007L, 1007L, 1007L, 1007L, 1007L, 1007L, 1007L,
1007L), DELTDML = c(33L, 33L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 21L, 2L, 2L,
2L, 0L, 3L, 3L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 21L, 0L, 0L, 0L, 0L, 8L, 8L, 8L, 21L, 8L, 21L, 21L,
21L, 0L, 7L, 7L, 7L, 7L, 7L, 0L, 0L, 0L, 0L, 0L, 0L, 7L,
7L, 7L), TU17R5 = c(1060L, 1054L, 1054L, 1054L, 1054L, 1053L,
1052L, 1052L, 1052L, 1052L, 1052L, 1051L, 1051L, 1051L, 1051L,
1051L, 1051L, 1051L, 1038L, 1038L, 1038L, 1038L, 1038L, 1036L,
1036L, 1036L, 1033L, 1033L, 1033L, 1040L, 1033L, 1033L, 1033L,
1026L, 1026L, 1026L, 1026L, 1017L, 1017L, 1017L, 1017L, 1016L,
1016L, 1016L, 1016L, 1016L, 1016L, 1009L, 1028L, 1009L, 1009L,
1009L, 1009L, 1009L, 1014L, 1009L, 1009L, 1009L, 1014L, 1009L,
1009L, 1009L, 1009L, 1008L, 1008L, 1008L, 1008L, 1008L),
DELTDMR = c(31L, 31L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 1L,
1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 21L, 2L, 2L, 2L, 0L,
3L, 3L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
21L, 1L, 1L, 1L, 0L, 7L, 7L, 7L, 19L, 7L, 19L, 19L, 19L,
0L, 5L, 5L, 5L, 5L, 6L, 1L, 1L, 1L, 1L, 1L, 1L, 6L, 6L, 6L
), TU17L2 = c(27, 27, 27, 30, 26.5, 26.5, 26, 26, 26.5, 26,
25, 27, 27, 26.5, 26.5, 26.5, 26, 30, 27, 26, 26.5, 26.5,
26.5, 26.5, 26, 31, 31, 30, 30, 30, 29, 30, 30, 29, 30, 28,
28.5, 28.5, 28.5, 28.5, 28.5, 28, 30, 30, 30, 30, 28.5, 29,
28, 27, 27, 27, 26.5, 28, 28, 28, 28.5, 27, 28.5, 27, 27,
27, 27, 27, 26, 27, 26, 26), DELTTGRL = c(0, 0, 3.5, 3.5,
3.5, 3.5, 0, 0, 0, 0, 2, 2, 2, 2, 2, 0, 4, 4, 4, 0.5, 0,
0.5, 0, 5, 5, 5, 5, 5, 5, 5, 1, 1, 1, 0, 2, 2, 0, 0, 0, 0,
0, 2, 2, 2, 2, 1.5, 1.5, 0, 2, 2, 0.5, 0.5, 0.5, 0, 0, 0.5,
0.5, 1.5, 1.5, 0, 0, 0, 0, 1, 1, 1, 1, 2), TU17R2 = c(29,
28.5, 28.5, 30, 28, 28, 28, 28, 28, 28, 27, 28, 28, 27, 27,
27, 27.5, 30, 28, 27, 26, 27, 26, 25, 27, 30, 30, 30, 30,
30, 29, 30, 30, 30, 30, 29.5, 30.5, 30.5, 30.5, 30.5, 30.5,
29.5, 28, 28, 28, 28, 26.5, 29, 28, 28, 28, 28, 27, 28, 28,
28, 27.5, 28, 27.5, 27, 27, 26, 26, 26, 26.5, 26, 26, 26),
DELTTGRR = c(0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 1, 1, 1, 1, 1,
0, 2.5, 2.5, 2.5, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 1, 1,
1, 0, 0.5, 0.5, 0, 0, 0, 0, 0, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5,
0, 2, 2, 1, 1, 1, 0, 0, 0.5, 0.5, 0.5, 0.5, 0, 1, 1, 1, 0.5,
0.5, 0, 0, 2), TU17L3 = c(8, 8, 8, 10, 8, 8, 8, 8, 7, 7,
7, 10, 10, 9, 9, 9, 9, 10, 7.5, 9, 8.5, 9, 8, 8, 8, 10, 10,
10, 10, 10, 10, 10, 10, 10.5, 10, 10, 10, 10, 10, 10, 10,
10, 10, 10, 10, 8, 10, 10, 8, 9, 9, 8, 9, 8, 8, 8, 8.5, 8,
8.5, 8, 8, 7, 7, 7, 8, 7, 7, 7), DELTCRGRL = c(0, 0, 2, 2,
2, 2, 0, 1, 0, 1, 3, 3, 3, 3, 3, 0, 2.5, 2.5, 2.5, 0, 0,
0, 0, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 2, 2, 0, 2, 2, 1, 1, 1, 0, 0, 0.5, 0.5, 0.5,
0.5, 0, 1, 1, 1, 1, 1, 0, 0, 2), TU17R3 = c(8, 8, 8, 10,
8, 8, 8, 8, 8, 8, 7.5, 10, 10, 9, 9, 9, 9, 10, 9, 9, 7, 9,
7, 7, 8, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
10, 10, 10, 10, 10, 10, 10, 8, 9, 10, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 7, 7, 7, 8, 7, 7, 7), DELTCRGRR = c(0,
0, 2, 2, 2, 2, 0, 0, 0, 0, 2.5, 2.5, 2.5, 2.5, 2.5, 0, 1,
1, 1, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1)), class = "data.frame", row.names = c(NA,
-68L))
i need work only with data .
TU17L4;TU17R4;TU17L5;TU17R5;TU17L2;TU17R2;TU17L3;TU17R3
If the values of at least 3 of these variables simultaneously changed by at least 2.5, then flag should be set =1.And if flag =1, then the next count in probeg variable should start with the range of 10000 km.
For example rows 19-20 for variable TU17L4 changed 79,5-73=6,5,TU17R4 78-72=6 and TU17L3 10-7,5=2.5
so after this rows count of probeg variable must be from 10000
example
probeg EP_OBJECTID TU17L4 DELTTBL TU17R4 DELTTBR TU17L5 DELTDML
1 10000 88679804 80.0 12.5 80.0 12.5 1060 33
2 20000 88679804 80.5 12.5 79.5 12.5 1054 33
3 30000 88679804 80.5 0.0 79.5 0.0 1054 0
4 40000 88679804 80.5 0.0 79.5 0.0 1054 0
5 50000 88679804 80.5 0.0 79.5 0.0 1054 0
6 60000 88679804 80.5 0.0 79.0 0.5 1054 0
7 70000 88679804 80.5 0.0 78.5 0.5 1054 0
8 80000 88679804 80.5 0.0 78.5 0.0 1054 0
9 90000 88679804 80.0 0.0 78.5 0.0 1053 0
10 100000 88679804 80.0 0.0 78.5 0.5 1053 0
11 110000 88679804 80.0 0.0 78.5 0.5 1053 0
12 120000 88679804 80.0 0.0 78.0 0.5 1053 0
13 130000 88679804 79.5 0.0 78.0 0.0 1052 0
14 140000 88679804 79.5 0.0 78.0 0.0 1052 0
15 150000 88679804 79.5 0.0 78.0 0.0 1052 0
16 160000 88679804 79.5 0.0 78.0 0.0 1052 0
17 170000 88679804 79.5 0.0 78.0 0.0 1052 0
18 180000 88679804 79.5 0.0 78.0 0.0 1052 0
19 190000 88679804 73.0 0.0 72.0 0.0 1038 0
**20 10000 88679804 73.0 0.0 72.0 0.0 1038 0
21 20000 88679804 73.0 11.5 72.0 11.0 1038 21
22 30000 88679804 73.0 1.0 72.0 1.0 1038 2**
TU17R5 DELTDMR TU17L2 DELTTGRL TU17R2 DELTTGRR TU17L3 DELTCRGRL
1 1060 31 27.0 0.0 29.0 0.0 8.0 0.0
2 1054 31 27.0 0.0 28.5 0.0 8.0 0.0
3 1054 0 27.0 3.5 28.5 2.0 8.0 2.0
4 1054 0 30.0 3.5 30.0 2.0 10.0 2.0
5 1054 0 26.5 3.5 28.0 2.0 8.0 2.0
6 1053 1 26.5 3.5 28.0 2.0 8.0 2.0
7 1052 1 26.0 0.0 28.0 0.0 8.0 0.0
8 1052 0 26.0 0.0 28.0 0.0 8.0 1.0
9 1052 0 26.5 0.0 28.0 0.0 7.0 0.0
10 1052 1 26.0 0.0 28.0 0.0 7.0 1.0
11 1052 1 25.0 2.0 27.0 1.0 7.0 3.0
12 1051 1 27.0 2.0 28.0 1.0 10.0 3.0
13 1051 0 27.0 2.0 28.0 1.0 10.0 3.0
14 1051 0 26.5 2.0 27.0 1.0 9.0 3.0
15 1051 0 26.5 2.0 27.0 1.0 9.0 3.0
16 1051 0 26.5 0.0 27.0 0.0 9.0 0.0
17 1051 0 26.0 4.0 27.5 2.5 9.0 2.5
18 1051 0 30.0 4.0 30.0 2.5 10.0 2.5
19 1038 0 27.0 4.0 28.0 2.5 7.5 2.5
20 1038 0 26.0 0.5 27.0 0.0 9.0 0.0
21 1038 21 26.5 0.0 26.0 0.0 8.5 0.0
22 1038 2 26.5 0.5 27.0 0.0 9.0 0.0
TU17R3 DELTCRGRR
1 8.0 0.0
2 8.0 0.0
3 8.0 2.0
4 10.0 2.0
5 8.0 2.0
6 8.0 2.0
7 8.0 0.0
8 8.0 0.0
9 8.0 0.0
10 8.0 0.0
11 7.5 2.5
12 10.0 2.5
13 10.0 2.5
14 9.0 2.5
15 9.0 2.5
16 9.0 0.0
17 9.0 1.0
18 10.0 1.0
19 9.0 1.0
20 9.0 0.0
21 7.0 0.0
22 9.0 0.0
and this must for each EP_OBJECTID category separately. So i do so
the first i run this part of code
library(dplyr)
datatrain %>%
filter(!(EP_OBJECTID != lag(EP_OBJECTID) & DELTDMR == lag(DELTDMR))) %>%
group_by(EP_OBJECTID) %>%
mutate(DELT = seq(10000, length.out = n(), by = 10000))
and then i run second step, this code
threshold <- 3
flags <- dt %>%
apply(., 2, diff) %>%
apply(., 1,
function(x)
ifelse(length(x[abs(x) > threshold]) > 1,
1,
0))
dt$flag <- c(0, flags)
dt
but the result is wrong . I.E. not as i provided above.
I see this result
X probeg EP_OBJECTID TU17L4 DELTTBL TU17R4 DELTTBR TU17L5
1 1 20000 88679804 80.5 12.5 79.5 12.5 1054
2 2 30000 88679804 80.5 0.0 79.5 0.0 1054
3 3 40000 88679804 80.5 0.0 79.5 0.0 1054
4 4 50000 88679804 80.5 0.0 79.5 0.0 1054
5 5 60000 88679804 80.5 0.0 79.0 0.5 1054
6 6 70000 88679804 80.5 0.0 78.5 0.5 1054
7 7 80000 88679804 80.5 0.0 78.5 0.0 1054
8 8 90000 88679804 80.0 0.0 78.5 0.0 1053
9 9 100000 88679804 80.0 0.0 78.5 0.5 1053
10 10 110000 88679804 80.0 0.0 78.5 0.5 1053
DELTDML TU17R5 DELTDMR TU17L2 DELTTGRL TU17R2 DELTTGRR TU17L3
1 33 1054 31 27.0 0.0 28.5 0 8
2 0 1054 0 27.0 3.5 28.5 2 8
3 0 1054 0 30.0 3.5 30.0 2 10
4 0 1054 0 26.5 3.5 28.0 2 8
5 0 1053 1 26.5 3.5 28.0 2 8
6 0 1052 1 26.0 0.0 28.0 0 8
7 0 1052 0 26.0 0.0 28.0 0 8
8 0 1052 0 26.5 0.0 28.0 0 7
9 0 1052 1 26.0 0.0 28.0 0 7
10 0 1052 1 25.0 2.0 27.0 1 7
DELTCRGRL TU17R3 DELTCRGRR DELT flag
1 0 8.0 0.0 1e+04 0
2 2 8.0 2.0 2e+04 1
3 2 10.0 2.0 3e+04 1
4 2 8.0 2.0 4e+04 1
5 2 8.0 2.0 5e+04 1
6 0 8.0 0.0 6e+04 1
7 1 8.0 0.0 7e+04 1
8 0 8.0 0.0 8e+04 1
9 1 8.0 0.0 9e+04 1
10 3 7.5 2.5 1e+05 1
it is wrong. How can i get the result like i provided above.
I think this data.table approach should work.. pleze let me know..
library( data.table )
setDT( datatrain )
#first, we split by EP_OBJECTID
# loop over the split list,
L <- lapply( split( datatrain, by = "EP_OBJECTID" ), function(dt) {
# find rows where the diff of TU columns is >=2.5 in 3 columns or more
dt[ shift( rowSums( dt[, lapply(.SD, function(x) abs( x - shift(x, type = "lag") ) ),
.SDcols = patterns("^TU") ] >= 2.5 ) >=3, type = "lag" ),
probeg := 10000 ]
#first row of a group is always 10000
dt[ 1, probeg := 10000 ]
#set new value of probeg for all rows
dt[, probeg := seq_len(.N) * 10000, by = .(cumsum( probeg == 10000 ) ) ]
return(dt)
})
#bind the split list back together to a single data.table
ans <- rbindlist( L, use.names = TRUE )

Function that checks multiple combinations of operations between columns of a dataframe

Here is a sample of the original dataframe (data provided at the end)
> DATA
N_b N_l A x.sqr_sum e_1 e_2 e_3 e_4 e_5 e_6 e_7 e_8
1 7 6 -36 4032 33.8 21.8 9.8 -2.2 -14.2 -26.2 0 0
2 7 4 -27 2268 23.6 11.6 -0.4 -12.4 0.0 0.0 0 0
3 7 4 -27 2268 23.6 11.6 -0.4 -12.4 0.0 0.0 0 0
4 7 6 -36 4032 33.8 21.8 9.8 -2.2 -14.2 -26.2 0 0
5 7 6 -36 4032 33.8 21.8 9.8 -2.2 -14.2 -26.2 0 0
6 7 6 -36 4032 33.8 21.8 9.8 -2.2 -14.2 -26.2 0 0
7 7 8 -45 6300 44.0 32.0 20.0 8.0 -4.0 -16.0 -28 -40
8 7 8 -45 6300 44.0 32.0 20.0 8.0 -4.0 -16.0 -28 -40
9 7 8 -45 6300 44.0 32.0 20.0 8.0 -4.0 -16.0 -28 -40
I want to write a function to calculate R from the equation
I write the code below to calculate R and the N_l responsible for the maximum R.
R <- function(x){
N_b <- x[1]
N_l <- x[2]
N_l_seq <- seq(N_l)
A <- x[3]
x.sqr_sum <- x[4]
e <- x[5:12]
m <- Multi.Presence$m[N_l_seq]
f <- m * (N_l_seq/N_b + A * cumsum(e) / x.sqr_sum)
c(val = max(f), pos = which.max(f))
}
DATA <- cbind(DATA, vars = t(apply(DATA, 1, R)))
In the function above, R is calculated for all possible values of N_l by defining N_l_seq <- seq(N_l). The problem is I don't want to just multiply by cumsum(e) as written in the function. I want to modify it so that it would calculate R for all possible combinations for the same number of e_1, e_2, e_3,... as the current value of N_l.
Example
If N_l = 3, the equation for f is calculated for the cumsum of all possible combinations of 3 of the e_1, e_2, e_3, e_4, e_5, e_6, e_7, e_8, such as cumsum(e_1, e_8, e_6) and cumsum(e_7, e_2, e_4). When N_l = 5, the equation for f is calculated for the cumsum of all possible combinations of 5 of the e_1, e_2, e_3, e_4, e_5, e_6, e_7, e_8, and so on.
PROBLEM
I am not sure how to update the f equation so instead of the cumsum() of all the possible e values, it calculates the cumsum() of all combinations of a number equal to the current N_l of the e values.
DATA
> dput(DATA)
structure(list(N_b = c(7, 7, 7, 7, 7, 7, 7, 7, 7), N_l = c(6,
4, 4, 6, 6, 6, 8, 8, 8), A = c(-36, -27, -27, -36, -36, -36,
-45, -45, -45), x.sqr_sum = c(4032, 2268, 2268, 4032, 4032, 4032,
6300, 6300, 6300), e_1 = c(33.8, 23.6, 23.6, 33.8, 33.8, 33.8,
44, 44, 44), e_2 = c(21.8, 11.6, 11.6, 21.8, 21.8, 21.8, 32,
32, 32), e_3 = c(9.8, -0.399, -0.399, 9.8, 9.8, 9.8, 20, 20, 20),
e_4 = c(-2.2, -12.4, -12.4, -2.2, -2.2, -2.2, 8, 8, 8), e_5 =
c(-14.2, 0, 0, -14.2, -14.2, -14.2, -4, -4, -4), e_6 = c(-26.2,
0, 0, -26.2, -26.2, -26.2, -16, -16, -16), e_7 = c(0, 0, 0, 0,
0, 0, -28, -28, -28), e_8 = c(0, 0, 0, 0, 0, 0, -40, -40, -40),
S = c(12, 9, 9, 12, 12, 12, 15, 15, 15)), row.names = c(1L, 3L,
4L, 115L, 116L, 117L, 199L, 200L, 201L), class = "data.frame")
A dependent variable m is defined in the dataframe below:
> dput(Multi.Presence)
structure(list(N_l = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), m = c(1.2,
1, 0.85, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65)), row.names = c(NA,
-10L), class = "data.frame")
I am not sure if this is what you want. I guess you should use combn rather than cumsum in your function R.
As the first step, I merged Multi.Presence to DATA so you can read the corresponding value of m with respect to N_l
df <- merge(DATA, Multi.Presence, by = "N_l")
Then, I rewrote function R such that it accepts the rows of df as the argument
R <- function(x){
N_l <- x["N_l"]
N_b <- x["N_b"]
N_l_seq <- seq(N_l)
A <- x["X_ext"]
x.sqr_sum <- x["x.sqr_sum"]
e <- x[grepl("e_\\d",names(x))]
m <- x["m"]
f <- m * (N_l/N_b + A * combn(e,N_l,sum) / x.sqr_sum)
c(val = max(f), pos = which.max(f))
}
Finally, you can execute function R within apply by rows, e.g.,
> apply(df,1,R)
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
val 0.4704685 0.4704685 0.7475 0.7475 0.7475 0.7475 0.6685714 0.6685714
pos 56.0000000 56.0000000 28.0000 28.0000 28.0000 28.0000 1.0000000 1.0000000
[,9]
val 0.6685714
pos 1.0000000
Update
I have no clue how you want to deal with the combn, but below is an update
R <- function(x){
# browser()
N_l <- x["N_l"]
N_b <- x["N_b"]
N_l_seq <- seq(N_l)
A <- x["A"]
x.sqr_sum <- x["x.sqr_sum"]
e <- x[grepl("e_\\d",names(x))]
m <- Multi.Presence$m[N_l_seq]
f <- m * sapply(N_l_seq,function(k) N_l/N_b + A * max(combn(e,k,sum)) / x.sqr_sum)
c(val = max(f), pos = which.max(f))
}

How to force Hmisc package in R to round to 3 decimals?

I have a correlation matrix produced using the Hmisc package. It produces the correlations to 2 decimal places, however I would like it to show 3 decimals places. How do I force it to do so? I am using the rcorr function.
Data:
df <- structure(list(X1 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
X2 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0), Y1 = c(3.93333333333333, 3.13333333333333, 4.3, 4.13333333333333,
3.2, 3.6, 3.66666666666667, 1.8, 3.8, 4.13333333333333, 4.13333333333333,
1.6, 3.4, 3.26666666666667, 2.53333333333333, 4.06666666666667,
4.53333333333333, 4.13333333333333, 3.4, 3.8, 3.33333333333333,
3.86666666666667, 4, 4.2, 2.53333333333333, 1.73333333333333,
1.8, 2.73333333333333, 1.66666666666667, 1.33333333333333,
2, 2.4, 3, 3.26666666666667, 3.2, 3.53333333333333, 3.66666666666667,
2.8, 3.33333333333333, 3.06666666666667, 3.46666666666667,
3.13333333333333, 3.93333333333333, 2.46666666666667, 1.26666666666667,
4.13333333333333, 1.8, 3, 2.93333333333333, 1.53333333333333,
4.06666666666667, 3.6, 2.06666666666667, 4.13333333333333,
3.3, 3.53333333333333, 3.4, 3.93333333333333, 3.73333333333333,
3, 3.13333333333333, 2.2, 4, 5, 3.66666666666667, 3.2, 3.4,
3.8, 3.66666666666667, 4.3, 4.2, 4.46666666666667, 3.33333333333333,
4.4, 4.2), Y2 = c(3.6, 2.2, 3.5, 4.2, 3, 2.8, 5, 2, 4.8,
4.4, 4.6, 1.6, 3.8, 3, 3, 3.4, 3.8, 4.2, 3.4, 3.4, 3.4, 4,
4.8, 4, 2.8, 1, 1.4, 1.2, 1.6, 3.8, 2.2, 1.4, 3.2, 1, 3.4,
3.2, 3.4, 1.8, 3.2, 1, 3, 2.8, 2.4, 1, 1, 4, 1.8, 2, 1, 1.2,
4.4, 3.2, 2, 4.2, 3.2, 3.2, 3.2, 3.6, 2.2, 2.8, 3.4, 2.6,
3.8, 4.2, 2.8, 3, 3.2, 4.8, 4.8, 4, 5, 5, 4.2, 4.6, 4.5)), row.names = c(NA,
-75L), groups = structure(list(filter = c(0, 1, 2), .rows = list(
1:25, 26:50, 51:75)), row.names = c(NA, -3L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
correlation.matrix <- as.matrix(df[, c(
"X1", "X2", "Y1", "Y2")])
cor.table <- Hmisc::rcorr(correlation.matrix)
cor.table
I think that you cannot (natively).
From the source code of Hmisc:::print.rcorr (not exported but used as an S3 method):
P <- ifelse(P < .0001, 0, P)
p <- format(round(P, 4))
The function hard-codes replacing low values with 0 as well as 4 digits in the print. I suggest you open an issue with a feature-request.
In the meantime, here's a replacement that allows you to control it a little:
print.rcorr <- function(x, ..., digits = getOption("Hmisc.rcorr.digits", 4))
{
print(round(x$r,2))
n <- x$n
if(all(n == n[1,1]))
cat("\nn=", n[1,1], "\n\n")
else {
cat("\nn\n")
print(n)
}
cat("\nP\n")
P <- x$P
P <- ifelse(P < .0001, 0, P)
p <- format(round(P, digits))
p[is.na(P)] <- ""
print(p, quote=FALSE)
invisible()
}
print(cor.table, digits=2)
# X1 X2 Y1 Y2
# X1 1.00 -0.50 0.31 0.34
# X2 -0.50 1.00 -0.50 -0.61
# Y1 0.31 -0.50 1.00 0.74
# Y2 0.34 -0.61 0.74 1.00
# n= 75
# P
# X1 X2 Y1 Y2
# X1 0.00 0.01 0.00
# X2 0.00 0.00 0.00
# Y1 0.01 0.00 0.00
# Y2 0.00 0.00 0.00
This "should" always mask Hmisc:::print.rcorr. I added the ability to use an options controller, so you can even do
options("Hmisc.rcorr.digits" = 2)
cor.table
# X1 X2 Y1 Y2
# X1 1.00 -0.50 0.31 0.34
# X2 -0.50 1.00 -0.50 -0.61
# Y1 0.31 -0.50 1.00 0.74
# Y2 0.34 -0.61 0.74 1.00
# n= 75
# P
# X1 X2 Y1 Y2
# X1 0.00 0.01 0.00
# X2 0.00 0.00 0.00
# Y1 0.01 0.00 0.00
# Y2 0.00 0.00 0.00
However, two words of caution:
though I didn't test well and I think it's robust enough, it might be possible for some method of loading packages to use Hmisc:::print.rcorr instead of your global-defined version ... I'll have to think a little about namespaces and search path to know if this can unintentionally happen; and
if/when Hmisc updates their function, you should probably update this function as well. While this is unlikely (the last update to that function was 6 years ago), it is certainly feasible.
Using the print function, you can specify the number of digits you need.
print( cor.table$r, digits=3)

Deleting all rows until certain value - than do the same for the next group

I have a dataset about the returns of stocks in the last 30 years. Now I need to delete all rows(years) for a company until the first row, which isn´t NA. But I need to leave all other rows with NA for that company, that may occur later. Then the code should jump to the next company(Id) and restart the process.
I already tried the following code, but to be honest I´m kind of lost.
cleaning <- function (DT, colnames){
for(cols in colnames)
if(is.na(cols)){
DT[, cols := NULL]
} else {
break
}
}
MergedDT[, cleaning(MergedDT, RET), by = "Id"]
I received the following warning for that code:
> 1: In `[.data.table`(DT, , `:=`(cols, NULL)) : Adding new column
> 'cols' then assigning NULL (deleting it).
Furthermore, I think that there is a way more efficient way to solve that problem.
A combination of group_by, to do the analysis per company (or per cyl in this example) and do to find the first instance in which years (or mpg) is not NA should work:
df <- structure(list(model = c("Datsun 710", "Merc 240D", "Merc 230",
"Fiat 128", "Honda Civic", "Toyota Corolla", "Toyota Corona",
"Fiat X1-9", "Porsche 914-2", "Lotus Europa", "Volvo 142E", "Mazda RX4",
"Mazda RX4 Wag", "Hornet 4 Drive", "Valiant", "Merc 280", "Merc 280C",
"Ferrari Dino", "Hornet Sportabout", "Duster 360", "Merc 450SE",
"Merc 450SL", "Merc 450SLC", "Cadillac Fleetwood", "Lincoln Continental",
"Chrysler Imperial", "Dodge Challenger", "AMC Javelin", "Camaro Z28",
"Pontiac Firebird", "Ford Pantera L", "Maserati Bora"), mpg = c(NA,
NA, NA, NA, NA, 33.9, 21.5, NA, 26, 30.4, 21.4, NA, NA, NA, 18.1,
19.2, 17.8, 19.7, NA, NA, NA, NA, 15.2, 10.4, 10.4, 14.7, 15.5,
15.2, 13.3, 19.2, 15.8, 15), cyl = c(4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8), disp = c(108, 146.7, 140.8, 78.7, 75.7, 71.1, 120.1,
79, 120.3, 95.1, 121, 160, 160, 258, 225, 167.6, 167.6, 145,
360, 360, 275.8, 275.8, 275.8, 472, 460, 440, 318, 304, 350,
400, 351, 301), hp = c(93, 62, 95, 66, 52, 65, 97, 66, 91, 113,
109, 110, 110, 110, 105, 123, 123, 175, 175, 245, 180, 180, 180,
205, 215, 230, 150, 150, 245, 175, 264, 335), drat = c(3.85,
3.69, 3.92, 4.08, 4.93, 4.22, 3.7, 4.08, 4.43, 3.77, 4.11, 3.9,
3.9, 3.08, 2.76, 3.92, 3.92, 3.62, 3.15, 3.21, 3.07, 3.07, 3.07,
2.93, 3, 3.23, 2.76, 3.15, 3.73, 3.08, 4.22, 3.54), wt = c(2.32,
3.19, 3.15, 2.2, 1.615, 1.835, 2.465, 1.935, 2.14, 1.513, 2.78,
2.62, 2.875, 3.215, 3.46, 3.44, 3.44, 2.77, 3.44, 3.57, 4.07,
3.73, 3.78, 5.25, 5.424, 5.345, 3.52, 3.435, 3.84, 3.845, 3.17,
3.57), qsec = c(18.61, 20, 22.9, 19.47, 18.52, 19.9, 20.01, 18.9,
16.7, 16.9, 18.6, 16.46, 17.02, 19.44, 20.22, 18.3, 18.9, 15.5,
17.02, 15.84, 17.4, 17.6, 18, 17.98, 17.82, 17.42, 16.87, 17.3,
15.41, 17.05, 14.5, 14.6), vs = c(1, 1, 1, 1, 1, 1, 1, 1, 0,
1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0), am = c(1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0,
0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1), gear = c(4,
4, 4, 4, 4, 4, 3, 4, 5, 5, 4, 4, 4, 3, 3, 4, 4, 5, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 5, 5), carb = c(1, 2, 2, 1, 2, 1, 1,
1, 2, 2, 2, 4, 4, 1, 1, 4, 4, 6, 2, 4, 3, 3, 3, 4, 4, 4, 2, 2,
4, 2, 4, 8)), row.names = c(NA, -32L), class = c("tbl_df", "tbl",
"data.frame"))
df %>%
group_by(cyl) %>%
do(
.[first(which(!is.na(.$mpg))):nrow(.),]
)
Iiuc, you are looking to trim beginning NA returns for each ID, here is an option:
DT[-DT[,.I[seq_len(match(TRUE, !is.na(RET)) - 1L)], .(ID)]$V1]
output:
ID RET
1: 1 0.02
2: 1 NA
3: 2 0.01
4: 2 NA
5: 3 0.01
6: 3 0.05
7: 3 0.02
data:
DT <- data.table(ID=c(1,1,1,2,2,2,2,3,3,3), RET=c(NA,0.02,NA, NA,NA,0.01,NA, 0.01,0.05,0.02))
DT:
ID RET
1: 1 NA
2: 1 0.02
3: 1 NA
4: 2 NA
5: 2 NA
6: 2 0.01
7: 2 NA
8: 3 0.01
9: 3 0.05
10: 3 0.02
DT[DT[, .I[cumsum(!is.na(RET)) > 0], ID]$V1]
ID RET
1: 1 0.02
2: 1 NA
3: 2 0.01
4: 2 NA
5: 3 0.01
6: 3 0.05
7: 3 0.02
Data (stolen from chinsoon12 (Original question poster failed to provide reproducible data)):
DT <- data.table(ID=c(1,1,1,2,2,2,2,3,3,3), RET=c(NA,0.02,NA, NA,NA,0.01,NA, 0.01,0.05,0.02))

Resources