Below is random data.
drop drop1 drop2 ch
15 14 40 1
20 15 45 1
35 16 90 1
40 17 70 0
25 18 80 0
30 18 90 0
11 20 100 0
13 36 11 0
16 70 220 0
19 40 440 1
25 45 1 1
35 30 70 1
40 40 230 1
17 11 170 1
30 2 160 1
I am using the code below for variable profiling of a continuous variable in R.
library(dplyr)
dt %>%
  mutate(dec = ntile(drop, n = 2)) %>%
  count(ch, dec) %>%
  filter(ch == 1) -> datcbld
datcbld$N <- unclass(dt %>%
  mutate(dec = ntile(drop, n = 2)) %>%
  count(dec) %>%
  unname())[[2]]
datcbld$ch_perc <- datcbld$n / datcbld$N
datcbld$GreaterThan <- unclass(dt %>%
  mutate(dec = ntile(drop, n = 2)) %>%
  group_by(dec) %>%
  summarise(min(drop)))[[2]]
datcbld$LessThan <- unclass(dt %>%
  mutate(dec = ntile(drop, n = 2)) %>%
  group_by(dec) %>%
  summarise(max(drop)))[[2]]
datcbld$Varname <- rep("drop", nrow(datcbld))
And below is the output of the code.
ch dec n N ch_perc GreaterThan LessThan Varname
1 1 4 8 0.5 11 25 drop
1 2 5 7 0.714285714 25 40 drop
This code works perfectly fine when I use it for a single variable.
When I try to run it for each column using a for loop, it is unable to summarise with min and max for each decile.
Below is my code for the for loop.
finaldata <- data.frame()
for(i in 1:(ncol(dt) - 1)){
  dt %>%
    mutate(dec = ntile(dt[, colnames(dt[i])], n = 2)) %>%
    count(ch, dec) %>%
    filter(ch == 1) -> dat
  dat$N <- unclass(dt %>%
    mutate(dec = ntile(dt[, colnames(dt[i])], n = 2)) %>%
    count(dec) %>%
    unname())[[2]]
  dat$ch_perc <- dat$n / dat$N
  dat$GreaterThan <- unclass(dt %>%
    mutate(dec = ntile(dt[, colnames(dt[i])], n = 2)) %>%
    group_by(dec) %>%
    summarise(min(dt[, colnames(dt[i])])))[[2]]
  dat$LessThan <- unclass(dt %>%
    mutate(dec = ntile(dt[, colnames(dt[i])], n = 2)) %>%
    group_by(dec) %>%
    summarise(max(dt[, colnames(dt[i])])))[[2]]
  dat$Varname <- rep(colnames(dt[i]), nrow(dat))
  finaldata <- rbind(finaldata, dat)
}
But I'm unable to get the same result.
We could do this with map by looping over the names; this can be done without breaking the chain (%>%):
library(tidyverse)
names(dt)[1:3] %>%
  map_df(~
    dt %>%
      select(.x, ch) %>%
      mutate(dec = ntile(!! rlang::sym(.x), n = 2)) %>%
      group_by(dec) %>%
      mutate(N = n(),
             GreaterThan = max(!! rlang::sym(.x)),
             LessThan = min(!! rlang::sym(.x))) %>%
      select(-1) %>%
      count(!!! rlang::syms(names(.))) %>%
      filter(ch == 1) %>%
      mutate(ch_perc = n / N,
             Varname = .x))
# A tibble: 6 x 8
# Groups: dec [2]
# dec ch N GreaterThan LessThan n ch_perc Varname
# <int> <int> <int> <dbl> <dbl> <int> <dbl> <chr>
#1 1 1 8 25 11 4 0.5 drop
#2 2 1 7 40 25 5 0.714 drop
#3 1 1 8 18 2 5 0.625 drop1
#4 2 1 7 70 20 4 0.571 drop1
#5 1 1 8 90 1 5 0.625 drop2
#6 2 1 7 440 90 4 0.571 drop2
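As a side note, with dplyr >= 1.0 the same idea can be written with the .data pronoun instead of sym(). This is a hedged sketch rather than the answer's original code, computing the same per-decile summary in one summarise:
names(dt)[1:3] %>%
  map_df(~ dt %>%
           # .data[[.x]] looks up the column named by the string .x
           mutate(dec = ntile(.data[[.x]], n = 2)) %>%
           group_by(dec) %>%
           summarise(N = n(),
                     GreaterThan = max(.data[[.x]]),
                     LessThan = min(.data[[.x]]),
                     n = sum(ch == 1),
                     ch_perc = n / N,
                     .groups = "drop") %>%
           mutate(Varname = .x))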
The issue in the OP's for loop is the use of
dt[, colnames(dt[i])]
within summarise: it applies min or max to the full column instead of applying the function to the column while respecting the group_by structure.
We could convert the column names to symbols as shown above (sym) and evaluate them, or use summarise_at.
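For instance, a minimal summarise_at sketch for a single column (shown with drop; newer dplyr versions favour across() instead):
dt %>%
  mutate(dec = ntile(drop, n = 2)) %>%
  group_by(dec) %>%
  # named list of functions -> columns GreaterThan and LessThan
  summarise_at(vars(drop), list(GreaterThan = min, LessThan = max))
Below is the original loop corrected with sym():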
finaldata <- data.frame()
for(i in 1:(ncol(dt) - 1)){
  dt %>%
    mutate(dec = ntile(dt[, colnames(dt[i])], n = 2)) %>%
    count(ch, dec) %>%
    filter(ch == 1) -> dat
  dat$N <- unclass(dt %>%
    mutate(dec = ntile(dt[, colnames(dt[i])], n = 2)) %>%
    count(dec) %>%
    unname())[[2]]
  dat$ch_perc <- dat$n / dat$N
  dat$GreaterThan <- unclass(dt %>%
    mutate(dec = ntile(dt[, colnames(dt[i])], n = 2)) %>%
    group_by(dec) %>%
    summarise(max(!! rlang::sym(names(dt)[i]))))[[2]]
  dat$LessThan <- unclass(dt %>%
    mutate(dec = ntile(dt[, colnames(dt[i])], n = 2)) %>%
    group_by(dec) %>%
    summarise(min(!! rlang::sym(names(dt)[i]))))[[2]]
  dat$Varname <- rep(colnames(dt[i]), nrow(dat))
  finaldata <- rbind(finaldata, dat)
}
finaldata
# A tibble: 6 x 8
# ch dec n N ch_perc GreaterThan LessThan Varname
# <int> <int> <int> <int> <dbl> <dbl> <dbl> <chr>
#1 1 1 4 8 0.5 25 11 drop
#2 1 2 5 7 0.714 40 25 drop
#3 1 1 5 8 0.625 18 2 drop1
#4 1 2 4 7 0.571 70 20 drop1
#5 1 1 5 8 0.625 90 1 drop2
#6 1 2 4 7 0.571 440 90 drop2
data
dt <- structure(list(drop = c(15L, 20L, 35L, 40L, 25L, 30L, 11L, 13L,
16L, 19L, 25L, 35L, 40L, 17L, 30L), drop1 = c(14L, 15L, 16L,
17L, 18L, 18L, 20L, 36L, 70L, 40L, 45L, 30L, 40L, 11L, 2L), drop2 = c(40L,
45L, 90L, 70L, 80L, 90L, 100L, 11L, 220L, 440L, 1L, 70L, 230L,
170L, 160L), ch = c(1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L,
1L, 1L, 1L, 1L)), .Names = c("drop", "drop1", "drop2", "ch"),
class = "data.frame", row.names = c(NA,
-15L))
Related
I am looking to extract timepoints from a table.
The output should be the starting point in seconds (from column 2) and the duration of the series, but only if the stage lasts for at least 3 minutes according to the seconds column, i.e. the same stage (0, 1, 2, 3 or 5) repeats for more than 6 consecutive lines of the stage column.
So in this case the 0-series does not qualify, while the following 1-series does.
The desired output would be: 150, 8
starting at timepoint 150 and lasting for 8 rows.
I was experimenting with rle(), but haven't been successful yet.
Stage Seconds
0 0
0 30
0 60
0 90
0 120
1 150
1 180
1 210
1 240
1 270
1 300
1 330
1 360
1 390
0 420
Not sure how representative of your data this might be, but this may be an option using dplyr:
library(dplyr)
df %>%
  mutate(grp = c(0, cumsum(abs(diff(stage))))) %>% # new group id at each stage change
  filter(stage == 1) %>%
  group_by(grp) %>%
  mutate(count = n() - 1) %>% # run length minus one, per the question's convention
  filter(row_number() == 1, count >= 6) %>% # keep the first row of runs longer than 6
  ungroup() %>%
  select(-c(grp, stage))
#> # A tibble: 4 x 2
#> seconds count
#> <dbl> <dbl>
#> 1 960 16
#> 2 1500 7
#> 3 2040 17
#> 4 2670 10
Created on 2021-09-23 by the reprex package (v2.0.0)
data
set.seed(123)
df <- data.frame(stage = sample(c(0, 1), 100, replace = TRUE, prob = c(0.2, 0.8)),
seconds = seq(0, by = 30, length.out = 100))
Similarly, you can use data.table::rleid() with dplyr:
df <- structure(list(Stage = c(0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 0L), Seconds = c(0L, 30L, 60L, 90L, 120L,
150L, 180L, 210L, 240L, 270L, 300L, 330L, 360L, 390L, 420L)), class = "data.frame", row.names = c(NA,
-15L))
library(dplyr)
library(data.table)
df %>%
filter(Seconds > 0) %>%
group_by(grp = rleid(Stage)) %>%
filter(n() > 6)
#> # A tibble: 9 x 3
#> # Groups: grp [1]
#> Stage Seconds grp
#> <int> <int> <int>
#> 1 1 150 2
#> 2 1 180 2
#> 3 1 210 2
#> 4 1 240 2
#> 5 1 270 2
#> 6 1 300 2
#> 7 1 330 2
#> 8 1 360 2
#> 9 1 390 2
Created on 2021-09-23 by the reprex package (v2.0.0)
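If you want to collapse each qualifying run to the single start/duration pair asked for in the question, here is a hedged follow-up sketch (using the question's convention of duration = rows - 1):
df %>%
  filter(Seconds > 0) %>%
  group_by(grp = rleid(Stage)) %>%
  filter(n() > 6) %>%
  summarise(start = first(Seconds), duration = n() - 1)
# for this data: start = 150, duration = 8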
I have an example dataset:
Road Start End Cat
1 0 50 a
1 50 60 b
1 60 90 b
1 70 75 a
2 0 20 a
2 20 25 a
2 25 40 b
Trying to output the following:
Road Start End Cat
1 0 50 a
1 50 90 b
1 70 75 a
2 0 25 a
2 25 40 b
My code doesn't work:
df %>%
  group_by(Road, Cat) %>%
  summarise(
    min(Start),
    max(End)
  )
How can I achieve the results I want?
We can use rleid from data.table to get the run-length id for grouping and then do the summarise:
library(dplyr)
library(data.table)
df %>%
group_by(Road, grp = rleid(Cat)) %>%
summarise(Cat = first(Cat), Start = min(Start), End = max(End)) %>%
select(-grp)
# A tibble: 5 x 4
# Groups: Road [2]
# Road Cat Start End
# <int> <chr> <int> <int>
#1 1 a 0 50
#2 1 b 50 90
#3 1 a 70 75
#4 2 a 0 25
#5 2 b 25 40
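Note the result is still grouped by Road; as a hedged tweak, add ungroup() if a plain tibble is preferred:
df %>%
  group_by(Road, grp = rleid(Cat)) %>%
  summarise(Cat = first(Cat), Start = min(Start), End = max(End)) %>%
  ungroup() %>%
  select(-grp)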
Or using data.table methods
library(data.table)
setDT(df)[, .(Start = min(Start), End = max(End)), .(Road, Cat, grp = rleid(Cat))]
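The helper grp column remains in the data.table result; as a sketch, chain another [] to drop it:
setDT(df)[, .(Start = min(Start), End = max(End)),
          .(Road, Cat, grp = rleid(Cat))][, grp := NULL][]
# the trailing [] forces printing after the := update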
data
df <- structure(list(Road = c(1L, 1L, 1L, 1L, 2L, 2L, 2L), Start = c(0L,
50L, 60L, 70L, 0L, 20L, 25L), End = c(50L, 60L, 90L, 75L, 20L,
25L, 40L), Cat = c("a", "b", "b", "a", "a", "a", "b")),
class = "data.frame", row.names = c(NA,
-7L))
The pattern of my data is like this:
df1<-read.table(text="Car1 Car2 Car3 Time1 Time2 Time3
22 33 90 20 90 20
11 45 88 10 80 30
22 33 40 40 10 10
11 45 40 10 10 40
11 45 88 10 12 60
22 45 90 60 20 100",header=TRUE)
I want to calculate the mean and SD based on Car and Time. The point is that Car1 corresponds to Time1, Car2 corresponds to Time2, Car3 corresponds to Time3, and so on.
I want to get the following table:
Car1 Mean SD
11 10 0
22 40 20
Car2
33 xx xx
45 xx xx
Car3
40 xx xx
88 xx xx
90 xx xx
I have tried:
df1 %>% group_by(Car1, Car2, Car3) %>%
  summarise(mean = mean(Time), SD = sd(Time))
Unfortunately, it does not work. Any help?
You can also use the package data.table:
library(data.table)
melt(setDT(df1),
measure = patterns("Car", "Time"),
value.name = c("Car", "Time"),
variable.name = "group"
)[, .(Mean = mean(Time), Sd = sd(Time)), .(group, Car)]
# group Car Mean Sd
# 1: 1 22 40.0 20.00000
# 2: 1 11 10.0 0.00000
# 3: 2 33 50.0 56.56854
# 4: 2 45 30.5 33.28163
# 5: 3 90 60.0 56.56854
# 6: 3 88 45.0 21.21320
# 7: 3 40 25.0 21.21320
Here is one option with pivot_longer: we reshape from 'wide' to 'long' format, group by the 'group' index and 'Car', then get the mean and sd of 'Time' with summarise. The names_sep regex splits each column name between the trailing letter and the digits, so Car1 becomes .value = "Car", group = "1".
library(dplyr)
library(tidyr)
df1 %>%
pivot_longer(cols = everything(), names_to = c(".value", "group"),
names_sep="(?<=[a-z])(?=\\d+)") %>%
group_by(group, Car) %>%
summarise(Mean = mean(Time), SD = sd(Time))
# A tibble: 7 x 4
# Groups: group [3]
# group Car Mean SD
# <chr> <int> <dbl> <dbl>
#1 1 11 10 0
#2 1 22 40 20
#3 2 33 50 56.6
#4 2 45 30.5 33.3
#5 3 40 25 21.2
#6 3 88 45 21.2
#7 3 90 60 56.6
Assuming you can easily segregate your data into Car and Time columns, you can do this with a loop, given data in the structure you provided.
cars <- df1[1:3]  # Car1, Car2, Car3
Time <- df1[4:6]  # Time1, Time2, Time3
ls <- list()
for(i in 1:ncol(cars)) {
  # mean and sd of the i-th Time column, grouped by the i-th Car column
  ls[[i]] <- aggregate(Time[i], by = cars[i], FUN = function(x) c(mean(x), sd(x)))
}
ls
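Note that aggregate() with a FUN returning two values stores them as a matrix column; as a hedged sketch, each list element can be flattened into ordinary columns like this:
out <- do.call(data.frame, ls[[1]])  # expands the matrix column
names(out) <- c("Car1", "Mean", "SD")
out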
Data for the results is:
df1 <- structure(list(Car1 = c(22L, 11L, 22L, 11L, 11L, 22L), Car2 = c(33L,
45L, 33L, 45L, 45L, 45L), Car3 = c(90L, 88L, 40L, 40L, 88L, 90L
), Time1 = c(20L, 10L, 40L, 10L, 10L, 60L), Time2 = c(90L, 80L,
10L, 10L, 12L, 20L), Time3 = c(20L, 30L, 10L, 40L, 60L, 100L)), class = "data.frame", row.names = c(NA,
-6L))
Another option is to split the data frame by the numeric suffix of the column names with split.default() and summarise each Car/Time pair:
lapply(split.default(df1, gsub("\\D+", "", names(df1))), function(x){
d = gsub("\\D+", "", names(x)[1])
x %>%
group_by(!!sym(paste0("Car", d))) %>%
summarise(mean = mean(!!sym(paste0("Time", d))),
sd = sd(!!sym(paste0("Time", d)))) %>%
ungroup()
})
I have a data frame like this:
ID TIME AMT CONC
1 0 10 2
1 1 0 1
1 5 20 15
1 10 0 30
1 12 0 16
I want to subset the data for each subject ID, from the last time when AMT > 0 to the last row of the data frame for that individual.
The output should be this:
ID TIME AMT CONC
1 5 20 15
1 10 0 30
1 12 0 16
I am using RStudio.
We can use slice and create a sequence between the max index where AMT > 0 and the last index for each ID.
library(dplyr)
df %>%
group_by(ID) %>%
slice(max(which(AMT > 0)) : n())
# ID TIME AMT CONC
# <int> <int> <int> <int>
#1 1 5 20 15
#2 1 10 0 30
#3 1 12 0 16
We can use filter
library(dplyr)
df %>%
group_by(ID) %>%
mutate(ind = cumsum(AMT > 0)) %>%
filter(ind == max(ind), ind > 0) %>%
select(-ind)
# A tibble: 3 x 4
# Groups: ID [1]
# ID TIME AMT CONC
# <int> <int> <int> <int>
#1 1 5 20 15
#2 1 10 0 30
#3 1 12 0 16
NOTE: This also works well when all the elements of 'AMT' are 0 for a particular group:
df$ID[4:5] <- 2
df$AMT <- 0
df$AMT[4:5] <- c(1, 0)
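Re-running the pipeline on this modified data drops group 1 (AMT all 0) and keeps group 2 from its last AMT > 0 onward; a quick check sketch, with the expected rows worked out by hand:
df %>%
  group_by(ID) %>%
  mutate(ind = cumsum(AMT > 0)) %>%
  filter(ind == max(ind), ind > 0) %>%
  select(-ind)
# ID TIME AMT CONC
#  2   10   1   30
#  2   12   0   16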
Or another option with fewer steps: which.max(cumsum(AMT > 0)) returns the first row index at which the cumulative count of AMT > 0 reaches its maximum, i.e. the last row where AMT > 0.
df %>%
group_by(ID) %>%
filter(row_number() >= which.max(cumsum(AMT > 0)))
data
df <- structure(list(ID = c(1L, 1L, 1L, 1L, 1L), TIME = c(0L, 1L, 5L,
10L, 12L), AMT = c(10L, 0L, 20L, 0L, 0L), CONC = c(2L, 1L, 15L,
30L, 16L)), class = "data.frame", row.names = c(NA, -5L))
structure(list(group = c(17L, 17L, 17L, 18L, 18L, 18L, 18L, 19L,
19L, 19L, 20L, 20L, 20L, 21L, 21L, 22L, 23L, 24L, 25L, 25L, 25L,
26L, 27L, 27L, 27L, 28L), var = c(74L, 49L, 1L, 74L, 1L, 49L,
61L, 49L, 1L, 5L, 5L, 1L, 44L, 44L, 12L, 13L, 5L, 5L, 1L, 1L,
4L, 4L, 1L, 1L, 1L, 49L), first = c(0, 0, 1, 0, 1, 0, 0, 0, 1,
0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0)), .Names = c("group",
"var", "first"), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-26L))
With the data from the first two columns I would like to create a third column (called first) where first == 1 only when var == 1 for the first time in a group. In other words, I would like to mark the first element within each group that fulfils var == 1. How can I do that in dplyr? Certainly group_by should be used, but what next?
library(dplyr)
df$first = NULL
df %>%
group_by(group) %>%
mutate(first = as.numeric(row_number() == min(row_number()[var == 1]))) %>%
ungroup()
# # A tibble: 26 x 3
# group var first
# <int> <int> <dbl>
# 1 17 74 0
# 2 17 49 0
# 3 17 1 1
# 4 18 74 0
# 5 18 1 1
# 6 18 49 0
# 7 18 61 0
# 8 19 49 0
# 9 19 1 1
# 10 19 5 0
# # ... with 16 more rows
The idea is to flag the minimum row number where var = 1, within each group.
This will return some warnings, because in some groups there are no var = 1 cases.
Another option would be this:
library(dplyr)
df$first = NULL
# create row id
df$id = seq_along(df$group)
df %>%
filter(var == 1) %>% # keep cases where var = 1
distinct(group, .keep_all = T) %>% # keep distinct cases based on group
mutate(first = 1) %>% # create first column
right_join(df, by=c("id","group","var")) %>% # join back original dataset
mutate(first = coalesce(first, 0)) %>% # replace NAs with 0
select(-id) # remove row id
# # A tibble: 26 x 3
# group var first
# <int> <int> <dbl>
# 1 17 74 0
# 2 17 49 0
# 3 17 1 1
# 4 18 74 0
# 5 18 1 1
# 6 18 49 0
# 7 18 61 0
# 8 19 49 0
# 9 19 1 1
#10 19 5 0
# # ... with 16 more rows
For ungrouped data, one solution is
first_equal_to = function(x, value)
(x == value) & (cumsum(x == value) == 1)
so
tbl %>% group_by(group) %>% mutate(first = first_equal_to(var, 1))
(it seems appropriate to keep this as a logical vector, since that is what the column represents).
Another implementation is
first_equal_to2 = function(x, value) {
result = logical(length(x))
result[match(value, x)] = TRUE
result
}
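As a usage sketch, it drops in the same way as first_equal_to above (note that match() assumes the value occurs in each group; a group without a match would error on the NA index):
tbl %>% group_by(group) %>% mutate(first = first_equal_to2(var, 1))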
We can also construct first with a magrittr expression chain inside mutate:
DF %>%
group_by(group) %>%
mutate(first = { var == 1 } %>% { . * !duplicated(.) } ) %>%
ungroup
giving:
# A tibble: 26 x 3
group var first
<int> <int> <int>
1 17 74 0
2 17 49 0
3 17 1 1
4 18 74 0
5 18 1 1
6 18 49 0
7 18 61 0
8 19 49 0
9 19 1 1
10 19 5 0
# ... with 16 more rows