I have a dataframe with group, value, and columns based on the rollapply mean of the last n values, like this:
library(dplyr); library(zoo)
df = data.frame(group = c(rep(1, 5), rep(2, 5)),
                value = c(23, 14, 53, 12, 56, 32, 65, 76, 36, 74)) %>%
  group_by(group) %>%
  mutate(
    roll1 = rollapplyr(value, 1, mean, fill = NA, na.rm = TRUE, partial = FALSE),
    roll2 = rollapplyr(value, 2, mean, fill = NA, na.rm = TRUE, partial = FALSE),
    roll3 = rollapplyr(value, 3, mean, fill = NA, na.rm = TRUE, partial = FALSE)
  )
df
group value roll1 roll2 roll3
1 1 23 23 NA NA
2 1 14 14 18.5 NA
3 1 53 53 33.5 30
4 1 12 12 32.5 26.3
5 1 56 56 34 40.3
6 2 32 32 NA NA
7 2 65 65 48.5 NA
8 2 76 76 70.5 57.7
9 2 36 36 56 59
10 2 74 74 55 62
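For reference, a minimal sketch of how rollapplyr behaves here: windows are right-aligned, and with fill = NA and partial = FALSE any incomplete window yields NA.
library(zoo)
# 3-wide right-aligned windows; the first two are incomplete, so fill = NA applies
rollapplyr(1:5, 3, mean, fill = NA, partial = FALSE)
#> [1] NA NA  2  3  4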
The 'rolln' column represents the average of the last n values.
I would then like to summarize, in a new dataframe, which set of values produced the highest average, bearing in mind that the roll3 column, for example, corresponds to a set of 3 values.
I tried the which.max function, but without success. The position of the NAs in the final data.frame isn't important.
Thanks in advance
I'd love to see a more concise solution, but this seems to work:
library(tidyverse)
df %>%
  pivot_longer(starts_with("roll"), values_to = "avg") %>%
  filter(!is.na(avg)) %>%
  group_by(group, name) %>%
  filter(slider::slide_dbl(avg, max, .after = 2) == max(avg)) %>% # EDIT #2
  # filter(avg == max(avg) |
  #          lead(avg, default = 0) == max(avg) |
  #          lead(avg, 2, default = 0) == max(avg)) %>%
  mutate(items = n() + 1 - parse_number(name)) %>% # EDIT
  slice(items:n()) %>%
  mutate(row = row_number()) %>%
  select(-avg, -items) %>%
  pivot_wider(names_from = name, values_from = value)
Result
group row roll1 roll2 roll3
<dbl> <int> <dbl> <dbl> <dbl>
1 1 1 56 12 53
2 1 2 NA 56 12
3 1 3 NA NA 56
4 2 1 76 65 76
5 2 2 NA 76 36
6 2 3 NA NA 74
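The slider call is the workhorse here: slide_dbl(avg, max, .after = 2) computes a forward-looking rolling maximum, so the filter keeps each row from which the group maximum is at most two positions ahead. A minimal sketch of just that call:
library(slider)
x <- c(1, 5, 3, 2, 4)
# max over the current value and up to two values after it
slide_dbl(x, max, .after = 2)
#> [1] 5 5 4 4 4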
library(tidyverse)
set.seed(10)
dat <- data.frame(age = sample(14:79, size = 15, replace = TRUE),
                  sex = sample(c("m", "f"), size = 15, replace = TRUE),
                  region = sample(c("A", "B", "C"), size = 15, replace = TRUE),
                  var1 = runif(15, min = 0, max = 100)) %>%
  mutate(agegrp = cut(age, breaks = c(-Inf, 20, 50, 70, Inf),
                      labels = c("<= 20", "21-50", "51-70", ">70")))
The table looks like this:
age sex region var1 agegrp
1 79 m A 64.669975 >70
2 78 m C 92.789062 >70
3 23 m A 69.626845 21-50
4 25 m C 5.074013 21-50
5 60 f C 10.340510 51-70
6 36 f B 90.294240 21-50
7 23 m A 12.769088 21-50
8 27 f A 43.892321 21-50
9 35 f B 99.793467 21-50
10 40 f C 94.284903 21-50
11 25 m A 98.829001 21-50
12 55 m A 98.007185 51-70
13 43 f A 37.491168 21-50
14 68 m A 90.051414 51-70
15 76 f B 13.567239 >70
Unfortunately, our customer needs the data to be in a pretty weird format like this:
split value var1_mean
1 agegrp 21-50 61.3
2 agegrp 51-70 66.1
3 agegrp >70 57.0
4 sex m 55.7
5 sex f 66.5
6 region A 64.4
7 region B 67.9
8 region C 50.6
I can easily do this with the following code, but it is very inelegant:
age <- dat %>%
  group_by(agegrp) %>%
  summarise(var1_mean = mean(var1)) %>%
  mutate(value = agegrp,
         split = "agegrp") %>%
  select(split, value, var1_mean)

sex <- dat %>%
  group_by(sex) %>%
  summarise(var1_mean = mean(var1)) %>%
  mutate(value = sex,
         split = "sex") %>%
  select(split, value, var1_mean)

region <- dat %>%
  group_by(region) %>%
  summarise(var1_mean = mean(var1)) %>%
  mutate(value = region,
         split = "region") %>%
  select(split, value, var1_mean)

rbind(age, sex, region)
Is there a way to make this easier without "stacking" several tables manually (maybe using dplyr)?
You can pivot to long and summarise or, alternatively, iterate over the vars of interest:
library(dplyr)
library(purrr)
library(tidyr)
dat %>%
  pivot_longer(-c(var1, age), names_to = "split") %>%
  group_by(split, value) %>%
  summarise(var1_mean = mean(var1))
Or:
map_df(set_names(c("agegrp", "sex", "region")),
       ~ dat %>%
         group_by(across(all_of(.x))) %>%
         summarise(var1_mean = mean(var1)) %>%
         rename(value = all_of(.x)),
       .id = "split")
# A tibble: 8 × 3
split value var1_mean
<chr> <chr> <dbl>
1 agegrp <= 20 54.6
2 agegrp 21-50 44.7
3 agegrp 51-70 46.4
4 sex f 37.4
5 sex m 55.5
6 region A 67.3
7 region B 47.9
8 region C 26.1
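The across(all_of(.x)) pattern is what lets group_by() take a column named by a string; in isolation, a minimal sketch (group_col is a hypothetical variable):
library(dplyr)
group_col <- "sex" # hypothetical: any grouping column name as a string
dat %>%
  group_by(across(all_of(group_col))) %>%
  summarise(var1_mean = mean(var1))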
Loop through columns and aggregate, then rowbind the results:
do.call(rbind,
        lapply(c("agegrp", "sex", "region"), function(i) {
          cbind(split = i,
                setNames(aggregate(as.formula(paste("age ~", i)),
                                   data = dat, FUN = mean),
                         c("value", "var1_mean")))
        }))
# split value var1_mean
# 1 agegrp <= 20 20.00000
# 2 agegrp 21-50 33.30000
# 3 agegrp 51-70 60.50000
# 4 sex f 39.62500
# 5 sex m 39.71429
# 6 region A 35.75000
# 7 region B 39.00000
# 8 region C 43.60000
Note: this aggregates age rather than var1 (swap in paste("var1 ~", i) for the var1 means; see the sketch below), and the printed output also differs because the seeded data here evidently do not match the data shown above (sample() results changed in R 3.6).
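For completeness, a sketch of the same loop aggregating var1 (the quantity var1_mean asks for); on the data as displayed, the means should match the desired table at the top of the question:
do.call(rbind,
        lapply(c("agegrp", "sex", "region"), function(i) {
          cbind(split = i,
                setNames(aggregate(as.formula(paste("var1 ~", i)),
                                   data = dat, FUN = mean),
                         c("value", "var1_mean")))
        }))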
How can I do the following:

1. replace all values < 6 with NA,
2. if there is only one NA in the row, replace the first instance of the minimum value with -99?
Some data that includes an ID variable and a total column:
library(tidyverse)
df <- data.frame(id = c(1, 2, 3, 4, 5),
                 a = c(10, 12, 4, 17, 3),
                 b = c(9, 12, 3, 20, 6),
                 c = c(2, 2, 10, 10, 10),
                 d = c(12, 16, 12, 10, 12))
df$total <- apply(df[, 2:5], 1, sum)
Giving
id a b c d total
1 10 9 2 12 33
2 12 12 2 16 42
3 4 3 10 12 29
4 17 20 10 10 57
5 3 6 10 12 31
My desired output is
id a b c d total
1 10 -99 NA 12 33
2 -99 12 NA 16 42
3 NA NA 10 12 29
4 17 20 10 10 57
5 NA -99 10 12 31
My attempt
df_mod <- df %>%
  # Make < 6 NA
  mutate(
    across(
      .cols = a:total,
      ~ case_when(
        .x < 6 ~ NA_real_,
        TRUE ~ .x
      )
    )
  ) %>%
  # Add a count of NAs
  rowwise() %>%
  mutate(Count_NA = sum(is.na(cur_data()))) %>%
  ungroup()

# Transpose and get the row minimum
df_mod2 <- t(df_mod[, -c(1, ncol(df_mod))]) %>%
  apply(., 2, function(a) {
    min(a, na.rm = TRUE)
  }) %>%
  cbind(df_mod, .) %>%
  rename(., min = .) %>%
  as_tibble()

# If Count_NA = 1, replace the first instance of min
df_mod2 %>%
  rowwise() %>%
  mutate(
    across(
      .cols = a:total,
      ~ case_when(
        Count_NA == 1 & .x == min ~ replace(.x, first(match(min, .x)), -99),
        TRUE ~ .x)
    )
  ) %>%
  select(-Count_NA, -min)
Which gives the following
id a b c d total
1 10 -99 NA 12 33
2 -99 -99 NA 16 42
3 NA NA 10 12 29
4 17 20 10 10 57
5 NA -99 10 12 31
Thanks
If you're willing to pivot rather than work rowwise, then this solution will work.
library(dplyr)
library(tidyr)
df %>%
  pivot_longer(names_to = 'col',
               values_to = 'val',
               -c(id, total)) %>%
  group_by(id) %>%
  mutate(val2 = rank(val, ties.method = 'first'),
         val = ifelse(val < 6, NA, val),
         val = ifelse(sum(is.na(val)) == 1 & val2 == 2, -99, val)) %>%
  select(-val2) %>%
  pivot_wider(names_from = col,
              values_from = val) %>%
  relocate(total, .after = "d")
Here's the result:
# A tibble: 5 × 6
# Groups: id [5]
id a b c d total
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 10 -99 NA 12 33
2 2 -99 12 NA 16 42
3 3 NA NA 10 12 29
4 4 17 20 10 10 57
5 5 NA -99 10 12 31
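The rank trick is the heart of this answer: ranks are computed before the NA replacement, so when exactly one value is below 6 (rank 1), the remaining minimum has rank 2, and ties.method = 'first' breaks ties by position. A quick sketch on row 2's values:
x <- c(12, 12, 2, 16) # row 2 of a:d
rank(x, ties.method = "first")
#> [1] 2 3 1 4   # only the first 12 has rank 2, so only it becomes -99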
It is not entirely clear whether you mean the minimum or the second-smallest value: after replacing values < 6 with NA, the value you mark with -99 is the minimum of what remains. You can use data.table:
library(data.table)
cols <- setdiff(names(df), "id")
setDT(df)[
  ,
  (cols) := transpose(
    lapply(
      transpose(lapply(.SD, function(x) fifelse(x < 6, NA_real_, x))),
      function(x) if (sum(is.na(x)) == 1) replace(x, which.min(x), -99) else x
    )
  ),
  .SDcols = cols
]
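This relies on two base-R behaviours worth noting: which.min() skips NAs, and on ties it returns the first index, so only the first instance of the minimum gets replaced. A quick sketch on row 2 after the NA step:
x <- c(12, 12, NA, 16)
which.min(x) # NAs are ignored; ties resolve to the first position
#> [1] 1
replace(x, which.min(x), -99)
#> [1] -99  12  NA  16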
I'm trying to group a dataset and get the first and highest values based on two separate measures of time and speed. So I need the time and speed for the earliest record in each group and then the time and speed for the fastest record in each group. I've got this far but need some help...
library(tidyverse)
group <- c(1, 1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 4, 4, 4, 4)
time <- c(1, 6, 4, 5, 7, 12, 10, 2, 3, 8, 9, 11, 13, 14, 15)
speed <- c(17, 6, 99, 34, 12, 5, 67, 43, 23, 12, 15, 78, 61, 78, 20)
data = data.frame(group, time, speed)

summary = data %>%
  group_by(group) %>%
  summarise(
    firstTime = ,         # lowest time
    HighestSpeedTime = ,  # time for highest speed
    firstSpeed = ,        # speed for lowest time
    highestSpeed = max(speed) # highest speed
  )
Update:
This should work. In group 4 we have ties, therefore 2 rows (the highest speed occurs at two time points):
library(dplyr)
data %>%
  group_by(group) %>%
  summarise(
    firstTime = min(time),                               # lowest time
    HighestSpeedTime = time[which(speed == max(speed))], # time for highest speed
    firstSpeed = speed[which(time == min(time))],        # speed for lowest time
    highestSpeed = max(speed)                            # highest speed
  )
output:
group firstTime HighestSpeedTime firstSpeed highestSpeed
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 4 17 99
2 2 10 10 67 67
3 3 2 2 43 43
4 4 8 11 12 78
5 4 8 14 12 78
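Note that returning a length-two vector from summarise() is what expands group 4 into two rows; recent dplyr versions warn about this and recommend reframe() for multi-row summaries. A minimal sketch of the equivalent, assuming dplyr >= 1.1:
library(dplyr)
data %>%
  group_by(group) %>%
  reframe(HighestSpeedTime = time[speed == max(speed)]) # one row per tie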
Does this work?
library(tidyverse)
group <- c(1,1,1,1,1,2,2,3,3,4,4,4,4,4,4)
time <- c(1,6,4,5,7,12,10,2,3,8,9,11,13,14,15)
speed <- c(17,6, 99, 34, 12, 5, 67, 43, 23, 12, 15, 78, 61, 78, 20)
data = data.frame(group, time, speed)
summary <- data |>
  arrange(group, time) |>
  group_by(group) |>
  summarise(
    firsttime = min(time),
    highest_speed = max(speed)
  ) |>
  left_join(data, by = c("group", "highest_speed" = "speed")) |>
  group_by(group) |>
  slice(1) |>
  rename(highest_speed_time = time) |>
  left_join(data, by = c("group", "firsttime" = "time")) |>
  rename(first_speed = speed)
summary
# group firsttime highest_speed highest_speed_time first_speed
# <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 99 4 17
# 2 10 67 10 67
# 3 2 43 2 43
# 4 8 78 11 12
Here is a data.table approach:
library(data.table)
setDT(data)

temp <- data[data[, .I[speed == max(speed)], by = .(group)]$V1]
setnames(temp, new = c("group", "maxSpeedTime", "maxSpeed"))

# join together; time[1]/speed[1] take each group's first row,
# which is the earliest record only if rows are sorted by time
data[, .(firstTime = time[1],
         firstSpeed = speed[1]),
     by = .(group)][temp, on = .(group)]
# group firstTime firstSpeed maxSpeedTime maxSpeed
# 1: 1 1 17 4 99
# 2: 2 12 5 10 67
# 3: 3 2 43 2 43
# 4: 4 8 12 11 78
# 5: 4 8 12 14 78
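The subsetting line uses the .I idiom: the inner data[, .I[speed == max(speed)], by = .(group)] returns the global row numbers of each group's maxima (as column V1), and the outer data[...] picks those rows. In isolation, a sketch:
library(data.table)
dt <- data.table(g = c(1, 1, 2), v = c(3, 7, 5))
idx <- dt[, .I[v == max(v)], by = g]$V1 # global row numbers of per-group maxima
idx
#> [1] 2 3
dt[idx]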
Another solution, with a chained inner_join:
library(tidyverse)
data %>%
  group_by(group) %>%
  summarise(firstTime = min(time)) %>%
  inner_join(data, by = c("group", "firstTime" = "time")) %>%
  rename(firstSpeed = speed) %>%
  inner_join(
    data %>%
      group_by(group) %>%
      summarise(highestSpeed = max(speed)) %>%
      inner_join(data, by = c("group", "highestSpeed" = "speed"))
  ) %>%
  relocate(highestTime = time, .before = "highestSpeed")
#> Joining, by = "group"
#> # A tibble: 5 × 5
#> group firstTime firstSpeed highestTime highestSpeed
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 17 4 99
#> 2 2 10 67 10 67
#> 3 3 2 43 2 43
#> 4 4 8 12 11 78
#> 5 4 8 12 14 78
An alternative solution, based on purrr::map_dfr:
library(tidyverse)
data %>%
  group_split(group) %>%
  map_dfr(
    ~ data.frame(
      group = .x$group[1],
      firstTime = .x$time[min(.x$time) == .x$time],
      firstSpeed = .x$speed[min(.x$time) == .x$time],
      highestTime = .x$time[max(.x$speed) == .x$speed],
      highestSpeed = .x$speed[max(.x$speed) == .x$speed]))
#> group firstTime firstSpeed highestTime highestSpeed
#> 1 1 1 17 4 99
#> 2 2 10 67 10 67
#> 3 3 2 43 2 43
#> 4 4 8 12 11 78
#> 5 4 8 12 14 78
And more succinctly:
library(tidyverse)
data %>%
  group_split(group) %>%
  map_dfr(~ data.frame(
    group = integer(), firstTime = integer(), firstSpeed = integer(),
    highestTime = integer(), highestSpeed = integer()) %>%
    add_row(!!!setNames(c(.x$group[1], .x[min(.x$time) == .x$time, -1],
                          .x[max(.x$speed) == .x$speed, -1]), names(.))))
#> group firstTime firstSpeed highestTime highestSpeed
#> 1 1 1 17 4 99
#> 2 2 10 67 10 67
#> 3 3 2 43 2 43
#> 4 4 8 12 11 78
#> 5 4 8 12 14 78
Here is a reproducible example of the situation I need help for. I have a database (db1) in which weekly ratings of behavioral outcomes are recorded. The variable "Week" corresponds to the number of the week from the beginning of the year (e.g., Week = 1 indicates the week between January 1st and 7th, and so on...) and the variable "Score" to the value obtained by the subject on the criterion measure. In the real data set, I have several participants and a different number of ratings for each subject; however, in this example there is only one subject to make things easier.
library(magrittr)
x1 <- c(14, 18, 19, 20, 21, 23, 24, 25)
y1 <- c(34, 21, 45, 32, 56, 45, 23, 48)
db1 <- cbind(x1, y1) %>% as.data.frame() %>% setNames(c("Week", "Score"))
db1
# Week Score
#1 14 34
#2 18 21
#3 19 45
#4 20 32
#5 21 56
#6 23 45
#7 24 23
#8 25 48
What I need to do is identify the highest number of ratings that occurred in consecutive weeks in the database. In the example, the highest number is 4, because the ratings were consecutive from week 18 to week 21 (a compact way to compute this count directly is sketched after the table below). Here I added a column for demonstration, but it might not be necessary for the solution.
x2 <- c(14, 18, 19, 20, 21, 23, 24, 25)
y2 <- c(34, 21, 45, 32, 56, 45, 23, 48)
z2 <- c(1, 1, 2, 3, 4, 1, 2, 3)
db2 <- cbind(x2, y2, z2) %>% as.data.frame() %>% setNames(c("Week", "Score", "Consecutive"))
db2
# Week Score Consecutive
#1 14 34 1
#2 18 21 1
#3 19 45 2
#4 20 32 3
#5 21 56 4
#6 23 45 1
#7 24 23 2
#8 25 48 3
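For the counting step alone, a compact base-R sketch using rle() on the week gaps (assuming, as above, that weeks are sorted and unique):
gaps_of_one <- diff(db1$Week) == 1
runs <- rle(gaps_of_one)
# a run of n consecutive gaps of 1 spans n + 1 weeks
max(runs$lengths[runs$values]) + 1
#> [1] 4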
Finally, because every subject has to have a total of five consecutive ratings, I need to add a row with a missing datum where the highest number of consecutive weeks is below five (so that I can impute the missing data later on). However, there might be ratings before and after the sequence. In that case, I want to add the row on whichever side of the longest run is closer to another existing rating. In the example, the row with the missing datum will be added after week 21, because 3 weeks are missing between weeks 14 and 18 whereas only 1 is missing between weeks 21 and 23.
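A sketch of that placement rule in isolation, assuming the boundaries of the longest run (weeks 18 and 21) are already known:
run <- c(18, 21)                          # first and last week of the longest run
other <- setdiff(db1$Week, run[1]:run[2]) # remaining rated weeks: 14, 23, 24, 25
left_gap <- run[1] - max(other[other < run[1]])  # 18 - 14 = 4
right_gap <- min(other[other > run[2]]) - run[2] # 23 - 21 = 2
if (right_gap <= left_gap) run[2] + 1 else run[1] - 1 # week to receive the NA row
#> [1] 22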
x3 <- c(14, 18, 19, 20, 21, 22, 23, 24, 25)
y3 <- c(34, 21, 45, 32, 56, NA, 45, 23, 48)
z3 <- c(1, 1, 2, 3, 4, 5, 1, 2, 3)
db3 <- cbind(x3, y3, z3) %>% as.data.frame() %>% setNames(c("Week", "Score", "Consecutive"))
db3
# Week Score Consecutive
#1 14 34 1
#2 18 21 1
#3 19 45 2
#4 20 32 3
#5 21 56 4
#6 22 NA 5
#7 23 45 1
#8 24 23 2
#9 25 48 3
For your information, this is not going to be part of the main statistical analyses but rather one of several ways I want to use to test the sensitivity of my model. So do not worry about whether it makes sense from a methodological point of view. In addition, if possible, a tidyverse solution would be greatly appreciated.
Thanks so much to anyone who will take the time.
The code is relatively simple if you only want to handle the group with the longest run and, when there is more than one such group, just the first of them:
library(tidyverse)

db1 %>%
  mutate(consecutive = accumulate(diff(Week), .init = 1, ~ if (.y == 1) .x + 1 else 1),
         dummy = max(consecutive) == consecutive & max(consecutive) < 5) %>%
  group_by(grp = cumsum(consecutive == 1)) %>%
  filter(sum(dummy) > 0) %>%  # keep only the group(s) containing the longest run
  ungroup() %>%
  select(-dummy) %>%
  filter(grp == min(grp)) %>% # keep the first such group, if there is more than one
  complete(consecutive = 1:5) %>%
  select(-grp) %>%
  mutate(Week = first(Week) + consecutive - 1)
# A tibble: 5 x 3
consecutive Week Score
<dbl> <dbl> <dbl>
1 1 18 21
2 2 19 45
3 3 20 32
4 4 21 56
5 5 22 NA
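The accumulate() call is what rebuilds the consecutive counter: it walks the week gaps, incrementing while the gap is 1 and resetting to 1 otherwise. In isolation:
library(purrr)
accumulate(diff(db1$Week), .init = 1, ~ if (.y == 1) .x + 1 else 1)
#> [1] 1 1 2 3 4 1 2 3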
OLD ANSWER: another tidyverse strategy (this can be modified to suit your additional column requirements, which you have not given in the sample):
library(tidyverse)
db1
#> Week Score
#> 1 14 34
#> 2 18 21
#> 3 19 45
#> 4 20 32
#> 5 21 56
#> 6 23 45
#> 7 24 23
#> 8 25 48
library(data.table)
db1 %>%
  mutate(consecutive = accumulate(diff(Week), .init = 1, ~ if (.y == 1) .x + 1 else 1),
         dummy = max(consecutive) == consecutive & max(consecutive) < 5,
         dummy2 = rleid(dummy)) %>%
  group_split(dummy2, .keep = F) %>%
  map_if(~ .x$dummy[[1]],
         ~ .x %>%
           complete(consecutive = seq(max(consecutive), 5, 1), fill = list(Week = 1)) %>%
           mutate(Week = cumsum(Week))) %>%
  map_dfr(~ .x %>% select(-dummy))
#> # A tibble: 9 x 3
#> Week Score consecutive
#> <dbl> <dbl> <dbl>
#> 1 14 34 1
#> 2 18 21 1
#> 3 19 45 2
#> 4 20 32 3
#> 5 21 56 4
#> 6 22 NA 5
#> 7 23 45 1
#> 8 24 23 2
#> 9 25 48 3
Created on 2021-06-10 by the reprex package (v2.0.0)
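data.table::rleid() (the reason for the library(data.table) call above) assigns a run id that increments every time the value changes, which is what lets group_split() isolate the run that needs completing:
library(data.table)
rleid(c(FALSE, FALSE, TRUE, TRUE, FALSE))
#> [1] 1 1 2 2 3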
If I understand correctly:
library(data.table)
library(tidyverse)
x1 <- c(14, 18, 19, 20, 21, 23, 24, 25)
y1 <- c(34, 21, 45, 32, 56, 45, 23, 48)
db1 <- cbind(x1, y1) %>% as.data.frame() %>% setNames(c("Week", "Score"))
db1 %>%
  mutate(grp = cumsum(c(0, diff(Week)) > 1)) %>%
  group_by(grp) %>%
  mutate(n_grp = n()) %>%
  ungroup() %>%
  filter(n_grp == max(n_grp, na.rm = TRUE)) %>%
  complete(grp,
           n_grp,
           nesting(Week = seq(from = first(Week), length.out = 5))) %>%
  select(-c(grp, n_grp)) %>%
  rows_upsert(db1, by = c("Week", "Score"))
#> # A tibble: 9 x 2
#> Week Score
#> <dbl> <dbl>
#> 1 18 21
#> 2 19 45
#> 3 20 32
#> 4 21 56
#> 5 22 NA
#> 6 14 34
#> 7 23 45
#> 8 24 23
#> 9 25 48
Created on 2021-06-10 by the reprex package (v2.0.0)
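rows_upsert() updates rows that match on the key columns and inserts those that don't; keyed on both Week and Score here, the completed rows are a no-op update and the remaining db1 rows are appended. A minimal sketch, assuming dplyr >= 1.0:
library(dplyr)
x <- tibble(k = c(1, 2), v = c("a", "b"))
y <- tibble(k = c(2, 3), v = c("B", "c"))
rows_upsert(x, y, by = "k")
#> k: 1, 2, 3; v: "a", "B", "c" (k = 2 updated, k = 3 inserted)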
You can also use the following solution. Midway through, before using add_row to add your additional rows, I filter the whole data set to keep only the groups with the most observations, i.e. the ones with longer runs of consecutive Weeks than the others. After we split by the grouping variable we may end up with 2 or more groups with equally long runs, and then you can choose whichever you like:
library(dplyr)
library(purrr)
library(tibble)
db1 %>%
  mutate(Consecutive = +(Week - lag(Week, default = first(Week)) == 1),
         grp = cumsum(Consecutive == 0)) %>%
  group_by(grp) %>%
  mutate(Consecutive = row_number()) %>%
  group_by(grp, .drop = TRUE) %>%
  add_count() %>%
  ungroup() -> db2 # We create our grouping variable `grp` here

db2 %>%
  filter(n == max(n)) %>%
  group_split(grp) %>%
  map_dfr(~ add_row(.x,
                    Week = .x$Week[.x$n[1]] + seq(1, 5 - .x$n[1], 1),
                    Consecutive = .x$Consecutive[.x$n[1]] + seq(1, 5 - .x$n[1], 1),
                    grp = .x$grp[1])) %>%
  bind_rows(db2 %>%
              filter(n != max(n))) %>%
  select(-c(grp, n)) %>%
  arrange(Week)
# A tibble: 9 x 3
Week Score Consecutive
<dbl> <dbl> <dbl>
1 14 34 1
2 18 21 1
3 19 45 2
4 20 32 3
5 21 56 4
6 22 NA 5
7 23 45 1
8 24 23 2
9 25 48 3