Select a group of n elements by the position of the max in a data frame in R, by group

I have a dataframe with group, value, and columns based on the rollapply mean of the last n values, like this:
library(dplyr); library(zoo)
df = data.frame(group = c(rep(1,5), rep(2,5)),
                value = c(23,14,53,12,56,32,65,76,36,74)) %>%
  group_by(group) %>%
  mutate(
    roll1 = rollapplyr(value, 1, mean, fill = NA, na.rm = T, partial = F),
    roll2 = rollapplyr(value, 2, mean, fill = NA, na.rm = T, partial = F),
    roll3 = rollapplyr(value, 3, mean, fill = NA, na.rm = T, partial = F)
  )
df
group value roll1 roll2 roll3
1 1 23 23 NA NA
2 1 14 14 18.5 NA
3 1 53 53 33.5 30
4 1 12 12 32.5 26.3
5 1 56 56 34 40.3
6 2 32 32 NA NA
7 2 65 65 48.5 NA
8 2 76 76 70.5 57.7
9 2 36 36 56 59
10 2 74 74 55 62
The 'rolln' column represents the average of the last n values.
Then I would like to summarize, in a new dataframe, which group of values provided the highest average. Remember that the roll3 column, for example, corresponds to a set of 3 values.
I tried to use the which.max function, but without success. The position of the NAs in the final data frame isn't important.
Thanks in advance
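For orientation, the basic which.max idea mentioned in the question, find the row where a rolling mean peaks and then take the n values ending there, could be sketched like this for roll3 alone (a sketch only; the answer below generalises the idea to all three columns):
df %>%
  group_by(group) %>%
  summarise(end = which.max(roll3),                 # row where the 3-value average peaks (NAs are skipped)
            best_avg = roll3[end],
            values = list(value[(end - 2):end]))    # the 3 values behind that average, as a list-column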

I'd love to see a more concise solution, but this seems to work:
library(tidyverse)
df %>%
  pivot_longer(starts_with("roll"), values_to = "avg") %>%
  filter(!is.na(avg)) %>%
  group_by(group, name) %>%
  filter(slider::slide_dbl(avg, max, .after = 2) == max(avg)) %>% # EDIT #2
  # filter(avg == max(avg) |
  #          lead(avg, default = 0) == max(avg) |
  #          lead(avg, 2, default = 0) == max(avg)) %>%
  mutate(items = n() + 1 - parse_number(name)) %>% # EDIT
  slice(items:n()) %>%
  mutate(row = row_number()) %>%
  select(-avg, -items) %>%
  pivot_wider(names_from = name, values_from = value)
Result
group row roll1 roll2 roll3
<dbl> <int> <dbl> <dbl> <dbl>
1 1 1 56 12 53
2 1 2 NA 56 12
3 1 3 NA NA 56
4 2 1 76 65 76
5 2 2 NA 76 36
6 2 3 NA NA 74
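The key step above is the slider line: slide_dbl(avg, max, .after = 2) computes a forward-looking rolling maximum over windows of up to three elements, so a row is kept only when the group's maximum average occurs at that row or within the next two rows. A minimal standalone illustration, using group 1's values:
library(slider)
x <- c(23, 14, 53, 12, 56)
slide_dbl(x, max, .after = 2)  # max over each element and the two that follow
#> [1] 53 53 56 56 56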

Related

Restructure data - get mean based on multiple grouping columns

library(tidyverse)
set.seed(10)
dat <- data.frame(age = sample(14:79, size = 15, replace = TRUE),
                  sex = sample(c("m", "f"), size = 15, replace = TRUE),
                  region = sample(c("A", "B", "C"), size = 15, replace = TRUE),
                  var1 = runif(15, min = 0, max = 100)) %>%
  mutate(agegrp = cut(age, breaks = c(-Inf, 20, 50, 70, Inf), labels = c("<= 20", "21-50", "51-70", ">70")))
The table looks like this:
age sex region var1 agegrp
1 79 m A 64.669975 >70
2 78 m C 92.789062 >70
3 23 m A 69.626845 21-50
4 25 m C 5.074013 21-50
5 60 f C 10.340510 51-70
6 36 f B 90.294240 21-50
7 23 m A 12.769088 21-50
8 27 f A 43.892321 21-50
9 35 f B 99.793467 21-50
10 40 f C 94.284903 21-50
11 25 m A 98.829001 21-50
12 55 m A 98.007185 51-70
13 43 f A 37.491168 21-50
14 68 m A 90.051414 51-70
15 76 f B 13.567239 >70
Unfortunately, our customer needs the data to be in a pretty weird format like this:
split value var1_mean
1 agegrp 21-50 61.3
2 agegrp 51-70 66.1
3 agegrp >70 57.0
4 sex m 55.7
5 sex f 66.5
6 region A 64.4
7 region B 67.9
8 region C 50.6
I can easily do this using the following code, but it is very inelegant:
age <- dat %>% group_by(agegrp) %>%
  summarise(var1_mean = mean(var1)) %>%
  mutate(value = agegrp,
         split = "agegrp") %>%
  select(split, value, var1_mean)
sex <- dat %>% group_by(sex) %>%
  summarise(var1_mean = mean(var1)) %>%
  mutate(value = sex,
         split = "sex") %>%
  select(split, value, var1_mean)
region <- dat %>% group_by(region) %>%
  summarise(var1_mean = mean(var1)) %>%
  mutate(value = region,
         split = "region") %>%
  select(split, value, var1_mean)
rbind(age, sex, region)
Is there a way to make this easier without "stacking" several tables manually (maybe using dplyr)?
You can pivot to long and summarise or, alternatively, iterate over the vars of interest:
library(dplyr)
library(purrr)
library(tidyr)
dat %>%
  pivot_longer(-c(var1, age), names_to = "split") %>%
  group_by(split, value) %>%
  summarise(var1_mean = mean(var1))
Or:
map_df(set_names(c("agegrp", "sex", "region")), ~ dat %>%
         group_by(across(.x)) %>%
         summarise(var1_mean = mean(var1)) %>%
         rename(value = .x), .id = "split")
# A tibble: 8 × 3
split value var1_mean
<chr> <chr> <dbl>
1 agegrp <= 20 54.6
2 agegrp 21-50 44.7
3 agegrp 51-70 46.4
4 sex f 37.4
5 sex m 55.5
6 region A 67.3
7 region B 47.9
8 region C 26.1
Loop through columns and aggregate, then rowbind the results:
do.call(rbind,
        lapply(c("agegrp", "sex", "region"), function(i){
          cbind(split = i,
                setNames(aggregate(as.formula(paste("age ~", i)), mean, data = dat),
                         c("value", "var1_mean")))
        }))
# split value var1_mean
# 1 agegrp <= 20 20.00000
# 2 agegrp 21-50 33.30000
# 3 agegrp 51-70 60.50000
# 4 sex f 39.62500
# 5 sex m 39.71429
# 6 region A 35.75000
# 7 region B 39.00000
# 8 region C 43.60000
Note: the numbers differ from the question's expected table because the printed example data do not match what set.seed(10) actually generates, and because this call aggregates age rather than var1.
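To get the var1 means the question actually asks for, only the left-hand side of the formula needs to change (output not shown here, since it depends on the seeded data):
do.call(rbind,
        lapply(c("agegrp", "sex", "region"), function(i){
          cbind(split = i,
                setNames(aggregate(as.formula(paste("var1 ~", i)), mean, data = dat),
                         c("value", "var1_mean")))
        }))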

R rowwise replace the first instance of the minimum

How can I do the following:
1. replace all values < 6 with NA;
2. if there is only one NA in the row, replace the first instance of the minimum value with -99?
Some data that includes an ID variable and a total column:
library(tidyverse)
df <- data.frame(id = c(1,2,3,4,5),
                 a = c(10,12,4,17,3),
                 b = c(9,12,3,20,6),
                 c = c(2,2,10,10,10),
                 d = c(12,16,12,10,12))
df$total <- apply(df[,c(2:5)], 1, sum)
Giving
id a b c d total
1 10 9 2 12 33
2 12 12 2 16 42
3 4 3 10 12 29
4 17 20 10 10 57
5 3 6 10 12 31
My desired output is
id a b c d total
1 10 -99 NA 12 33
2 -99 12 NA 16 42
3 NA NA 10 12 29
4 17 20 10 10 57
5 NA -99 10 12 31
My attempt
df_mod <- df %>%
  # Make <6 NA
  mutate(
    across(
      .cols = 'a':'total',
      ~case_when(
        .x < 6 ~ as.numeric(NA),
        TRUE ~ .x
      )
    )
  ) %>%
  # Add a count of NAs
  rowwise() %>%
  mutate(Count_NA = sum(is.na(cur_data()))) %>%
  ungroup()
# Transpose and get row minimum
df_mod2 <- t(df_mod[, -c(1, ncol(df_mod))]) %>%
  apply(., 2, function(a){
    min <- min(a, na.rm = TRUE)
  }) %>%
  cbind(df_mod, .) %>%
  rename(., min = .) %>%
  tibble(.)
# If count_NA = 1 replace the first instance of min
df_mod2 %>%
  rowwise() %>%
  mutate(
    across(
      .cols = 'a':'total',
      ~case_when(
        Count_NA == 1 & .x == min ~ replace(.x, first(match(min, .x)), -99),
        TRUE ~ .x)
    )
  ) %>%
  select(-'Count_NA', -'min')
Which gives the following (note that in row 2 both instances of the minimum were replaced, not just the first):
id a b c d total
1 10 -99 NA 12 33
2 -99 -99 NA 16 42
3 NA NA 10 12 29
4 17 20 10 10 57
5 NA -99 10 12 31
Thanks
If you're willing to pivot rather than work rowwise, then this solution will work.
library(dplyr)
library(tidyr)
df %>%
  pivot_longer(names_to = 'col',
               values_to = 'val',
               -c(id, total)) %>%
  group_by(id) %>%
  mutate(val2 = rank(val, ties.method = 'first'),
         val = ifelse(val < 6, NA, val),
         val = ifelse(sum(is.na(val)) == 1 & val2 == 2, -99, val)) %>%
  select(-val2) %>%
  pivot_wider(names_from = col,
              values_from = val) %>%
  relocate(total, .after = "d")
Here's the result:
# A tibble: 5 × 6
# Groups: id [5]
id a b c d total
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 10 -99 NA 12 33
2 2 -99 12 NA 16 42
3 3 NA NA 10 12 29
4 4 17 20 10 10 57
5 5 NA -99 10 12 31
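This picks only the first instance because rank() is computed on the original values with ties.method = 'first': under the one-NA condition, the remaining minimum is the original second-smallest value, and ties.method = 'first' guarantees that only its first occurrence receives rank 2. Row 2's values illustrate the tie handling:
rank(c(12, 12, 2, 16), ties.method = "first")
#> [1] 2 3 1 4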
It is not clear what you mean by the '2nd' minimum value, since it is the minimum that gets replaced. You can use data.table:
library(data.table)
cols <- setdiff(names(df), "id")
setDT(df)[
  ,
  (cols) := transpose(
    lapply(
      transpose(lapply(.SD, function(x) fifelse(x < 6, NA_real_, x))),
      function(x) if (sum(is.na(x)) == 1) replace(x, which.min(x), -99) else x
    )
  ),
  .SDcols = cols
]
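A note on the double transpose(): the first transpose() turns the list of columns into a list of rows, so the one-NA check and which.min() run per row; the second transpose() turns the result back into columns. For intuition, on a tiny two-column list:
library(data.table)
transpose(list(a = c(10, 12), b = c(9, 12)))
#> [[1]]
#> [1] 10  9
#>
#> [[2]]
#> [1] 12 12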

Make this dot plot in R

I am attempting to make the image shown here using R code. Is there something better than a dot plot for this type of figure? Basically I want to show the means and variability between two groups for 6 variables. Sample data is provided below. Thanks a million!
id var1 var2 var3 var4 group
1 12 3 22 60 1
2 8 6 19 55 1
3 25 30 70 26 2
4 26 31 75 31 2
5 22 29 80 29 2
Here is an example that should get you started:
library(tidyverse)
df %>%
  gather(variable, v, -id, -group) %>%
  group_by(group, variable) %>%
  summarise(value = mean(v), value.sd = sd(v)) %>%
  ungroup() %>%
  mutate(
    variable = as.factor(variable),
    group = as.factor(group)) %>%
  ggplot(aes(x = value, y = variable, shape = group)) +
  geom_point(size = 4) +
  geom_segment(aes(x = value - value.sd, xend = value + value.sd, yend = variable))
Sample data
df <- read.table(text =
"id var1 var2 var3 var4 group
1 12 3 22 60 1
2 8 6 19 55 1
3 25 30 70 26 2
4 26 31 75 31 2
5 22 29 80 29 2", header = T)
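A side note: gather() is superseded in current tidyr. The reshaping step above can be written equivalently with pivot_longer() (a sketch, assuming tidyr >= 1.0):
df %>%
  # same reshape as gather(variable, v, -id, -group)
  pivot_longer(cols = -c(id, group), names_to = "variable", values_to = "v")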

Wide to long data.frame merging two pairs (key/value) of columns

I have this data.frame
set.seed(28100)
label_1 <- sample(c('first_col','second_col'), 10, replace = T)
dat <- data.frame(label_1,
                  value_1 = sample(1:100, 10, replace = T),
                  label_2 = sapply(label_1, FUN = function(x) ifelse(x == 'first_col', 'second_col', 'first_col')),
                  value_2 = sample(1:100, 10, replace = T))
head(dat)
label_1 value_1 label_2 value_2
1 first_col 88 second_col 84
2 first_col 40 second_col 30
3 first_col 98 second_col 32
4 second_col 80 first_col 64
5 first_col 34 second_col 43
6 second_col 52 first_col 10
which has an inconsistent ordering of two pairs of key/value columns. I want to reshape the same data into a long format data.frame, such as:
desired_dat <- data.frame(first_col = rep(NA, 10),
second_col = rep(NA, 10))
Would reshape2 or tidyr be the suggested tools for this problem? How exactly?
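Since the question asks about tidyr specifically, here is one hedged sketch using pivot_longer()'s .value specification (assumes tidyr >= 1.0; the answers below show dplyr, data.table and base R approaches):
library(dplyr)
library(tidyr)
dat %>%
  mutate(id = row_number()) %>%
  # label_1/value_1 and label_2/value_2 become label/value pairs
  pivot_longer(-id, names_to = c(".value", "set"), names_sep = "_") %>%
  pivot_wider(id_cols = id, names_from = label, values_from = value) %>%
  select(first_col, second_col)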
How about just using dplyr (no need for tidyr etc)?
library(dplyr)
dat %>% transmute(first_col = if_else(label_1 == "first_col", value_1, value_2),
                  second_col = if_else(label_2 == "second_col", value_2, value_1))
#> first_col second_col
#> 1 88 84
#> 2 40 30
#> 3 98 32
#> 4 64 80
#> 5 34 43
#> 6 10 52
#> 7 23 85
#> 8 65 86
#> 9 4 35
#> 10 83 8
This is basically @SymbolixAU's solution (the data.table answer below), just translated to dplyr:
library(dplyr)
library(tidyr)
# Create an ID for each row: probably not necessary but useful to check
dat <- dat %>%
  mutate(id = row_number())
dat_long <- bind_rows(
  dat %>% select(id, label = label_1, value = value_1),
  dat %>% select(id, label = label_2, value = value_2)
)
output <- dat_long %>%
  spread(label, value)
I would do this using data.table, although the same principles can be applied to the tidyverse as well.
library(data.table)
## Setting as a data.table, and adding an 'id' value to keep track of rows
setDT(dat)
dat[, id := .I]
## then 'rbinding' the _1 and _2 columns together, with common column names
dat2 <- rbindlist(
  list(
    dat[, .(id, label = label_1, value = value_1)],
    dat[, .(id, label = label_2, value = value_2)]
  )
)
## the reshaping from long to wide to give you your desired result
dcast(dat2, formula = id ~ label)
# id first_col second_col
# 1: 1 88 84
# 2: 2 40 30
# 3: 3 98 32
# 4: 4 64 80
# 5: 5 34 43
# 6: 6 10 52
# 7: 7 23 85
# 8: 8 65 86
# 9: 9 4 35
# 10: 10 83 8
As of version v1.9.6 (on CRAN 19 Sep 2015), data.table can melt() multiple columns simultaneously. So this goes in one chain of data.table expressions:
library(data.table)
as.data.table(dat)[, rn := .I][
, melt(.SD, measure.vars = patterns("label", "value"))][
, dcast(.SD, rn ~ value1)][, -"rn"]
first_col second_col
1: 88 84
2: 40 30
3: 98 32
4: 64 80
5: 34 43
6: 10 52
7: 23 85
8: 65 86
9: 4 35
10: 83 8
This is a possible solution, but not the most elegant.
myFun <- function(label1, value1, label2, value2, which_label) {
  return(ifelse(label1 == which_label, value1, value2))
}
desired_dat <-
  data.frame(first_col = mapply(FUN = myFun, dat$label_1, dat$value_1, dat$label_2, dat$value_2,
                                MoreArgs = list(which_label = 'first_col'), SIMPLIFY = TRUE),
             second_col = mapply(FUN = myFun, dat$label_1, dat$value_1, dat$label_2, dat$value_2,
                                 MoreArgs = list(which_label = 'second_col'), SIMPLIFY = TRUE))
head(desired_dat)
first_col second_col
1 88 84
2 40 30
3 98 32
4 64 80
5 34 43
6 10 52

Manipulate Values of a Column Based on Another Column

I have a dataframe that looks as follows:
df <- read.table(header = TRUE, text =
"STUD_ID MEAS VALUE
1 LVEF(M-M) 69
1 LVEF(2D) 66
2 LVEF(2D) 36
2 LVEF(2D) 72
2 IVSD 63
3 LVEF(M-M) 50
4 LVEF(2D) 71
4 PASP 55", stringsAsFactors = FALSE)
I want to focus on MEAS values that are either LVEF(M-M) or LVEF(2D).
If a STUD_ID has VALUEs for both LVEF(M-M) and LVEF(2D), then we keep only the latter, i.e. LVEF(2D).
If a STUD_ID has 2 VALUEs for LVEF(2D), then we take their mean.
My desired output is:
# STUD_ID MEAS VALUE
# 1 LVEF(2D) 66
# 2 LVEF(2D) 54
# 2 IVSD 63
# 3 LVEF(M-M) 50
# 4 LVEF(2D) 71
# 4 PASP 55
I tried the following but it gave me an error:
df %>%
  filter(MEAS == "LVEF(M-M)" | MEAS == "LVEF(2D)") %>%
  arrange(STUD_ID, MEAS) %>%
  group_by(STUD_ID, MEAS) %>%
  mutate(n = n()) %>%
  group_by(STUD_ID) %>%
  mutate(nd = n_distinct(MEAS)) %>%
  mutate(VALUE =
           case_when(nd == 2 ~ VALUE[which(MEAS == "LVEF(2D)")],
                     nd == 1 & n > 1 ~ mean(VALUE),
                     TRUE ~ VALUE)) %>%
  bind_rows(
    df %>% filter(MEAS != "LVEF(M-M)" & MEAS != "LVEF(2D)")
  )
My dataframe contains other variables apart from these 3 variables, and I wish to retain them.
This does what you are looking for...
df2 <- df %>% group_by(STUD_ID, MEAS) %>%
  summarise(VALUE = mean(VALUE)) %>%
  group_by(STUD_ID) %>% mutate(TEMP2D = ("LVEF(2D)" %in% MEAS)) %>%
  filter(!(MEAS == "LVEF(M-M)" & TEMP2D)) %>%
  select(-TEMP2D)
df2
# A tibble: 6 x 3
# Groups: STUD_ID [4]
STUD_ID MEAS VALUE
<int> <chr> <dbl>
1 1 LVEF(2D) 66
2 2 IVSD 63
3 2 LVEF(2D) 54
4 3 LVEF(M-M) 50
5 4 LVEF(2D) 71
6 4 PASP 55
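The question also notes other columns that should be retained, which summarise() drops. One hedged variation, assuming any extra columns are constant within each STUD_ID/MEAS pair, keeps the first row of each pair instead of summarising:
df %>%
  group_by(STUD_ID, MEAS) %>%
  mutate(VALUE = mean(VALUE)) %>%   # average duplicate LVEF(2D) rows
  slice(1) %>%                      # keep one row per STUD_ID/MEAS pair, retaining the other columns
  group_by(STUD_ID) %>%
  filter(!(MEAS == "LVEF(M-M)" & "LVEF(2D)" %in% MEAS)) %>%
  ungroup()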
