Sum by logical condition, in a dataset with groups - r

I have a df like this:
I want to transform the continuous Age variable into a discrete one, that is equal a if the original was between 1 and 2, and b if it was betweem 3 and 4. Thus needing to aggregate the values of Value 1 and Value 2 by summing the entries associated with Age=1 + Age=2 and Age=3 + Age=4. The output would be something like this:
The 146 is the sum of the Value1 entry for Age=1 (75) and Age=2 (71).
I thought on using aggregate:
`df2 = df %>% group_by(Sex, Race) %>%
summarise(across(starts_with("Value"), fun))
Where fun would be some function that checks the Age values and sum accordingly. But i'm not much familiar with these dplyr functions and couldn't get it to work. Thanks for the help!
Data:
df = structure(list(Sex = c(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2,
2, 2, 2), Race = c(1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2,
2, 2), Age = c(1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4
), `Value 1` = c(75, 71, 52, 51, 24, 21, 70, 58, 67, 68, 36,
22, 91, 43, 33, 57), `Value 2` = c(22, 22, 49, 1, 20, 18, 34,
0, 27, 37, 31, 83, 29, 24, 10, 99)), row.names = c(NA, -16L), class = c("tbl_df",
"tbl", "data.frame"))

We can use case_when to do the recoding of 'Age' based on the values
library(dplyr)
df %>%
group_by(Sex, Race, Age = case_when(Age %in% 1:2 ~ 'a',
Age %in% 3:4 ~ 'b')) %>%
summarise(across(everything(), sum, na.rm = TRUE), .groups = 'drop')
-output
# A tibble: 8 x 5
# Sex Race Age `Value 1` `Value 2`
#* <dbl> <dbl> <chr> <dbl> <dbl>
#1 1 1 a 146 44
#2 1 1 b 103 50
#3 1 2 a 45 38
#4 1 2 b 128 34
#5 2 1 a 135 64
#6 2 1 b 58 114
#7 2 2 a 134 53
#8 2 2 b 90 109
Based on the OP's comment, if the original data have lots of categories, an easier option is cut or findInterval
df %>%
group_by(Sex, Race, Age = cut(Age, breaks = c(-Inf,
seq(0, 90, by = 5), Inf), labels = letters[1:20])) %>%
summarise(across(everything(), sum, na.rm = TRUE), .groups = 'drop')

Related

R Reshape and Select Max

HAVE = data.frame("WEEK"=c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2),
"STUDENT"=c(1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3),
"CLASS"=c('A', 'A', 'B', 'B', 'C', 'C', 'H', 'A', 'A', 'B', 'B', 'C', 'C', 'H', 'A', 'A', 'B', 'B', 'C', 'C', 'H', 'A', 'A', 'B', 'B', 'C', 'C', 'H', 'A', 'A', 'B', 'B', 'C', 'C', 'H', 'A', 'A', 'B', 'B', 'C', 'C', 'H'),
"TEST"=c(1, 2, 1, 2, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1),
"SCORE"=c(93, 97, 72, 68, 93, 51, 19, 88, 56, 53, 79, 69, 90, 61, 74, 50, 76, 97, 55, 63, 63, 59, 68, 77, 80, 52, 94, 74, 64, 74, 92, 98, 89, 84, 54, 51, 82, 86, 51, 90, 72, 86))
WANT = data.frame("WEEK"=c(1,1,1,2,2,2),
"STUDENT"=c(1,2,3,1,2,3),
"CLASS"=c('A','A','B','B','B','C'),
"H"=c(19,61,63,74,54,86),
"TEST1"=c(93,88,76,77,92,90),
"TEST2"=c(97,56,97,80,98,72))
I wish to group by WEEK and STUDENT and then for each combination of WEEK and STUDENT find the CLASS when SCORE equals to maximum(SCORE) where TEST equals to one. Then I wish to find the corresponding SCORE for TEST equals to 2 using that same CLASS. I wish to transform this into the data WANT from the data HAVE. And ALSO add the COLUMN H where it is just equals to the SCORE when CLASS equals to H
We can reshape to 'wide' with pivot_wider, then grouped by 'WEEK', 'STUDENT', create the 'H' column with 'TEST1' values were 'CLASS' is "H" and then slice the max row for 'TEST1'
library(dplyr)
library(tidyr)
HAVE %>%
pivot_wider(names_from = TEST, values_from = SCORE,
names_glue = "TEST{TEST}") %>%
group_by(WEEK, STUDENT) %>%
mutate(H = TEST1[CLASS == "H"], .before = 'TEST1') %>%
slice_max(n = 1, order_by = TEST1, with_ties = FALSE) %>%
ungroup
-output
# A tibble: 6 × 6
WEEK STUDENT CLASS H TEST1 TEST2
<dbl> <dbl> <chr> <dbl> <dbl> <dbl>
1 1 1 A 19 93 97
2 1 2 A 61 88 56
3 1 3 B 63 76 97
4 2 1 B 74 77 80
5 2 2 B 54 92 98
6 2 3 C 86 90 72
-checking with 'WANT'
> WANT
WEEK STUDENT CLASS H TEST1 TEST2
1 1 1 A 19 93 97
2 1 2 A 61 88 56
3 1 3 B 63 76 97
4 2 1 B 74 77 80
5 2 2 B 54 92 98
6 2 3 C 86 90 72

if previous value in one column and next value in another column meet a condition, add 1 into another column using r

I have data like this
structure(list(id = c(1, 1, 2, 2, 2), time = c(1834, 4809, 18,
333, 387), nh_source = c(0, 0, 1, 0, 0), admi_source = c(19,
19, 85, 19, 88), disdest = c(85, 29, 56, 85, 39)), class = "data.frame", row.names = c(NA,
-5L))
and I want to group the ids and check if the previous value in column disdest is 56 or 85 and the next value in column admisorc is 19, then add 1 to column nh_source column.I want the df to look like this
structure(list(id2 = c(1, 1, 2, 2, 2), time = c(1834, 4809, 18,
333, 387), nh_source2 = c(0, 1, 1, 1, 0), admi_source = c(19,
19, 85, 19, 88), disdest = c(85, 29, 56, 85, 39)), class = "data.frame", row.names = c(NA,
-5L))
Create the logical condition with lag after grouping by 'id' and add it to the 'nh_source' (TRUE -> 1 and FALSE -> 0)
library(dplyr)
df1 %>%
group_by(id) %>%
mutate(nh_source = nh_source +
(admi_source == 19 & lag(disdest) %in% c(56, 85))) %>%
ungroup
-output
# A tibble: 5 x 5
id time nh_source admi_source disdest
1 1 1834 0 19 85
2 1 4809 1 19 29
3 2 18 1 85 56
4 2 333 1 19 85
5 2 387 0 88 39

Replacing NAs in columns with values from rows in a different dataframe in R that have the same ID

I have two dataframes:
deploy.info <- data.frame(Echo_ID = c("20180918_7.5Fa_1", "20180918_Sebre_3", "20190808_Bake_2", "20190808_NH_2"),
uppermost_bin = c(2, 7, 8, 12))
spc <- data.frame(species = c("RS", "GS", "YG", "RR", "BR", "GT", "CB"),
percent_dist = c(0, 25, 80, 100, 98, 60, 100),
percent_dist_from_surf = c(0, 25, 80, 100, 98, 60, 100),
'20180918_7.5Fa_1' = c(1, 1, 1, "NA", "NA", 1, "NA"),
'20180918_Sebre_3' = c(1, 2, "NA", "NA", "NA", 4, "NA"),
'20190808_Bake_2' = c(1, 3, 7, "NA", "NA", 6, "NA"),
'20190808_NH_2' = c(1, 2, 8, "NA", "NA", 6, "NA"))
The last four columns in the spc data frame refer to each Echo_ID that I am dealing with in the deploy.info data frame. I want to replace the NAs in the spc data frame with the uppermost_bin values for each of the Echo_IDs. Does anyone know how to go about doing this?
My desired end product would look like:
i.want.this <- data.frame(species = c("RS", "GS", "YG", "RR", "BR", "GT", "CB"),
percent_dist = c(0, 25, 80, 100, 98, 60, 100),
percent_dist_from_surf = c(0, 25, 80, 100, 98, 60, 100),
'20180918_7.5Fa_1' = c(1, 1, 1, 2, 2, 1, 2),
'20180918_Sebre_3' = c(1, 2, 7, 7, 7, 4, 7),
'20190808_Bake_2' = c(1, 3, 7, 8, 8, 6, 8),
'20190808_NH_2' = c(1, 2, 8, 12, 12, 6, 12))
I have over 100 columns like this and would rather not go in and have to do this change by hand. Any ideas are greatly appreciated.
We can use Map to replace the NA elements in the columns of 'Echo_ID' by the corresponding values of 'uppermost_bin'. In the OP's dataset, the columns were factor, so it was converted to the correct type with type.convert
nm1 <- paste0("X", deploy.info$Echo_ID)
spc <- type.convert(spc, as.is = TRUE)
spc[nm1] <- Map(function(x, y) replace(x, is.na(x), y),
spc[nm1], deploy.info$uppermost_bin)
spc
# species percent_dist percent_dist_from_surf X20180918_7.5Fa_1 X20180918_Sebre_3 X20190808_Bake_2 X20190808_NH_2
#1 RS 0 0 1 1 1 1
#2 GS 25 25 1 2 3 2
#3 YG 80 80 1 7 7 8
#4 RR 100 100 2 7 8 12
#5 BR 98 98 2 7 8 12
#6 GT 60 60 1 4 6 6
#7 CB 100 100 2 7 8 12

Filtering dataframe by grouped replicate in R

I have the following data frame of an experiment with two replicates. I want to filter df based on score == 0 in both replicates for each timestamp & ID.
df <- data.frame(timestamp = c(1, 1, 1, 1, 2, 2, 2, 2),
ID = c(57, 57, 55, 55, 57, 57, 55, 55),
replicate= c(1, 2, 1, 2, 1, 2, 1, 2),
score = c(0, 1, 0, 0, 0, 1, 0, 0))
E.g. the desired output would be:
target <- data.frame(timestamp = c(1, 1, 2, 2),
ID = c(55, 55, 55, 55),
replicate = c(1, 2, 1, 2),
score = c(0, 0, 0, 0))
I've come up with a solution in a double-loop, which is inelegant and most likely inefficient:
tsvec <- df$timestamp %>% unique
idvec <- df$ID %>% unique
df_out <- c()
for(i in seq_along(tsvec)){ # loop along timestamps
innerdat <- df %>% filter(timestamp == tsvec[i])
for(j in seq_along(idvec)){ # loop along IDs
innerdat2 <- innerdat %>% filter(ID == idvec[j])
if(sum(innerdat2$score) == 0){
df_out <- rbind(df_out, innerdat2)
} else {
NULL
}
}
}
Does anybody have a dplyr way of making this more efficient?
library(dplyr)
df %>% group_by(ID) %>% filter(all(score==0))
# A tibble: 4 x 4
# Groups: ID [1]
timestamp ID replicate score
<dbl> <dbl> <dbl> <dbl>
1 1 55 1 0
2 1 55 2 0
3 2 55 1 0
4 2 55 2 0
An approach with data.table
library(data.table)
setDT(df)[, .SD[all(score == 0)], by = ID]

Calculate moving recency-weighted mean in R

I would like to calculate the moving recency-weighted mean finishing positions of a horse given the times (day) and finishing positions (pos) for a sequence of races in which the horse participated. Such statistics are useful in handicapping.
Currently, I am using a "loop-inside-a-loop" approach. Is there a faster or more elegant R-language approach to this problem?
#
# Test data
#
day <- c(0, 6, 10, 17, 21, 26, 29, 31, 34, 38, 41, 47, 48, 51, 61)
pos <- c(3, 5, 6, 1, 1, 3, 4, 1, 2, 2, 2, 6, 4, 5, 6)
testdata <- data.frame(id = 1, day = day, pos = pos, wt.pos = NA)
#
# No weight is given to observations earlier than cutoff
#
cutoff <- 30
#
# Rolling recency-weighted mean (wt.pos)
#
for(i in 2:nrow(testdata)) {
wt <- numeric(i-1)
for(j in 1:(i-1))
wt[j] <- max(0, cutoff - day[i] + day[j] + 1)
if (sum(wt) > 0)
testdata$wt.pos[i] <- sum(pos[1:j] * wt) / sum(wt)
}
> testdata
id day pos wt.pos
1 1 0 3 NA
2 1 6 5 3.000000
3 1 10 6 4.125000
4 1 17 1 4.931034
5 1 21 1 3.520548
6 1 26 3 2.632911
7 1 29 4 2.652174
8 1 31 1 2.954128
9 1 34 2 2.436975
10 1 38 2 2.226891
11 1 41 2 2.119048
12 1 47 6 2.137615
13 1 48 4 3.030534
14 1 51 5 3.303704
15 1 61 6 4.075000
I'd go for
# Calculate `wt` for all values of `i` in one go
wt <- lapply(2:nrow(testdata), function(i)
pmax(0, cutoff - day[i] + day[1:(i-1)] + 1))
# Fill in the column
testdata$wt.pos[-1] <- mapply(
function(i, w) if(sum(w) > 0) sum(pos[1:i]*w)/sum(w) else NA,
1:(nrow(testdata)-1), wt)
Note that by calculating the second argument to max for all values of j at the same time we have vectorized the computation, which improves the speed by many orders of magnitude.
I found no easy way to vectorize the outer loop and the if case though (apart from rewriting it in C which seems like overkill), but lapply, mapply and similar are still faster than for loops.
This version demonstrates how to calculate moving recency-weighted means for 1 or more variables (e.g., finishing position, speed rating, etc.) and 1 or more subjects (horses).
library(plyr)
day <- c(0, 6, 10, 17, 21, 26, 29, 31, 34, 38, 41, 47, 48, 51, 61)
pos <- c(3, 5, 6, 1, 1, 3, 4, 1, 2, 2, 2, 6, 4, 5, 6)
dis <- 100 + 0.5 * (pos - 1)
testdata1 <- data.frame(id = 1, day = day, pos = pos, dis = dis)
day <- c(0, 4, 7, 14, 22, 23, 31, 38, 42, 47, 52, 59, 68, 69, 79)
pos <- c(1, 3, 2, 6, 4, 5, 2, 1, 4, 5, 2, 1, 5, 5, 2)
dis <- 100 + 0.5 * (pos - 1)
testdata2 <- data.frame(id = 2, day = day, pos = pos, dis = dis)
testdata <- rbind(testdata1, testdata2)
# Moving recency-weighted mean
rollmean <- function(day, obs, cutoff = 90) {
obs <- as.matrix(obs)
wt <- lapply(2:nrow(obs), function(i)
pmax(0, cutoff - day[i] + day[1:(i-1)] + 1))
wt.obs <- lapply(1:(nrow(obs)-1), FUN =
function(i)
if(sum(wt[[i]]) > 0) {
apply(obs[1:i, , drop = F] * wt[[i]], 2, sum) / sum(wt[[i]])
} else {
rep(NA, ncol(obs))
}
)
answer <- rbind(rep(NA, ncol(obs)), do.call(rbind, wt.obs))
if (!is.null(dimnames(answer)))
dimnames(answer)[[2]] <- paste("wt", dimnames(answer)[[2]], sep = ".")
return(answer)
}
x <- dlply(testdata, .(id), .fun =
function(DF) rollmean(DF$day, DF[, c("pos", "dis"), drop = F])
)
y <- do.call(rbind, x)

Resources