I have a data frame named df with five columns:
age <- c(10,11,12,12,10,11,11,12,10,11,12)
time <- c(20,26,41,60,29,28,54,24,59,70,25)
weight <- c(123,330,445,145,67,167,190,104,209,146,201)
gender <- c(1,1,2,2,2,2,1,2,2,2,1)
Q2 <- c(112,119,114,120,121,117,116,114,121,122,124)
df <- data.frame(age, weight, time, gender, Q2)
What I want is to apply weighted.mean() to time within each age group, using two conditions: 1) gender == 2 and 2) Q2 >= 114 & Q2 <= 121.
With the code below I can apply weighted.mean(), but I do not know how to incorporate the two conditions.
df1 <- df %>%
  group_by(age) %>%
  summarise(weighted_time = weighted.mean(time, weight))
Is the following what you are looking for?
library(tidyverse)
age <- c(10,11,12,12,10,11,11,12,10,11,12)
time <- c(20,26,41,60,29,28,54,24,59,70,25)
weight <- c(123,330,445,145,67,167,190,104,209,146,201)
gender <- c(1,1,2,2,2,2,1,2,2,2,1)
Q2 <- c(112,119,114,120,121,117,116,114,121,122,124)
df <- data.frame(age, weight, time, gender, Q2)
df %>%
  group_by(age) %>%
  filter(gender == 2 & Q2 >= 114 & Q2 <= 121) %>%
  summarise(weighted_time = weighted.mean(time, weight), .groups = "drop")
#> # A tibble: 3 × 2
#> age weighted_time
#> <dbl> <dbl>
#> 1 10 51.7
#> 2 11 28
#> 3 12 42.4
You can add a filter for those two (really three) conditions before grouping:
df %>%
  filter(gender == 2 & Q2 >= 114 & Q2 <= 121) %>%
  group_by(age) %>%
  summarise(weighted_time = weighted.mean(time, weight))
This gives
# A tibble: 3 x 2
age weighted_time
<dbl> <dbl>
1 10 51.7
2 11 28
3 12 42.4
A data.table alternative:
age <- c(10,11,12,12,10,11,11,12,10,11,12)
time <- c(20,26,41,60,29,28,54,24,59,70,25)
weight <- c(123,330,445,145,67,167,190,104,209,146,201)
gender <- c(1,1,2,2,2,2,1,2,2,2,1)
Q2 <- c(112,119,114,120,121,117,116,114,121,122,124)
df <- data.frame(age, weight, time, gender, Q2)
library(data.table)
setDT(df)[gender == 2 & Q2 >= 114 & Q2 <= 121,
          list(res = weighted.mean(time, weight)),
          by = age][order(age)]
#> age res
#> 1: 10 51.71739
#> 2: 11 28.00000
#> 3: 12 42.42219
Created on 2021-12-10 by the reprex package (v2.0.1)
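If you would rather keep every age in the output, even an age where no rows meet the conditions, a hedged alternative (just a sketch, assuming the original data frame df) is to subset inside summarise() instead of filtering rows first; ages with no qualifying rows come back as NaN rather than being dropped:
df %>%
  group_by(age) %>%
  summarise(weighted_time = weighted.mean(
    time[gender == 2 & Q2 >= 114 & Q2 <= 121],    # values meeting the conditions
    weight[gender == 2 & Q2 >= 114 & Q2 <= 121]   # matching weights
  ))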
I have a grouped data frame which I want to summarise into "count of values less than x, y, z by group". I can manually generate the wide data frame I want using code like the example below:
library(tidyverse)
set.seed(1337)
df <- data.frame(cbind(group = seq(1:5), num = sample(x = 1:400, size = 100, replace = T)))
manual <- df %>%
  group_by(group) %>%
  summarise(less_than_50 = sum(num < 50),
            less_than_100 = sum(num < 100),
            less_than_150 = sum(num < 150))
However, I'd like to be able to define a list of "less thans" and generate these columns by referring to that list. I've done something similar in the past, using enframe(quantile()) to generate a long list of quantiles before pivoting:
pc <- c(0.1, 0.5, 0.9)
quantiles <- df %>%
  group_by(group) %>%
  summarise(enframe(quantile(num, pc))) %>%
  pivot_wider(
    id_cols = group,
    names_from = name,
    values_from = value
  )
But I don't understand how to define a custom function within enframe(). Ideally I'd like to apply this in something like the code below (though this obviously doesn't work), with or without the pivot step, in order to get back to the same output as "manual":
levels <- c(50, 100, 150)
programmatic <- df %>%
group_by(group) %>%
summarise(cols = ("less_than", x), num < levels) %>%
pivot...
Any help greatly appreciated
One way you could do it:
library(tidyverse)
set.seed(1337)
df <- data.frame(cbind(group = seq(1:5), num = sample(x = 1:400, size = 100, replace = T)))
less_than <- function(x) {
  df %>%
    group_by(group) %>%
    summarise(less_than_ = sum(num < x)) %>%
    rename_with(~ str_c(., x), .cols = -group)
}

levels <- c(50, 100, 150)

map_dfr(levels, less_than) |>
  group_by(group) |>
  summarise(across(everything(), mean, na.rm = TRUE))
#> # A tibble: 5 × 4
#> group less_than_50 less_than_100 less_than_150
#> <int> <dbl> <dbl> <dbl>
#> 1 1 4 5 10
#> 2 2 2 2 5
#> 3 3 2 6 11
#> 4 4 4 5 5
#> 5 5 1 7 9
# Manual result for comparison
df %>%
group_by(group) %>%
summarise(less_than_50 = sum(num < 50),
less_than_100 = sum(num < 100),
less_than_150 = sum(num < 150))
#> # A tibble: 5 × 4
#> group less_than_50 less_than_100 less_than_150
#> <int> <int> <int> <int>
#> 1 1 4 5 10
#> 2 2 2 2 5
#> 3 3 2 6 11
#> 4 4 4 5 5
#> 5 5 1 7 9
Created on 2022-06-06 by the reprex package (v2.0.1)
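An alternative sketch (assuming dplyr >= 1.0 and the same df and levels; count_fns is just an illustrative name) builds a named list of counting functions and hands it to across(), which avoids the averaging step used above to recombine the pieces:
library(tidyverse)

levels <- c(50, 100, 150)

# one counting function per threshold; force() pins each x in its closure
count_fns <- set_names(
  lapply(levels, function(x) { force(x); function(v) sum(v < x) }),
  paste0("less_than_", levels)
)

df %>%
  group_by(group) %>%
  summarise(across(num, count_fns, .names = "{.fn}"))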
I would like to perform multiple pairwise t-tests on a dataset containing about 400 different column variables and 3 subject groups, and extract p-values for every comparison. A shorter, representative example of the data, using only 2 variables, could be the following:
df <- tibble(var1 = rnorm(90, 1, 1), var2 = rnorm(90, 1.5, 1), group = rep(1:3, each = 30))
Ideally the end result will be a summarised data frame containing four columns: one for the variable being tested (var1, var2, etc.), two for the groups being compared each time, and a final one for the p-value.
I've tried duplicating the group column in the long form and doing a double group_by in order to do the comparisons, but with no result:
result <- df %>%
  pivot_longer(var1:var2, "var", "value") %>%
  rename(group_a = group) %>%
  mutate(group_b = group_a) %>%
  group_by(group_a, group_b) %>%
  summarise(n = n())
We can reshape the data into 'long' format with pivot_longer, then, grouped by 'group', apply pairwise.t.test, extract the list elements, transform them into a tibble with tidy (from broom), and unnest the list column:
library(dplyr)
library(tidyr)
library(broom)
df %>%
  pivot_longer(cols = -group, names_to = 'grp') %>%
  group_by(group) %>%
  summarise(out = list(pairwise.t.test(value, grp) %>% tidy)) %>%
  unnest(c(out))
Output:
# A tibble: 3 x 4
group group1 group2 p.value
<int> <chr> <chr> <dbl>
1 1 var2 var1 0.0760
2 2 var2 var1 0.0233
3 3 var2 var1 0.000244
In case you end up wanting more information about the t-tests, here is an approach that also extracts details such as the degrees of freedom and the value of the test statistic:
library(dplyr)
library(tidyr)
library(purrr)
library(broom)
df <- tibble(
var1 = rnorm(90, 1, 1),
var2 = rnorm(90, 1.5, 1),
group = rep(1:3, each = 30)
)
df %>%
  select(-group) %>%
  names() %>%
  map_dfr(~ {
    y <- .
    combn(3, 2) %>%
      t() %>%
      as.data.frame() %>%
      pmap_dfr(function(V1, V2) {
        df %>%
          select(group, all_of(y)) %>%
          filter(group %in% c(V1, V2)) %>%
          t.test(as.formula(sprintf("%s ~ group", y)), ., var.equal = TRUE) %>%
          tidy() %>%
          transmute(y = y,
                    group_1 = V1,
                    group_2 = V2,
                    df = parameter,
                    t_value = statistic,
                    p_value = p.value)
      })
  })
#> # A tibble: 6 x 6
#> y group_1 group_2 df t_value p_value
#> <chr> <int> <int> <dbl> <dbl> <dbl>
#> 1 var1 1 2 58 -0.337 0.737
#> 2 var1 1 3 58 -1.35 0.183
#> 3 var1 2 3 58 -1.06 0.295
#> 4 var2 1 2 58 -0.152 0.879
#> 5 var2 1 3 58 1.72 0.0908
#> 6 var2 2 3 58 1.67 0.100
And here is #akrun's answer tweaked to give the same p-values as the above approach. Note the p.adjust.method = "none", which gives unadjusted, independent t-tests and will inflate your Type I error rate.
df %>%
  pivot_longer(
    cols = -group,
    names_to = "y"
  ) %>%
  group_by(y) %>%
  summarise(
    out = list(
      tidy(
        pairwise.t.test(
          value,
          group,
          p.adjust.method = "none",
          pool.sd = FALSE
        )
      )
    )
  ) %>%
  unnest(c(out))
#> # A tibble: 6 x 4
#> y group1 group2 p.value
#> <chr> <chr> <chr> <dbl>
#> 1 var1 2 1 0.737
#> 2 var1 3 1 0.183
#> 3 var1 3 2 0.295
#> 4 var2 2 1 0.879
#> 5 var2 3 1 0.0909
#> 6 var2 3 2 0.100
Created on 2021-07-30 by the reprex package (v1.0.0)
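If you do later want to control the family-wise error rate, one small follow-up (a sketch, assuming the combined results above are stored in a tibble called res with a p_value column; res is a hypothetical name) is to adjust the collected p-values afterwards:
# res: hypothetical tibble collecting the per-comparison rows shown above
res %>%
  mutate(p_holm = p.adjust(p_value, method = "holm"))  # or "BH", "bonferroni", etc.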
I am trying to find a way to rename my factor levels (1, 2, 3) with girl, boy, other in the dplyr tibble output.
This is the code:
library(dplyr)
df1 %>%
  dplyr::group_by(sex) %>%
  dplyr::summarise(percent = 100 * n() / nrow(df1), n = n())
And my result is:
# A tibble: 3 x 3
sexs percent n
<int> <dbl> <int>
1 1 52.1 731
2 2 47.1 661
3 NA 0.855 12
The desired result would be:
# A tibble: 3 x 3
sexs percent n
<int> <dbl> <int>
Girl 1 52.1 731
Boy 2 47.1 661
Other NA 0.855 12
I happen to love the forcats package because when I'm done I can actually see what I did. Here is another solution that simply adds to the pipe before your existing code.
library(dplyr)
library(forcats)
sex <- sample(1:2, 100, replace = TRUE)
sex[[88]] <- NA
df1 <- data.frame(sex)
df1 %>%
  mutate(newsex = fct_explicit_na(fct_recode(as_factor(sex),
                                             Girl = "1",
                                             Boy = "2"),
                                  na_level = "Other")) %>%
  group_by(newsex, sex) %>%
  summarise(percent = 100 * n() / nrow(df1), n = n())
#> # A tibble: 3 x 4
#> # Groups: newsex [3]
#> newsex sex percent n
#> <fct> <int> <dbl> <int>
#> 1 Girl 1 56 56
#> 2 Boy 2 43 43
#> 3 Other NA 1 1
Created on 2020-05-11 by the reprex package (v0.3.0)
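A small side note: in more recent forcats releases (1.0.0 and later, if I remember correctly) fct_explicit_na() has been deprecated in favour of fct_na_value_to_level(). A sketch of the same pipeline with the replacement, assuming a new enough forcats:
# assumes forcats >= 1.0.0, where fct_na_value_to_level() replaces fct_explicit_na()
df1 %>%
  mutate(newsex = fct_na_value_to_level(
    fct_recode(as_factor(sex), Girl = "1", Boy = "2"),
    level = "Other"
  )) %>%
  group_by(newsex, sex) %>%
  summarise(percent = 100 * n() / nrow(df1), n = n())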
When posting, please provide some sample data to work with; it helps others test and make sure everything is working properly. This problem is relatively simple, so that shouldn't be an issue.
If you want to replace the NA values with any other number, you can do this:
df1 %>%
  dplyr::mutate(sex = ifelse(is.na(sex), 0, sex),
                sex = factor(sex,
                             levels = c(1, 2, 0),
                             labels = c("Girl", "Boy", "Other"))) %>%
  dplyr::group_by(sex) %>%
  dplyr::summarise(percent = 100 * n() / nrow(df1), n = n())
Otherwise you can use case_when to assign the labels and then convert the column to a factor:
df1 %>%
  dplyr::mutate(sex = case_when(
    sex == 1 ~ "Girl",
    sex == 2 ~ "Boy",
    is.na(sex) ~ "Other") %>%
      as_factor(.)) %>%
  dplyr::group_by(sex) %>%
  dplyr::summarise(percent = 100 * n() / nrow(df1), n = n())
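One caveat with the case_when() version: as_factor() on a character vector orders the levels by first appearance, so differently sorted data would give a different level order. A sketch that pins the order explicitly (assuming the same df1):
# case_when() returns character, so set the level order explicitly
df1 %>%
  dplyr::mutate(sex = factor(dplyr::case_when(
    sex == 1 ~ "Girl",
    sex == 2 ~ "Boy",
    is.na(sex) ~ "Other"
  ), levels = c("Girl", "Boy", "Other"))) %>%
  dplyr::group_by(sex) %>%
  dplyr::summarise(percent = 100 * n() / nrow(df1), n = n())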
My data is something like this:
group <- c(21, 21, 21, 9, 9, 9, 25, 25, 25)
a <- c(8,3,5,6,8,3,3,9,3)
b <- c(4,9,0,1,3,5,6,1,1)
c <- c(1,7,2,5,6,8,4,8,6)
value <- c(23,34,43,52,65,21,12,89,76)
df <- data.frame(group,a,b,c,value)
I applied the following code to it:
out <- df %>%
  select(group, a, b, value) %>%
  group_by(group = gl(n() / 3, 3)) %>%
  summarise(res = mean(value), a = a[1], b = b[1])
print(out)
I then get the following result:
group res a b
<fct> <dbl> <dbl> <dbl>
1 1 33.3 8 4
2 2 46 6 1
3 3 59 3 6
My question is how to keep the original values of group in the output df, like this:
group res a b
<fct> <dbl> <dbl> <dbl>
1 21 33.3 8 4
2 9 46 6 1
3 25 59 3 6
Thanks in advance!
The issue is that you are overwriting your group variable in the group_by call, hence you are not getting the original values. You need to use some other name in group_by and then do the calculations.
We can use one of two options:
1) With summarise
library(dplyr)
df %>%
  group_by(group1 = gl(n() / 3, 3)) %>%
  summarise(res = mean(value), a = a[1], b = b[1], group = group[1])
# group1 res a b group
# <fct> <dbl> <dbl> <dbl> <dbl>
#1 1 33.3 8 4 21
#2 2 46 6 1 9
#3 3 59 3 6 25
2) With mutate
df %>%
  select(group, a, b, value) %>%
  group_by(group1 = gl(n() / 3, 3)) %>%
  mutate(res = mean(value), a = a[1], b = b[1]) %>%
  slice(1)
In both cases, if you are no longer interested in keeping the grouping variable, add ungroup() %>% select(-group1) to remove it, as in the sketch below.
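For example, continuing from option 1 (a sketch, assuming the same df):
df %>%
  group_by(group1 = gl(n() / 3, 3)) %>%
  summarise(res = mean(value), a = a[1], b = b[1], group = group[1]) %>%
  ungroup() %>%     # a no-op after summarise() here, but harmless
  select(-group1)   # drop the helper grouping variable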
First generating some sample data:
doy <- rep(1:365,times=2)
year <- rep(2000:2001,each=365)
set.seed(1)
value <-runif(min=0,max=10,365*2)
doy.range <- c(40,50,60,80)
thres <- 200
df <- data.frame(cbind(doy,year,value))
What I want to do is the following:
For df$year == 2000, starting from doy.range == 40, start adding up df$value and find the df$doy at which the cumulative sum of df$value is >= thres.
Here's my long for loop to achieve this:
# create a matrix to store results
mat <- matrix(, nrow = length(doy.range) * length(unique(year)), ncol = 3)
mat[, 1] <- rep(unique(year), each = 4)
mat[, 2] <- rep(doy.range, times = 2)

for (i in unique(df$year)) {
  dat <- df[df$year == i, ]
  for (j in doy.range) {
    dat1 <- dat[dat$doy >= j, ]
    dat1$cum.sum <- cumsum(dat1$value)
    # doy of the year at which the cumulative sum of value first reaches thres
    day.thres <- dat1[dat1$cum.sum >= thres, "doy"][1]
    mat[mat[, 2] == j & mat[, 1] == i, 3] <- day.thres
  }
}
This loop gives me, in the third column of my matrix, the doy at which the cumulative sum of value exceeded thres.
However, I really want to avoid the loops. Is there any way I can do it using less code?
If I understand correctly, you can use dplyr. Assume a threshold of 200:
library(dplyr)
df %>%
  group_by(year) %>%
  filter(doy >= 40) %>%
  mutate(CumSum = cumsum(value)) %>%
  filter(CumSum >= 200) %>%
  top_n(n = -1, wt = CumSum)
which yields
# A tibble: 2 x 4
# Groups: year [2]
doy year value CumSum
<dbl> <dbl> <dbl> <dbl>
1 78 2000 3.899895 201.4864
2 75 2001 9.205178 204.3171
The verbs used are self-explanatory I guess. If not, let me know.
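As an aside, top_n() has since been superseded in dplyr; the same last step can be written with slice_min() (a sketch, assuming dplyr >= 1.0.0):
df %>%
  group_by(year) %>%
  filter(doy >= 40) %>%
  mutate(CumSum = cumsum(value)) %>%
  filter(CumSum >= 200) %>%
  slice_min(CumSum, n = 1)   # first row at or above the threshold per year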
For different doy values, create a function and use lapply:
f <- function(doy.range) {
  df %>%
    group_by(year) %>%
    filter(doy >= doy.range) %>%
    mutate(CumSum = cumsum(value)) %>%
    filter(CumSum >= 200) %>%
    top_n(n = -1, wt = CumSum)
}
lapply(doy.range, f)
[[1]]
# A tibble: 2 x 4
# Groups: year [2]
doy year value CumSum
<dbl> <dbl> <dbl> <dbl>
1 78 2000 3.899895 201.4864
2 75 2001 9.205178 204.3171
[[2]]
# A tibble: 2 x 4
# Groups: year [2]
doy year value CumSum
<dbl> <dbl> <dbl> <dbl>
1 89 2000 2.454885 200.2998
2 91 2001 6.578281 200.6544
[[3]]
# A tibble: 2 x 4
# Groups: year [2]
doy year value CumSum
<dbl> <dbl> <dbl> <dbl>
1 98 2000 4.100841 200.5048
2 102 2001 7.158333 200.3770
[[4]]
# A tibble: 2 x 4
# Groups: year [2]
doy year value CumSum
<dbl> <dbl> <dbl> <dbl>
1 120 2000 6.401010 204.9951
2 120 2001 5.884192 200.8252
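If you prefer a single data frame rather than a list, a small extension (a sketch, assuming f and doy.range as defined above, with dplyr still loaded) names the list and binds it, keeping the starting doy as an id column:
lapply(doy.range, f) %>%
  setNames(doy.range) %>%          # name each element by its starting doy
  bind_rows(.id = "doy_start")     # stack into one data frame with an id column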
The idea is to create a function that, based on a given (starting) doy and threshold, gets you the relevant info. Then apply this function to different combinations of starting doys and thresholds and get back a dataset with all the relevant info:
# create example data
doy <- rep(1:365,times=2)
year <- rep(2000:2001,each=365)
set.seed(1)
value <-runif(min=0,max=10,365*2)
df <- data.frame(doy,year,value)
library(dplyr)
library(purrr)
# function (inputs: dr for doy range and t for threshold)
f <- function(dr, t) {
  df %>%
    filter(doy >= dr) %>%                        # keep rows from the given doy onward
    group_by(year) %>%                           # for each year
    mutate(CumSumValue = cumsum(value)) %>%      # get the cumulative sum of value
    filter(CumSumValue >= t) %>%                 # keep rows at or above the given threshold
    slice(1) %>%                                 # keep the first row
    ungroup() %>%                                # forget the grouping
    select(-value) %>%                           # remove the unnecessary variable
    mutate(doy_input = dr, thres_input = t) %>%  # add the input info as columns
    select(doy_input, thres_input, year, doy, CumSumValue)  # rearrange columns
}
# input doy and threshold
doy.range <- c(40,50,60,80)
thres <- 200
# map those vectors to the function
map2_df(doy.range, thres, f)
# # A tibble: 8 x 5
# doy_input thres_input year doy CumSumValue
# <dbl> <dbl> <int> <int> <dbl>
# 1 40 200 2000 78 201.4864
# 2 40 200 2001 75 204.3171
# 3 50 200 2000 89 200.2998
# 4 50 200 2001 91 200.6544
# 5 60 200 2000 98 200.5048
# 6 60 200 2001 102 200.3770
# 7 80 200 2000 120 204.9951
# 8 80 200 2001 120 200.8252
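If you want every combination of several starting days and several thresholds rather than a single threshold, one way to extend this (a sketch, assuming the f defined above; the extra threshold value 150 is purely illustrative) is to expand the full grid first and map over both columns:
library(tidyr)

grid <- crossing(dr = doy.range, t = c(150, 200))  # all dr/t combinations
map2_df(grid$dr, grid$t, f)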