data = data.frame(ID = 1:1000,
GROUP = factor(sample(1:5, rep = T)),
CAT = factor(sample(1:5, rep = T)),
DOG = factor(sample(1:5, rep = T)),
FOX = factor(sample(1:5, rep = T)),
MOUSE = factor(sample(1:5, rep = T)),
WEIGHT = round(runif(1000)*100,0)
)
data_WANT = data.frame(VARS = c("CAT", "DOG", "FOX", "MOUSE", "WEIGHT"),
GROUP1_N = NA,
GROUP1_PROP = NA,
GROUP2_N = NA,
GROUP2_PROP = NA,
GROUP3_N = NA,
GROUP3_PROP = NA,
GROUP4_N = NA,
GROUP4_PROP = NA,
GROUP5_N = NA,
GROUP5_PROP = NA)
I have a dataframe called 'data' and I wish to create a dataframe or datatable that presents the COUNT(_N) of each variable by GROUP and also the weighted proportion (_PROP) for each variable for each group using the variable WEIGHT in the dataframe called 'data'. This is a probability weight that is given to me to get representative estimates.
We can use data.table methods
library(data.table)
dcast(melt(setDT(type.convert(data, as.is = TRUE))[,
c(list(N = .N), lapply(.SD, weighted.mean, WEIGHT)),
GROUP, .SDcols = CAT:MOUSE], id.var = c('GROUP', 'N'),
variable.name = 'Animal'), Animal ~
paste0('GROUP_', GROUP), value.var = c('value', 'N'))
Perhaps, you are trying to do :
library(dplyr)
library(tidyr)
data %>%
type.convert(as.is = TRUE) %>%
group_by(GROUP) %>%
summarise(across(CAT:MOUSE, list(N = ~n(),
PROP = ~weighted.mean(., WEIGHT)))) %>%
pivot_longer(-GROUP,
names_to = c('Animal', 'prop'),
names_sep = '_') %>%
pivot_wider(names_from = c(GROUP, prop), values_from = value,
names_prefix = 'GROUP_')
# A tibble: 4 x 11
# Animal GROUP_1_N GROUP_1_PROP GROUP_2_N GROUP_2_PROP GROUP_3_N
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 CAT 200 5 200 1 200
#2 DOG 200 5 200 2 200
#3 FOX 200 1 200 3 200
#4 MOUSE 200 2 200 1 200
# … with 5 more variables: GROUP_3_PROP <dbl>, GROUP_4_N <dbl>,
# GROUP_4_PROP <dbl>, GROUP_5_N <dbl>, GROUP_5_PROP <dbl>
The pivot_longer and pivot_wider step is to get data in the same format as shown in data_WANT and they are not necessary to perform the calculation.
Related
I have this data frame;(df)
date Name Name_id x1 x2 x3 x4 x5 x6
01/01/2000 00:00 A U_12 1 1 1 1 1 1
01/01/2000 01:00 A U_12
01/01/2000 02:00
01/01/2000 03:00
....
I am trying to calculate the monthly aggregated mean etc. for some columns using lubridate.
what I did so far;
df$date <- dmy_hm(Sites_tot$date)
df$month <- floor_date(df$date,"month")
monthly_avgerage <- df %>%
group_by(Name, Name_id, month) %>%
summarize_at(vars(x1:x4), .funs = c("mean", "min", "max"), na.rm = TRUE)
I can see the values seem okay although some of the months are turned into NAs.
We can modify the summarise_at to
library(dplyr)
df %>%
group_by(Name, Name_id, month) %>%
summarise(across(x1:x4, list(mean = ~ mean(.x, na.rm = TRUE),
min = ~ min(.x, na.rm = TRUE),
max = ~ max(.x, na.rm = TRUE))))
A reproducible example
iris %>%
group_by(Species) %>%
summarise(across(everything(), list(mean = ~ mean(.x, na.rm = TRUE),
min = ~ min(.x, na.rm = TRUE),
max = ~ max(.x, na.rm = TRUE))))
If I am not wrong, the challenge is to get date column into datetime format:
Somehow date = dmy_hm(date) does not work:
library(dplyr)
library(lubridate)
df %>%
mutate(date = dmy_hms(paste0(date, ":00")),
month = month(date)) %>%
group_by(Name, Name_id, month) %>%
summarise(across(x1:x4, list(mean = ~ mean(.x, na.rm = TRUE),
min = ~ min(.x, na.rm = TRUE),
max = ~ max(.x, na.rm = TRUE))), .groups = "drop")
Name Name_id month x1_mean x1_min x1_max x2_mean x2_min x2_max x3_mean x3_min x3_max
<chr> <chr> <dbl> <dbl> <int> <int> <dbl> <int> <int> <dbl> <int> <int>
1 A U_12 1 1.5 1 2 1.5 1 2 1.5 1 2
2 B U_13 1 3.5 3 4 3.5 3 4 3.5 3 4
# … with 3 more variables: x4_mean <dbl>, x4_min <int>, x4_max <int>
# ℹ Use `colnames()` to see all variable names
fake data:
df <- structure(list(date = c("01/01/2000 00:00", "01/01/2000 01:00",
"01/01/2000 02:00", "01/01/2000 03:00"), Name = c("A", "A", "B",
"B"), Name_id = c("U_12", "U_12", "U_13", "U_13"), x1 = 1:4,
x2 = 1:4, x3 = 1:4, x4 = 1:4, x5 = 1:4, x6 = 1:4), class = "data.frame", row.names = c(NA,
-4L))
I have a list of questions, and I want to know how many rows have non-NA values using summarize. I want to use summarize because I'm already using that to calculate the average, which works in the below code. Why does the below code not work and how can I fix it?
library(dplyr)
test <- tibble(student = c("j", "c", "s"),
q1 = c(1, 2, 3),
q2 = c(NA_real_, NA_real_, 4),
q3 = c(43, NA_real_, 232))
test %>%
dplyr::summarise(n = across(starts_with("q"), ~n(.x)),
avg = across(contains("q"), ~ round(mean(.x, na.rm = T), 2)))
expected_outcome <- tibble(n_q1 = 3,
n_q2 = 1,
n_q3 = 2,
avg_q1 = 2,
avg_q2 = 4,
avg_q3 = 138)
library(dplyr)
test %>%
summarize(across(starts_with("q"), list(n = ~sum(!is.na(.)),
avg = ~mean(., na.rm = T)),
.names = "{.fn}_{.col}"))
From the ?across documentation, you can pass a list to the .fns argument:
A list of functions/lambdas, e.g. list(mean = mean, n_miss = ~ sum(is.na(.x))
This will apply every function in that list to the columns you have specified. You can then use the .names argument of across to set the column names how you desire.
Output
n_q1 avg_q1 n_q2 avg_q2 n_q3 avg_q3
<int> <dbl> <int> <dbl> <int> <dbl>
1 3 2 1 4 2 138.
Update: Upps I missed the whole question. sorry: But here is an alternative just for fun: The preferred answer is already given by #LMc:
library(dplyr)
test %>%
summarise(across(starts_with("q"), list(avg = ~mean(., na.rm = T)),
.names = "{.fn}_{.col}")) %>%
bind_cols(test %>% purrr::map_df(~sum(!is.na(.))))
avg_q1 avg_q2 avg_q3 student q1 q2 q3
<dbl> <dbl> <dbl> <int> <int> <int> <int>
1 2 4 138. 3 3 1 2
test %>%
summarise(across(starts_with("q"), list(avg = ~mean(., na.rm = T)),
.names = "{.fn}_{.col}")) %>%
bind_cols(test %>% purrr::map_df(~sum(!is.na(.))))
First not full answer:
To get the non-nas of the whole dataset, we could do this:
library(dplyr)
test %>%
purrr::map_df(~sum(!is.na(.)))
student q1 q2 q3
<int> <int> <int> <int>
1 3 3 1 2
help <- data.frame(
id = c(100, 100, 101, 102, 102),
q1 = c(NA, 1, NA, NA, 3),
q2 = c(1, NA, 2, NA, NA),
q3 = c(NA, 1, NA, 4, NA),
q4 = c(NA, NA, 4, NA, 5),
group = c("a", "b", "c", "a", "c"))
help$group <- as.character(help$group)
I am trying to pivot longer so dataset looks like this:
id score group
100 NA a
100 1 b
100 NA c
...
But I get an error with the numeric values of q1-q4 and the character string group.
pivot_longer(help, !id, names_to = "score",
values_to = "group", values_ptypes = list(group = 'character'))
Error: Can't convert <double> to <character>.
How can I pivot longer but also preserve the group variable (where there is several missing data for the q1-4 there is a match for every id and group)?
library(tidyr)
output <- pivot_longer(help, -c(id, group), names_to = "question",
values_to = "score") %>%
dplyr::select(-question) %>%
dplyr::arrange(id, group)
Output
head(output)
# A tibble: 6 × 3
id group score
<dbl> <chr> <dbl>
1 100 a NA
2 100 a 1
3 100 a NA
4 100 a NA
5 100 b 1
6 100 b NA
I want to use dplyr programming syntax (combine !! and :=) to evaluate a function in .fn argument but failed.
The code like this:
library(zoo)
library(glue)
aa = structure(list(region = c(1, 2, 3, 4), co_mean = c(5, 5, 5, 5
), o3_mean = c(5, 5, 5, 5), pm2.5_mean = c(5, 5, 5, 5)), row.names = c(NA,
-4L), class = c("tbl_df", "tbl", "data.frame"))
for (i in 1:3) {
fun_name_1 = glue('lag{i}')
fun_name_2 = glue('lag0{i}')
aa = aa %>% group_by(region) %>%
mutate(across(.cols = contains('mean'),
.fns = list(!!fun_name_1 := ~lag(., i), # ERROR OCCUR AT HERE
!!fun_name_2 := ~ rollmeanr(., i)),
.names = '{.col}_{.fn}'))
aa
}
I don't know how to solve it.
Any help will be highly appreciated!
======UPDATE========
My new code and new ERROR:
library(zoo)
library(glue)
aa = structure(list(region = c(1, 2, 3, 4), co_mean = c(5, 5, 5, 5
), o3_mean = c(5, 5, 5, 5), pm2.5_mean = c(5, 5, 5, 5)), row.names = c(NA,
-4L), class = c("tbl_df", "tbl", "data.frame"))
for (i in 1:3) {
# i <- 1
fun_name_1 = glue('lag{i}')
fun_name_2 = glue('lag0{i}')
aa %>%
group_by(region) %>%
mutate(across(.cols = contains('mean'),
.fns = setNames(list(~lag(., i),
~ rollmeanr(., i)), c(fun_name_1, fun_name_2)),
.names = '{.col}_{.fn}'))
aa
}
# Error: Problem with `mutate()` input `..1`.
# x 'names' attribute [6] must be the same length as the vector [5]
# i Input `..1` is `across(...)`.
# i The error occurred in group 1: region = 1.
# Run `rlang::last_error()` to see where the error occurred.
It would work as a named list. It makes perfect sense to pass a group by first (assuming that the OP's original example data have multiple rows per group)
i <- 1
fun_name_1 = glue('lag{i}')
fun_name_2 = glue('lag0{i}')
aa %>%
group_by(region) %>%
mutate(across(.cols = contains('mean'),
.fns = setNames(list(~lag(., i),
~ rollmeanr(., i)), c(fun_name_1, fun_name_2)),
.names = '{.col}_{.fn}'))
-output
# A tibble: 4 x 10
# Groups: region [4]
# region co_mean o3_mean pm2.5_mean co_mean_lag1 co_mean_lag01 o3_mean_lag1 o3_mean_lag01 pm2.5_mean_lag1 pm2.5_mean_lag01
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 1 5 5 5 NA 5 NA 5 NA 5
#2 2 5 5 5 NA 5 NA 5 NA 5
#3 3 5 5 5 NA 5 NA 5 NA 5
#4 4 5 5 5 NA 5 NA 5 NA
Could specify the fill = TRUE in rollmean
aa %>%
group_by(region) %>%
mutate(across(.cols = contains('mean'),
.fns = setNames(list(~lag(., i),
~ rollmeanr(., i, fill = TRUE)), c(fun_name_1, fun_name_2)),
.names = '{.col}_{.fn}'))
First I don't think your data should be grouped, at least for the data shared it doesn't make sense to have only 1 row in the group and then calculate lag value and rolling mean on it.
You can have appropriate column names using .names in across and use map_dfc to combine everything into one dataframe.
library(dplyr)
library(purrr)
library(zoo)
map_dfc(1:3, function(x) {
aa %>%
transmute(across(.cols = contains('mean'),
.fns = list(lag = ~lag(., x),
lag0 = ~rollmeanr(., x, fill = NA)),
.names = sprintf('{fn}_{col}_%d', x)))
})
You can add group_by(Region) if you are trying it on some another dataset.
I have an example dataframe below.
example.df <- data.frame(
species = sample(c("primate", "non-primate"), 50, replace = TRUE),
treated = sample(c("Yes", "No"), 50, replace = TRUE),
gender = sample(c("male", "female"), 50, replace = TRUE),
var1 = rnorm(50, 100, 5), resp=rnorm(50, 10,5), value1 = rnorm (50, 25, 5))
I want to group by treated first, and then loop over all numeric variables in the data to perform a dunn test (pairw.kw from the asbio package) using species as the explanatory variable, extract the summary dataframe object and bind the columns from the yes and no sub-lists into a new dataframe object.
I have already obtained a partial solution here and here using a partially "tidy" approach which works quite well. I am just looking for a more elegant tidyverse solution in order to help me learn to be a better R user.
Any help is appreciated.
Edit: This is the output I have from the code in the partially "tidy" solution.
structure(list(var1.Diff = structure(1:2, .Label = c("-7.05229",
"-2.25"), class = "factor"), var1.Lower = structure(1:2, .Label =
c("-13.23198",
"-8.25114"), class = "factor"), var1.Upper = structure(1:2, .Label =
c("-0.87259",
"3.75114"), class = "factor"), var1.Decision = structure(1:2, .Label =
c("Reject H0",
"FTR H0"), class = "factor"), var1.Adj..P.value = structure(1:2, .Label =
c("0.025305",
"0.462433"), class = "factor"), resp.Diff = structure(1:2, .Label =
c("1.10458",
"0"), class = "factor"), resp.Lower = structure(1:2, .Label = c("-5.07512",
"-6.00114"), class = "factor"), resp.Upper = structure(1:2, .Label =
c("7.28427",
"6.00114"), class = "factor"), resp.Decision = structure(c(1L,
1L), .Label = "FTR H0", class = "factor"), resp.Adj..P.value =
structure(1:2, .Label = c("0.726092",
"1"), class = "factor"), effect.Diff = structure(1:2, .Label =
c("-1.27451",
"-0.5625"), class = "factor"), effect.Lower = structure(1:2, .Label =
c("-7.4542",
"-6.56364"), class = "factor"), effect.Upper = structure(1:2, .Label =
c("4.90518",
"5.43864"), class = "factor"), effect.Decision = structure(c(1L,
1L), .Label = "FTR H0", class = "factor"), effect.Adj..P.value =
structure(1:2, .Label = c("0.686047",
"0.85424"), class = "factor")), row.names = c("No", "Yes"), class =
"data.frame")
Update 2022/03/16
The tidyverse has evolved and so should this solution.
library("asbio")
#> Loading required package: tcltk
library("tidyverse")
# It is good practice to set the seed before simulating random data
# Otherwise we can't compare
set.seed(12345)
example.df <-
tibble(
species = sample(c("primate", "non-primate"), 50, replace = TRUE),
treated = sample(c("Yes", "No"), 50, replace = TRUE),
gender = sample(c("male", "female"), 50, replace = TRUE),
var1 = rnorm(50, 100, 5), resp = rnorm(50, 10, 5), value1 = rnorm(50, 25, 5)
) %>%
# Make sure species is a factor as required by `pair.kw`
mutate(
species = factor(species)
)
example.df %>%
# We want to perform the test separately for each group
group_by(
treated
) %>%
group_modify(
# Perform test and extract summary
~ pairw.kw(.x$var1, .x$species, conf = 0.95)$summary
)
#> # A tibble: 2 × 6
#> # Groups: treated [2]
#> treated Diff Lower Upper Decision `Adj. P-value`
#> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 No -2.67949 -8.12299 2.76402 FTR H0 0.334663
#> 2 Yes 4.07071 -2.98047 11.12188 FTR H0 0.257843
It is straightforward to extend this approach to run 6 tests, one for each combination of a treatment group and a response variable (var1, value1 or resp). For example we can convert the tibble from wide format (three response columns) to narrow format (three responses stacked row-wise) and then proceed pretty much as above.
responses <- c("value1", "var1", "resp")
example.df %>%
pivot_longer(
all_of(responses),
names_to = "variable"
) %>%
group_by(
treated, variable
) %>%
group_modify(
~ pairw.kw(.x$value, .x$species, conf = 0.95)$summary
)
#> # A tibble: 6 × 7
#> # Groups: treated, variable [6]
#> treated variable Diff Lower Upper Decision `Adj. P-value`
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 No resp -1.46154 -6.90505 3.98197 FTR H0 0.598725
#> 2 No value1 4.62821 -0.8153 10.07171 FTR H0 0.095632
#> 3 No var1 -2.67949 -8.12299 2.76402 FTR H0 0.334663
#> 4 Yes resp 3.75758 -3.2936 10.80875 FTR H0 0.29627
#> 5 Yes value1 -2.97475 -10.02592 4.07643 FTR H0 0.408311
#> 6 Yes var1 4.07071 -2.98047 11.12188 FTR H0 0.257843
Created on 2022-03-16 by the reprex package (v2.0.1)
Old solution
Here is a tidy approach to running multiple tests simultaneously.
library("asbio")
#> Loading required package: tcltk
library("tidyverse")
# It is good practice to set the seed before simulating random data
# Otherwise can't compare
set.seed(12345)
example.df <- tibble(
species = sample(c("primate", "non-primate"), 50, replace = TRUE),
treated = sample(c("Yes", "No"), 50, replace = TRUE),
gender = sample(c("male", "female"), 50, replace = TRUE),
var1 = rnorm(50, 100, 5), resp=rnorm(50, 10,5), value1 = rnorm (50, 25, 5)) %>%
# Make sure species is a factor as required by `pair.kw`
mutate(species = factor(species))
example.df %>%
# Nest the data for each treatment group
nest(- treated) %>%
# Run the test on each treatment group
mutate(test = map(data, ~ pairw.kw(.$var1, .$species, conf = 0.95))) %>%
# There is no broom::tidy method for objects of class pairw
# but we can extract the summary ourselves
mutate(summary = map(test, pluck, "summary")) %>%
# Cast all the factor columns in the summary table to character, to
# avoid a warning about converting factors to characters.
mutate(summary = map(summary, mutate_if, is.factor, as.character)) %>%
unnest(summary)
#> # A tibble: 2 x 8
#> treated data test Diff Lower Upper Decision `Adj. P-value`
#> <chr> <list> <list> <chr> <chr> <chr> <chr> <chr>
#> 1 No <tibble [2~ <S3: pa~ -1.705~ -7.99~ 4.58~ FTR H0 0.595163
#> 2 Yes <tibble [2~ <S3: pa~ -1.145~ -6.45~ 4.16~ FTR H0 0.672655
It is straightforward to extend this approach to run 6 tests, one for each combination of a treatment group and a response variable (var1, value1 or resp). For example we can convert the tibble from wide format (three response columns) to narrow format (three responses stacked rowwise) and then proceed pretty much as above.
example.df %>%
# From wide format to narrow format
gather(varname, y, value1, var1, resp) %>%
# Nest the data for each treatment group and each variable
nest(- treated, - varname) %>%
# Run 6 tests = (number of treatments) * (number of response variables)
mutate(test = map(data, ~ pairw.kw(.$y, .$species, conf = 0.95))) %>%
# There is no broom::tidy method for objects of class pairw
# but we can extract the summary ourselves
mutate(summary = map(test, pluck, "summary")) %>%
# Cast all the factor columns in the summary table to character, to
# avoid a warning about converting factors to characters.
mutate(summary = map(summary, mutate_if, is.factor, as.character)) %>%
unnest(summary)
#> # A tibble: 6 x 9
#> treated varname data test Diff Lower Upper Decision `Adj. P-value`
#> <chr> <chr> <list> <list> <chr> <chr> <chr> <chr> <chr>
#> 1 No value1 <tibbl~ <S3: ~ 3.127~ -3.1~ 9.41~ FTR H0 0.329969
#> 2 Yes value1 <tibbl~ <S3: ~ -1.33~ -6.65 3.97~ FTR H0 0.622065
#> 3 No var1 <tibbl~ <S3: ~ -1.70~ -7.9~ 4.58~ FTR H0 0.595163
#> 4 Yes var1 <tibbl~ <S3: ~ -1.14~ -6.4~ 4.16~ FTR H0 0.672655
#> 5 No resp <tibbl~ <S3: ~ 1.421~ -4.8~ 7.71~ FTR H0 0.657905
#> 6 Yes resp <tibbl~ <S3: ~ 1.145~ -4.1~ 6.45~ FTR H0 0.672655
Created on 2019-03-04 by the reprex package (v0.2.1)
And what if you want to be flexible about the number and the name of responses? Then specify a list of responses:
responses <- c("value1", "var1", "resp")
And use tidy evaluation in the gather statement as follows:
example.df %>%
# From wide format to narrow format, with tidy evaluation
gather(varname, y, !!!rlang::syms(responses))
# Rest of computation here.....