Keeping excluded variables in summarise_at - r

I have a dataframe (this is just a subset of the full frame):
Depth <- seq(0, 2, 0.2)
cps <- sample(48000:52000, 11)
Al <- rnorm(11)
Si <- rnorm(11)
Fe <- rnorm(11)
df <- as_tibble(cbind(Depth, cps, Al, Si, Fe))
When I use mutate_at to perform a function for only chosen variables the final df still contains the variables I chose to exclude. So,
df_norm <- df %>%
mutate_at(vars(-c(Depth, cps)), ~abs(log(./df$cps)))
performs the function on Al, Si, Fe and df_norm is still a 11x5 tibble with Depth and cps being unchanged from df. However, when I do a similar move with summarise_at:
df_mean <- df %>%
summarise_at(vars(-c(Depth, cps)), mean)
the resulting dataframe is only 1x3 instead of 1x5 i.e. it removed Depth and cps instead of just ignoring them for the averaging. Is there a different way I should be writing the vars argument to keep these?
EDIT
I would like my output to be a single observation(vector) with all 5 variables [1,5] at the median Depth value (in this case 1).

In the devel version of dplyr, we can use summarise with across. It is still not clear which values are wanted for 'Depth' and 'cps', so in this first version they are simply collected into list columns
library(dplyr)
df %>%
summarise(across(Al:Fe, mean), across(Depth:cps, list))
# A tibble: 1 x 5
# Al Si Fe Depth cps
# <dbl> <dbl> <dbl> <list> <list>
#1 -0.438 -0.118 -0.590 <dbl [11]> <dbl [11]>
Or to get the first row
df %>%
summarise(across(Al:Fe, mean), across(Depth:cps, first))
# A tibble: 1 x 5
# Al Si Fe Depth cps
# <dbl> <dbl> <dbl> <dbl> <dbl>
#1 -0.438 -0.118 -0.590 0 51432
Or to subset the median element of 'Depth'
df %>%
summarise(across(Al:Fe, mean), across(Depth:cps, ~ .[Depth == median(Depth)]))
# A tibble: 1 x 5
# Al Si Fe Depth cps
# <dbl> <dbl> <dbl> <dbl> <dbl>
#1 -0.438 -0.118 -0.590 1 51753
If we need the first row, then mutate and slice the first row
df %>%
mutate_at(vars(-c(Depth, cps)), mean) %>%
slice(1)
# A tibble: 1 x 5
# Depth cps Al Si Fe
# <dbl> <dbl> <dbl> <dbl> <dbl>
#1 0 51432 -0.438 -0.118 -0.590
Or if it needs to be the median row
df %>%
mutate_at(vars(-c(Depth, cps)), mean) %>%
filter(Depth == median(Depth))
# A tibble: 1 x 5
# Depth cps Al Si Fe
# <dbl> <dbl> <dbl> <dbl> <dbl>
#1 1 51753 -0.438 -0.118 -0.590

Related

for loop in r -looping through vector

I am trying to loop through all the cols in my df and run a prop test on each of them.
library(gss)
To run on just one variable I can use--
infer::prop_test(gss,
college ~ sex,
order = c("female", "male"))
But now I want to run this for each variable in my df like this:
cols <- gss %>% select(-sex) %>% names(.)
for (i in cols){
# print(i)
prop_test(gss,
i~sex)
}
But this loop does not recognize the i;
Error: The response variable `i` cannot be found in this dataframe.
Any suggestions please??
We need to build the formula from the column name, because i ~ sex is taken literally as a variable called i. Either use reformulate
library(gss)
library(infer)
out <- vector('list', length(cols))
names(out) <- cols
for(i in cols) {
out[[i]] <- prop_test(gss, reformulate("sex", response = i))
}
-output
> out
$college
# A tibble: 1 × 6
statistic chisq_df p_value alternative lower_ci upper_ci
<dbl> <dbl> <dbl> <chr> <dbl> <dbl>
1 0.0000204 1 0.996 two.sided -0.0917 0.101
$partyid
# A tibble: 1 × 3
statistic chisq_df p_value
<dbl> <dbl> <dbl>
1 12.9 3 0.00484
$class
# A tibble: 1 × 3
statistic chisq_df p_value
<dbl> <dbl> <dbl>
1 2.54 3 0.467
$finrela
# A tibble: 1 × 3
statistic chisq_df p_value
<dbl> <dbl> <dbl>
1 9.11 5 0.105
or paste
# Build each formula from the column name with paste0() and keep every result.
# A bare call inside `for` is neither auto-printed nor stored, so the results
# are collected in a list named after the response columns.
out <- vector("list", length(cols))
names(out) <- cols
for (i in cols) {
  out[[i]] <- prop_test(gss, as.formula(paste0(i, " ~ sex")))
}
data
library(dplyr)
data(gss)
cols <- gss %>%
select(where(is.factor), -sex, -income) %>%
names(.)

How do I reiterate through multiple variables

I have a sample dataset as below:
Day<-c(1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2)
Group<-c("A","A","A","B","B","B","C","C","C","A","A","A","A","B","B","B","C","C","C")
Rain<-c(4,4,6,5,3,4,5,5,3,6,6,6,5,3,3,3,2,5,2)
UV<-c(6,6,7,8,5,6,5,6,6,6,7,7,8,8,5,6,8,5,7)
dat<-data.frame(Day,Group,Rain,UV)
I want to run a Kruskal Wallis test among 'A','B' and 'C' in "Group" for the variables "Rain" and "UV".
At present, I am subsetting the variables one by one for Kruskal test as below:
dat_Rain<-dat%>%select(c(Day,Group,Rain))
library(rstatix)
library(tidyverse)
dat_Rain%>%
group_by(Day) %>%
kruskal_test(Rain ~ Group)
How do I repeat the Kruskal test for multiple variables (Rain, UV) in this dataset? Thanks.
You can define the columns that you want to apply kruskal_test and use map_df to get all the values in one dataframe.
library(rstatix)
library(tidyverse)
cols <- c('Rain', 'UV')
map_df(cols, ~dat %>% group_by(Day) %>% kruskal_test(reformulate('Group', .x)))
# Day .y. n statistic df p method
# <dbl> <chr> <int> <dbl> <int> <dbl> <chr>
#1 1 Rain 9 0.505 2 0.777 Kruskal-Wallis
#2 2 Rain 10 6.52 2 0.0384 Kruskal-Wallis
#3 1 UV 9 1.16 2 0.56 Kruskal-Wallis
#4 2 UV 10 0.423 2 0.809 Kruskal-Wallis
Using lapply and making use of a helper function this could be achieved like so:
Additionally I made use of bind_rows to bind the resulting list into one data frame.
Day<-c(1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2)
Group<-c("A","A","A","B","B","B","C","C","C","A","A","A","A","B","B","B","C","C","C")
Rain<-c(4,4,6,5,3,4,5,5,3,6,6,6,5,3,3,3,2,5,2)
UV<-c(6,6,7,8,5,6,5,6,6,6,7,7,8,8,5,6,8,5,7)
dat<-data.frame(Day,Group,Rain,UV)
library(rstatix)
library(tidyverse)
# Run a Kruskal-Wallis test of one response column against `Group`,
# separately for each value of `Day`.
#
# x:    name of the response column, as a string (e.g. "Rain")
# data: data frame containing `Day`, `Group` and the column named by `x`
#
# Returns whatever kruskal_test() returns for the grouped data.
kt <- function(x, data) {
  # The response name arrives as a string, so the formula is assembled as text
  test_formula <- as.formula(paste(x, "~ Group"))
  kruskal_test(group_by(data, Day), test_formula)
}
lapply(c("Rain", "UV"), kt, data = dat) %>%
bind_rows()
#> # A tibble: 4 x 7
#> Day .y. n statistic df p method
#> <dbl> <chr> <int> <dbl> <int> <dbl> <chr>
#> 1 1 Rain 9 0.505 2 0.777 Kruskal-Wallis
#> 2 2 Rain 10 6.52 2 0.0384 Kruskal-Wallis
#> 3 1 UV 9 1.16 2 0.56 Kruskal-Wallis
#> 4 2 UV 10 0.423 2 0.809 Kruskal-Wallis

What is the best way to tidily append many columns?

I'm looking to append 30 columns which give values for gamma distributions by using the tidyverse. Here's an example of the data:
data.frame('rank'=1:3,'shape'=c(16,0.2,4),'rate'=c(13,0.4,0.2))
I'd like to use dgamma(1:30,shape,rate) to append 30 columns to the existing dataframe.
You can use map2() in purrr and unnest_wider() in tidyr.
library(tidyverse)
df %>%
mutate(density = map2(shape, rate, dgamma, x = 1:30)) %>%
unnest_wider(density, names_sep = "_")
Or use rowwise() at first and then mutate() with list().
df %>%
rowwise() %>%
mutate(density = list(dgamma(1:30, shape, rate))) %>%
unnest_wider(density, names_sep = "_")
Both of them give
# # A tibble: 3 x 33
# rank shape rate density_1 density_2 density_3 density_4 density_5 density_6 density_7
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 16 13 1.15 0.0852 0.0000843 1.43e-8 9.16e-13 3.19e-17 7.28e-22
# 2 2 0.2 0.4 0.122 0.0468 0.0227 1.21e-2 6.77e- 3 3.92e- 3 2.32e- 3
# 3 3 4 0.2 0.000218 0.00143 0.00395 7.67e-3 1.23e- 2 1.73e- 2 2.26e- 2
# # … with 23 more variables: density_8 <dbl>, density_9 <dbl>, density_10 <dbl>, ..., density_30 <dbl>

Renaming multiple columns from vector in dplyr chain

I have a dataframe that I would like to rename several columns with similar name conventions (e.g., starts with "X") and/or column positions (e.g., 4:7). The new names of the columns are stored in a vector. How do I rename this columns in a dplyr chain?
# data
df <- tibble(RID = 1,Var1 = "A", Var2 = "B",old_name1 =4, old_name2 = 8, old_name3=20)
new_names <- c("new_name1","new_name2","new_name3")
# pseudo code
df %>%
rename_if(starts_with('old_name'), new_names)
An option with rename_at would be
df %>%
rename_at(vars(starts_with('old_name')), ~ new_names)
# A tibble: 1 x 6
# RID Var1 Var2 new_name1 new_name2 new_name3
# <dbl> <chr> <chr> <dbl> <dbl> <dbl>
#1 1.00 A B 4.00 8.00 20.0
But, it is possible to make a function that works with rename_if by creating a logical index on the column names
df %>%
rename_if(grepl("^old_name", names(.)), ~ new_names)
# A tibble: 1 x 6
# RID Var1 Var2 new_name1 new_name2 new_name3
# <dbl> <chr> <chr> <dbl> <dbl> <dbl>
#1 1.00 A B 4.00 8.00 20.0
In general, rename_if applies its predicate to the values of the columns rather than to the column names, i.e.
new_names2 <- c('var1', 'var2')
df %>%
rename_if(is.character, ~ new_names2)
# A tibble: 1 x 6
# RID var1 var2 old_name1 old_name2 old_name3
# <dbl> <chr> <chr> <dbl> <dbl> <dbl>
#1 1.00 A B 4.00 8.00 20.0
Update dplyr 1.0.0
dplyr 1.0.0 adds rename_with() alongside rename(); it takes a function as input. That function can be function(x) return(new_names), or, in other words, the purrr short form ~ new_names used as the rename function.
This makes imho the most elegant dplyr expression.
# shortest & most elegant expression
df %>% rename_with(~ new_names, starts_with('old_name'))
# A tibble: 1 x 6
RID Var1 Var2 new_name1 new_name2 new_name3
<dbl> <chr> <chr> <dbl> <dbl> <dbl>
1 1 A B 4 8 20

weighted mean in dplyr for multiple columns

I'm trying to calculate the weighted mean for multiple columns using dplyr. At the moment I'm stuck with summarize_each, which seems to me to be part of the solution. Here's some example code:
library(dplyr)
f2a <- c(1,0,0,1)
f2b <- c(0,0,0,1)
f2c <- c(1,1,1,1)
clustervar <- c("A","B","B","A")
weight <- c(10,20,30,40)
df <- data.frame (f2a, f2b, f2c, clustervar, weight, stringsAsFactors=FALSE)
df
what I am looking for is something like
df %>%
group_by (clustervar) %>%
summarise_each(funs(weighted.mean(weight)), select=cbind(clustervar, f2a:f2c))
The result of this is only:
# A tibble: 2 × 4
clustervar select4 select5 select6
<chr> <dbl> <dbl> <dbl>
1 A 25 25 25
2 B 25 25 25
What am I missing here?
You can use summarise_at to specify which columns you want to operate on:
# Weighted mean of every column starting with "f2", per cluster.
# A purrr-style lambda replaces funs(), which is deprecated since dplyr 0.8.0.
df %>%
  group_by(clustervar) %>%
  summarise_at(vars(starts_with("f2")), ~ weighted.mean(., weight))
#> # A tibble: 2 × 4
#> clustervar f2a f2b f2c
#> <chr> <dbl> <dbl> <dbl>
#> 1 A 1 0.8 1
#> 2 B 0 0.0 1
We can reshape it to 'long' format and then do this
library(tidyverse)
gather(df, Var, Val, f2a:f2c) %>%
group_by(clustervar, Var) %>%
summarise(wt =weighted.mean(Val, weight)) %>%
spread(Var, wt)
Or another option is
# summarise_each() and funs() are both deprecated; across() selects the
# columns and the lambda computes the weighted mean within each group.
df %>%
  group_by(clustervar) %>%
  summarise(across(matches("^f"), ~ weighted.mean(.x, weight)))
# A tibble: 2 × 4
# clustervar f2a f2b f2c
# <chr> <dbl> <dbl> <dbl>
# 1 A 1 0.8 1
# 2 B 0 0.0 1
Or with summarise_at and matches (another variation of another post - didn't see the other post while posting)
# Same summary, selecting the columns with matches(); the lambda replaces the
# deprecated funs() wrapper.
df %>%
  group_by(clustervar) %>%
  summarise_at(vars(matches("f2")), ~ weighted.mean(., weight))
# A tibble: 2 × 4
# clustervar f2a f2b f2c
# <chr> <dbl> <dbl> <dbl>
#1 A 1 0.8 1
#2 B 0 0.0 1
Or another option is data.table
library(data.table)
setDT(df)[, lapply(.SD, function(x) weighted.mean(x, weight)),
by = clustervar, .SDcols = f2a:f2c]
# clustervar f2a f2b f2c
#1: A 1 0.8 1
#2: B 0 0.0 1
NOTE: All four answers are based on legitimate tidyverse/data.table syntax and would get the expected output
We can also create a function that makes use of the syntax from the devel version of dplyr (soon to be released as 0.6.0). enquo does a job similar to substitute: it takes the input argument and converts it to a quosure. Within group_by/summarise/mutate, we evaluate the quosure by unquoting it (UQ or !!)
# Weighted mean, per group, of every column whose name matches `pat`.
#
# dat:    data frame
# pat:    regular expression handed to matches() to pick the value columns
# wtcol:  unquoted name of the weight column
# grpcol: unquoted name of the grouping column
#
# Returns one row per group holding the weighted mean of each matched column.
wtFun <- function(dat, pat, wtcol, grpcol) {
  # Capture the bare column names as quosures so they can be unquoted below
  wtcol <- enquo(wtcol)
  grpcol <- enquo(grpcol)
  dat %>%
    group_by(!!grpcol) %>%
    # funs() is deprecated since dplyr 0.8.0; across() with a lambda replaces it
    summarise(across(matches(pat), ~ weighted.mean(.x, !!wtcol)))
}
wtFun(df, "f2", weight, clustervar)
# A tibble: 2 × 4
# clustervar f2a f2b f2c
# <chr> <dbl> <dbl> <dbl>
#1 A 1 0.8 1
#2 B 0 0.0 1

Resources