Include column names as function input with dplyr - r

I often have to transform long tidy data frames into wide format. To do so I use the following standard procedure:
# Example data frame
df <- data.frame("ID" = rep(1:5, each = 4), "score" = runif(20, 0, 100), "location" = rep(c("a", "b", "c", "d"), 5))
# Transform into wide format
df_wide <- df %>%
group_by_at(vars(-score)) %>% # group by everything other than the value column.
mutate(row_id=1:n()) %>% ungroup() %>% # build group index
spread(key=location, value=score) %>% # spread
dplyr::select(-row_id)
Instead of typing this little script over and over again, I wanted to define a function to do it automatically. I found many useful posts on how to include column names as function inputs, but somehow it doesn't work or I get error messages. What am I doing wrong?
Below are a few of my attempts (none of them work), following these and this suggestion:
# Attempt 1: wrap `key_name` / `value_name` in sym() and unquote them with !!
# (presumably the column names are meant to be passed as strings -- no call is shown).
# NOTE(review): `dat` is never referenced in the body. The pipeline starts
# directly at group_by_at(), so the vars(...) specification is passed as that
# call's first argument instead of the data frame.
wide_fun <- function(dat, key_name, value_name) {
group_by_at(vars(- !! sym(value_name))) %>% # group by everything other than the value column.
mutate(row_id=1:n()) %>% ungroup() %>% # build group index
spread(key=!! sym(key_name), value=!! sym(value_name)) %>% # spread
dplyr::select(-row_id)
}
# Attempt 2: capture the `key_name` / `value_name` arguments with enquo() and
# unquote the captured expressions with !! inside the verbs.
# NOTE(review): `dat` is never referenced in the body. The pipeline starts
# directly at group_by_at(), so the vars(...) specification is passed as that
# call's first argument instead of the data frame.
wide_fun2 <- function(dat, key_name, value_name) {
key_col <- enquo(key_name)
value_col <- enquo(value_name)
group_by_at(vars(- !!value_col)) %>% # group by everything other than the value column.
mutate(row_id=1:n()) %>% ungroup() %>% # build group index
spread(key= !!key_col, value= !!value_col) %>% # spread
dplyr::select(-row_id)
}
# Attempt 3: pass `key_name` / `value_name` straight through to the verbs
# without any quoting or unquoting inside the function.
# NOTE(review): `dat` is never referenced in the body. The pipeline starts
# directly at group_by_at(), so the vars(...) specification is passed as that
# call's first argument instead of the data frame.
wide_fun3 <- function(dat, key_name, value_name) {
group_by_at(vars(- value_name)) %>% # group by everything other than the value column.
mutate(row_id=1:n()) %>% ungroup() %>% # build group index
spread(key=key_name, value=value_name) %>% # spread
dplyr::select(-row_id)
}
wide_fun3(df, quote(location), quote(score))
Thanks for your help!

I've slightly updated your code to dplyr 1.0.0 and tidyr. Then you can make use of the new dplyr programming feature {{}} to specify variables that are arguments of a function.
# Example data frame
df <- data.frame("ID" = rep(1:5, each = 4), "score" = runif(20, 0, 100), "location" = rep(c("a", "b", "c", "d"), 5))
library(dplyr)
# Reshape a long data frame into wide format.
#   .data      - long-format data frame
#   key_name   - unquoted column whose values become the new column names
#   value_name - unquoted column holding the values to spread out
# Returns the widened data with the helper index column removed.
wide_fun <- function(.data, key_name, value_name) {
  # Number the rows within each combination of all non-value columns so that
  # every row stays uniquely identified while pivoting.
  indexed <- .data %>%
    group_by(across(-{{ value_name }})) %>%
    mutate(row_id = row_number()) %>%
    ungroup()

  widened <- tidyr::pivot_wider(
    indexed,
    names_from = {{ key_name }},
    values_from = {{ value_name }}
  )

  # The index was only needed to keep rows apart during the pivot.
  select(widened, -row_id)
}
wide_fun(df, location, score)
#> # A tibble: 5 x 5
#> ID a b c d
#> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 1 90.8 38.9 28.7 39.0
#> 2 2 94.5 24.9 84.6 54.6
#> 3 3 61.1 97.2 12.2 57.7
#> 4 4 52.7 85.6 41.4 100.
#> 5 5 17.8 86.1 92.3 33.7
Created on 2020-09-11 by the reprex package (v0.3.0)
Edit
This function should also work with older versions of dplyr:
library(dplyr)
# Reshape a long data frame into wide format without relying on `{{ }}`.
#   .data      - long-format data frame
#   key_name   - column whose values become the new column names
#   value_name - column holding the values to spread out
# Returns the widened data with the helper index column removed.
wide_fun_2 <- function(.data, key_name, value_name) {
  # Capture both column arguments as symbols once, then unquote where needed.
  key_sym <- ensym(key_name)
  value_sym <- ensym(value_name)

  # Number the rows within each combination of all non-value columns.
  indexed <- .data %>%
    group_by_at(vars(-!!value_sym)) %>%
    mutate(row_id = seq_len(n())) %>%
    ungroup()

  indexed %>%
    tidyr::pivot_wider(names_from = !!key_sym, values_from = !!value_sym) %>%
    select(-row_id)
}
df %>%
wide_fun_2(location, score)
A tibble: 5 x 5
ID a b c d
<int> <dbl> <dbl> <dbl> <dbl>
1 1 72.2 81.4 52.5 48.8
2 2 36.1 27.5 82.2 73.0
3 3 83.9 68.2 80.9 15.7
4 4 0.451 70.0 18.5 43.2
5 5 82.6 68.2 22.8 63.0
If you just provide the argument that specifies the column, you only need to deal with symbols and not quosures, therefore you need to use ensym.

Related

Row bind multiple columns into two columns averaging unique elements in order

I'm still learning R and was wondering if there is an elegant way of manipulating the df below to obtain df2.
I'm not sure if it's a loop that is supposed to be used for this, but basically I want to take a distinct on each V(X)_ID and average its associated V(X)_No columns.
V1_ID <- c('AUD','CAD','NZD','USD',NA,NA,NA)
V1_No <- c(3.43,2.42,1.58,9.9,NA,NA,NA)
V2_ID <- c('TRY','AUD','EUR','SPY','TRY','BTC',NA)
V2_No <- c(8.4,2.4,6.8,1.2,9.8,9.8,NA)
V3_ID <- c('JPY','EUR','NZD','AUD','SPY','NA',NA)
V3_No <- c(1.8,8.6,4.4,2.1,9.6,NA,NA)
V4_ID <- c('GBP','TRY','HKD','SKD','USD','NZD','CAD')
V4_No <- c(1.3,4.6,7.9,8.5,2.6,7.4,9.1)
df <- data.frame(V1_ID,V1_No,V2_ID,V2_No,V3_ID,V3_No,V4_ID,V4_No)
ID <- c('AUD','CAD','NZD','USD','TRY','EUR','SPY','BTC','JPY','GBP','SKD')
No <- c(2.643,5.76,4.46,6.25,7.6,8.6,5.4,9.8,1.8,1.3,8.5)
df2 <- data.frame(ID,No)
Your assistance is much appreciated as I have hundreds of these types of columns in this type of format and approaching it from a manual standpoint is very draining.
Thanks
A tidyverse solution that pivots to get all values in correct columns first:
library(tidyverse)
# Stack every V<k>_ID / V<k>_No pair into two columns (ID, No), then average
# No per ID.
df |>
  rownames_to_column() |>
  pivot_longer(
    -rowname,
    names_to = c("run", "metric"),
    values_to = "val",
    # \\d+ accepts multi-digit run numbers (V10_ID, V123_No, ...); a single "."
    # would only match V1..V9 and break with hundreds of column pairs.
    names_pattern = "^V(\\d+)_(ID|No)$",
    # ID is character and No is numeric, so unify the type before stacking.
    values_transform = as.character
  ) |>
  pivot_wider(id_cols = c(run, rowname), names_from = metric, values_from = val) |>
  filter(!is.na(No)) |>
  group_by(ID) |>
  summarise(No = mean(as.double(No)))
#> # A tibble: 12 × 2
#> ID No
#> <chr> <dbl>
#> 1 AUD 2.64
#> 2 BTC 9.8
#> 3 CAD 5.76
#> 4 EUR 7.7
#> 5 GBP 1.3
#> 6 HKD 7.9
#> 7 JPY 1.8
#> 8 NZD 4.46
#> 9 SKD 8.5
#> 10 SPY 5.4
#> 11 TRY 7.6
#> 12 USD 6.25
One way to do this using some functions from the tidyverse:
library(tidyverse)
paste0("V", 1:4) %>%
  # Split into a list of two-column data frames so that bind_rows() is easier.
  # The "_" is appended so that "V1" selects only V1_ID / V1_No and not
  # V10_ID, V11_No, ... once there are ten or more column pairs.
  map(~ select(df, starts_with(paste0(.x, "_")))) %>%
  # Rename by position so bind_rows() stacks everything into two columns.
  map(~ rename(.x, v_id = 1, v_no = 2)) %>%
  bind_rows() %>%
  drop_na() %>% # remove missing values
  group_by(v_id) %>%
  summarize(v_no = mean(v_no)) # calculate average by v_id
Since you say you have hundreds of these columns, you'll have to change paste0("V", 1:4) to match the number of columns you have. E.g., if you have 200 of them, you would just write paste0("V", 1:200).

How to combine function argument with group_by in R

I would like to use group_by( ) function with my customised function but the column names that goes within group_by would be defined in my function argument.
See a hypothetical example of what my data would look like:
data <- data.frame(ind = rep(c("A", "B", "C"), 4),
gender = rep(c("F", "M"), each = 6),
value = sample(1:100, 12))
And this is the result I would like to have:
result <- data %>%
group_by(ind, gender) %>%
mutate(value = mean(value)) %>%
distinct()
This is how I was trying to make my function to work:
# Asker's attempt: group `data` by the columns named in `set_group`, replace
# every column listed in `variable` with its mean (NAs removed), and drop
# duplicate rows.
# NOTE(review): get(set_group) looks up an object by name rather than selecting
# columns; the example call passes a length-2 vector -- presumably this is why
# the grouping does not work as intended.
myFunction <- function(data, set_group, variable){
result <- data %>%
group_by(get(set_group)) %>%
mutate(across(all_of(variable), ~ mean(.x, na.rm = TRUE))) %>%
distinct()
}
result3 <- myFunction(data, set_group = c("ind", "gender"), variable = c("value"))
result3
I want to allow the user to define as many set_group columns and as many variable columns as needed. I tried using the get( ), all_of( ) and mget( ) functions within group_by, but none of them worked.
Does anyone know how can I code it?
Thank you!
We could use across within group_by
# Replace each column in `variable` with its per-group mean and keep one row
# per distinct combination.
#   data      - input data frame
#   set_group - character vector of grouping column names
#   variable  - character vector of column names to average (NAs removed)
myFunction <- function(data, set_group, variable) {
  grouped <- group_by(data, across(all_of(set_group)))
  averaged <- mutate(
    grouped,
    across(all_of(variable), ~ mean(.x, na.rm = TRUE))
  )
  # Drop the grouping before de-duplicating the rows.
  distinct(ungroup(averaged))
}
-testing
> myFunction(data, set_group = c("ind", "gender"), variable = c("value"))
# A tibble: 6 × 3
ind gender value
<chr> <chr> <dbl>
1 A F 43.5
2 B F 87.5
3 C F 67.5
4 A M 13
5 B M 43.5
6 C M 37.5
Another option is to convert to symbols and evaluate (!!!)
# Same contract as the across() variant, but the grouping columns are turned
# into symbols and spliced into group_by().
#   data      - input data frame
#   set_group - character vector of grouping column names
#   variable  - character vector of column names to average (NAs removed)
myFunction <- function(data, set_group, variable) {
  group_syms <- rlang::syms(set_group)

  data %>%
    group_by(!!!group_syms) %>%
    mutate(across(all_of(variable), ~ mean(.x, na.rm = TRUE))) %>%
    ungroup() %>%
    distinct()
}
-testing
> myFunction(data, set_group = c("ind", "gender"), variable = c("value"))
# A tibble: 6 × 3
ind gender value
<chr> <chr> <dbl>
1 A F 43.5
2 B F 87.5
3 C F 67.5
4 A M 13
5 B M 43.5
6 C M 37.5
NOTE: get is used when there is a single object, for multiple objects mget can be used. But, it is better to use tidyverse functions

How to calculate the mean and sd of FPKM gene counts by group and combine the mean and sd into one data frame?

Luckily, the first step of calculating the mean and sd by group is done, so I now have the mean and the sd results separately. What I want to know is how to combine them. The combination method itself may be easy or difficult, but the combined data frame should be simple rather than complicated.
Here I will show you my calculation method and the only combination method I know. I need another combination method, please. My sample data and code are below:
data<-data.frame(matrix(sample(1:1000,500),20,25))
names(data) <- c(paste0("Gene_", 1:25))
rownames(data)<-NULL
data$Name<-c(rep(paste0("Group_",1:10),each=2))
unique(data$Name)
## 1 group_by, only get one result each time
mm <- data %>%
group_by(data$Name) %>%
summarise(mean=mean(Gene_1))
mm
## 2 tapply, can get the mean of each column , but only one column each time.
mm <- data.frame(mean_Gene_1=tapply(data[,"Gene_1"],data$Name,mean))
mm
## 3.aggregate, a powerful function , can get all the columns result by group.
mm <- aggregate(.~Name,data,mean)
mm
## get the mean and sd dataframe.
mean <- aggregate(.~Name,data,mean)
sd <- aggregate(.~Name,data,sd)
## now combine the two dataframe usingt the same index "Name" and "gene"
## I just learned one method from somebody in Stack overflow.
## combine the two file
data <- bind_rows(list(mean = mean, sd = sd), .id = "stat")
data_mean_sd <- data %>%
pivot_longer(-c(Name, stat), names_to = "Gene", values_to = "value") %>%
pivot_wider(names_from = "stat", values_from = "value")
The result is correct. However, the real file is large (this is only an example) and the result includes a lot of duplicated data. I hope somebody can give me a better method to simplify my result.
I need your help.
I am not sure, would the approach below work for you? The last part is basically the same using pivot_longer and pivot_wider, but for the summarise part I used dplyr::across.
library(dplyr)
library(tidyr)
data<-data.frame(matrix(sample(1:1000,500),20,25))
names(data) <- c(paste0("Gene_", 1:25))
rownames(data)<-NULL
data$Name<-c(rep(paste0("Group_",1:10),each=2))
data %>%
group_by(Name) %>%
summarise(across(everything(),
list(mean = ~ mean(.x),
sd = ~ sd(.x)),
.names = "{col}__{fn}")) %>%
pivot_longer(-c(Name), names_to = "Gene", values_to = "value") %>%
separate(., Gene, into = c("Gene", "Stats"), sep = "__") %>%
pivot_wider(names_from = Stats, values_from = "value")
#> `summarise()` ungrouping output (override with `.groups` argument)
#> # A tibble: 250 x 4
#> Name Gene mean sd
#> <chr> <chr> <dbl> <dbl>
#> 1 Group_1 Gene_1 534. 556.
#> 2 Group_1 Gene_2 294. 51.6
#> 3 Group_1 Gene_3 262. 350.
#> 4 Group_1 Gene_4 615 338.
#> 5 Group_1 Gene_5 89 43.8
#> 6 Group_1 Gene_6 322 263.
#> 7 Group_1 Gene_7 696. 391.
#> 8 Group_1 Gene_8 182. 101.
#> 9 Group_1 Gene_9 582 139.
#> 10 Group_1 Gene_10 184 2.83
#> # ... with 240 more rows
Created on 2021-01-27 by the reprex package (v0.3.0)

R: pivot_wider() to extend a dataframe

I am having some issues translating a dataframe into wide format using pivot_wider. My dataframe looks like this:
Data <- read.table(header = T, text = "
ID A B C D
1 6.01764 0.00409222 0.000500143 101.816
1 6.01769 0.00431931 0.000565946 101.334
1 6.01774 0.00454617 0.00063163 101.923
2 6.01779 0.00477308 0.000697374 101.914
2 6.01784 0.00500005 0.000763118 101.905
2 6.0179 0.00522703 0.000828803 101.926
3 6.01795 0.005454 0.000894606 101.889
3 6.018 0.00568086 0.000960231 101.895
3 6.01805 0.00590783 0.00102603 101.87
")
I would like to create unique column names by combining The "ID" with the Column name so that it looks like this:
Datalong <- read.table(header = T, text = "
1A 1B 1C 1D 2A 2B 2C 2D 3A 3B 3C 3D
6.01764 0.00409222 0.000500143 101.816 6.01779 0.00477308 0.000697374 101.914 6.01795 0.005454 0.000894606 101.889
6.01769 0.00431931 0.000565946 101.334 6.01784 0.00500005 0.000763118 101.905 6.018 0.00568086 0.000960231 101.895
6.01774 0.00454617 0.00063163 101.923 6.0179 0.00522703 0.000828803 101.926 6.01805 0.00590783 0.00102603 101.87
")
I am thinking I might need to add a new column that counts each instance of the ID column (as it is time series data)
I have tried:
DataNew <- Data %>% pivot_wider(names_from = ID, values_from = c(ID, colnames(Data)))
And
Data %>% group_by(ID) %>% mutate(time = row_number()) %>% pivot_wider(names_from = time, values_from = c(ID, colnames(Data)))
but to no avail. Any support would be greatly appreciated!
How about this:
# Widen `Data` so that every ID/column combination (1A, 1B, ..., 3D) becomes
# its own column, with one row per observation within an ID.
DataNew <- Data %>%
  pivot_longer(-ID, names_to = "var", values_to = "vals") %>%
  group_by(ID, var) %>%
  mutate(
    obs = row_number(),          # position of the observation within ID/var
    vnames = paste0(ID, var)     # target column name, e.g. "1A"
  ) %>%
  ungroup() %>%
  select(-c(ID, var)) %>%
  pivot_wider(names_from = vnames, values_from = vals) %>%
  select(-obs)
DataNew
# # A tibble: 3 x 12
# `1A` `1B` `1C` `1D` `2A` `2B` `2C` `2D` `3A` `3B`
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 6.02 0.00409 5.00e-4 102. 6.02 0.00477 6.97e-4 102. 6.02 0.00545
# 2 6.02 0.00432 5.66e-4 101. 6.02 0.00500 7.63e-4 102. 6.02 0.00568
# 3 6.02 0.00455 6.32e-4 102. 6.02 0.00523 8.29e-4 102. 6.02 0.00591
# # … with 2 more variables: `3C` <dbl>, `3D` <dbl>
We can use dcast from data.table
library(data.table)
dcast(setDT(Data), rowid(ID) ~ ID, value.var = c('A', 'B', 'C', 'D'))

Using summarize_all with colMeans and colVar to create pivoted table in R

I want to use summarize_all on the following data and create my desired output, but I was curious how to do this the tidy way using some combination of mutate and summarize I think? Any help appreciated!!
dummy <- tibble(
a = 1:10,
b = 100:109,
c = 1000:1009
)
Desired Output
tibble(
Mean = colMeans(dummy[1:3]),
Variance = colVars(as.matrix(dummy[1:3])),
CV = Variance/Mean
)
Mean Variance CV
<dbl> <dbl> <dbl>
1 5.5 9.17 1.67
2 104. 9.17 0.0877
3 1004. 9.17 0.00913
It would be easier to reshape to 'long' format and then do it once after grouping by 'name'
library(dplyr)
library(tidyr)
pivot_longer(dummy, cols = everything()) %>%
group_by(name) %>%
summarise(Mean = mean(value), Variance = var(value), CV = Variance/Mean) %>%
select(-name)
# A tibble: 3 x 3
# Mean Variance CV
# <dbl> <dbl> <dbl>
#1 5.5 9.17 1.67
#2 104. 9.17 0.0877
#3 1004. 9.17 0.00913
Alternatively, use summarise_all or summarise/across; the output is then a single row, which has to be reshaped afterwards
# Summarise every column in one row, then reshape to one row per column with
# Mean / Variance / CV as columns.
dummy %>%
  summarise(across(everything(), list(
    Mean = mean,
    Variance = var,
    # CV is Variance / Mean, as in the desired output (not Mean / Variance).
    CV = ~ var(.) / mean(.)
  ))) %>%
  pivot_longer(everything()) %>%
  # Names look like "a_Mean"; split them into the column and the statistic.
  separate(name, into = c("name", "name2")) %>%
  pivot_wider(names_from = name2, values_from = value)

Resources