Related
I'd like to create a new data table which is the sum across rows from variables which contain a string. I have been trying to keep this within the tidyverse as a noob using new dplyr across. Help much appreciated.
dat<- data.frame("Image" = c(1,2,3,4),
"A" = c(1,2,3,4),
"A:B"= c(5,6,7,8),
"A:B:C"= c(9,10,11,12))
to obtain the sums across the rows of variables containing "A", "B", or "C".
datsums<- data.frame("Image" = c(1,2,3,4),
"Asum"= c(15,18,21,24),
"Bsum"=c(14,16,18,20),
"Csum"=c(9,10,11,12))
I have been unsuccessful using the newer dplyr verbs:
datsums<- dat %>% summarise(across(str_detect("A")), sum, .names ="Asum",
across(str_detect("B")), sum, .names="Bsum",
across(str_detect("C")), sum, .names"Csum")
use rowwise and c_across:
library(tidyverse)
dat %>%
rowwise() %>%
summarise(
Asum = sum(c_across(contains("A"))),
Bsum = sum(c_across(contains("B"))),
Csum = sum(c_across(contains("C")))
)
Returns:
`summarise()` ungrouping output (override with `.groups` argument)
# A tibble: 4 x 3
Asum Bsum Csum
<dbl> <dbl> <dbl>
1 16 14 9
2 20 16 10
3 24 18 11
4 28 20 12
To add columns to the original data.frame, use mutate instead of summarise:
dat %>%
rowwise() %>%
mutate(
Asum = sum(c_across(contains("A"))),
Bsum = sum(c_across(contains("B"))),
Csum = sum(c_across(contains("C")))
)
# A tibble: 4 x 7
# Rowwise:
Image A A.B A.B.C Asum Bsum Csum
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 5 9 16 14 9
2 2 2 6 10 20 16 10
3 3 3 7 11 24 18 11
4 4 4 8 12 28 20 12
Since you want row-wise sum you could use :
library(dplyr)
dat %>%
transmute(Asum = rowSums(select(., contains('A', ignore.case = FALSE))),
Bsum = rowSums(select(., contains('B', ignore.case = FALSE))),
Csum = rowSums(select(., contains('C', ignore.case = FALSE))))
Or for many variables use :
cols <- c('A', 'B', 'C')
purrr::map_dfc(cols, ~dat %>%
transmute(!!paste0(.x, 'sum') :=
rowSums(select(., contains(.x, ignore.case = FALSE)))))
# Asum Bsum Csum
#1 15 14 9
#2 18 16 10
#3 21 18 11
#4 24 20 12
use pivot_longer and pivot_wider
library(tidyverse)
dat %>%
pivot_longer(-Image) %>%
separate_rows(name, sep = "\\.") %>%
pivot_wider(Image,
names_from = name,
values_from = value,
values_fn = sum,
names_prefix = "sum")
#> # A tibble: 4 x 4
#> Image sumA sumB sumC
#> <dbl> <dbl> <dbl> <dbl>
#> 1 1 15 14 9
#> 2 2 18 16 10
#> 3 3 21 18 11
#> 4 4 24 20 12
Created on 2020-12-07 by the reprex package (v0.3.0)
I have a dataset with consistently named columns and I would like to take the average of the columns by their group e.g.,
library(dplyr)
library(purrr)
library(glue)
df <- tibble(`1_x_blind` = 1:3,
`1_y_blind` = 7:9,
`2_x_blind` = 4:6,
`2_y_blind` = 5:7)
df %>%
mutate(`1_overall_test` = rowMeans(select(., matches(glue("^1_.*_blind$")))))
#> # A tibble: 3 x 5
#> `1_x_blind` `1_y_blind` `2_x_blind` `2_y_blind` `1_overall_test`
#> <int> <int> <int> <int> <dbl>
#> 1 1 7 4 5 4
#> 2 2 8 5 6 5
#> 3 3 9 6 7 6
This method works fine. The next step for me would be to scale it so that I can do the entire series of columns e.g., something like
df %>%
mutate(overall_blind = map(1:2, ~rowMeans(select(., matches(glue("^{.x}_.*_blind$"))))))
#> Error: Problem with `mutate()` input `overall_blind`.
#> x no applicable method for 'select' applied to an object of class "c('integer', 'numeric')"
#> ℹ Input `overall_blind` is `map(1:2, ~rowMeans(select(., matches(glue("^{.x}_.*_blind$")))))`.
I think the problem here is that select is confusing the . operator. Is it possible to map over a series of column names in this way? Ideally I'd like the column names to follow the {.x}_overall pattern as in the example above.
Update Here's a cleaner way that doesn't require rename or bind_cols:
map_dfc(1:2,
function(x) df %>%
select(matches(glue("^{x}_.*_blind$"))) %>%
mutate("{x}_overall_blind" := rowMeans(.))
)
# A tibble: 3 x 6
`1_x_blind` `1_y_blind` `1_overall_blind` `2_x_blind` `2_y_blind` `2_overall_blind`
<int> <int> <dbl> <int> <int> <dbl>
1 1 7 4 4 5 4.5
2 2 8 5 5 6 5.5
3 3 9 6 6 7 6.5
Previous
Here's a map approach.
The challenge is mutating two new columns based on separate groups of existing columns. Easiest just to do that in its own map_dfc() and then bind that to the existing df.
df %>%
bind_cols(
map_dfc(1:2, ~rowMeans(df %>% select(matches(glue("^{.x}_.*_blind$"))))) %>%
rename_with(~paste0(str_replace(., "\\...", ""), "_overall_blind"))
)
# A tibble: 3 x 6
`1_x_blind` `1_y_blind` `2_x_blind` `2_y_blind` `1_overall_blind` `2_overall_blind`
<int> <int> <int> <int> <dbl> <dbl>
1 1 7 4 5 4 4.5
2 2 8 5 6 5 5.5
3 3 9 6 7 6 6.5
And here's a way to get your rowwise column-group averages using pivots, which avoids regex and mutate/map operations:
df %>%
mutate(row = row_number()) %>%
pivot_longer(-row) %>%
separate(name, c("grp"), sep = "_", extra = "drop") %>%
group_by(row, grp) %>%
summarise(overall_blind = mean(value)) %>%
ungroup() %>%
pivot_wider(id_cols = row, names_from = grp, values_from = overall_blind,
names_glue = "{grp}_{.value}") %>%
bind_cols(df)
# A tibble: 3 x 6
`1_overall_blind` `2_overall_blind` `1_x_blind` `1_y_blind` `2_x_blind` `2_y_blind`
<dbl> <dbl> <int> <int> <int> <int>
1 4 4.5 1 7 4 5
2 5 5.5 2 8 5 6
3 6 6.5 3 9 6 7
Here is one solution:
map_dfc(1:2, function(x) {
select(df, matches(glue("^{x}_.*_blind$"))) %>%
mutate(overall_blind = rowMeans(select(., matches(glue("^{x}_.*_blind$"))))) %>%
# General but not perfect names
# set_names(paste0(x, "_", names(.)))
# Hand-tailored names
set_names(c(names(.)[1], names(.)[2], paste0(x, "_", names(.)[3])))
})
#> # A tibble: 3 x 6
#> `1_x_blind` `1_y_blind` `1_overall_blind` `2_x_blind` `2_y_blind` `2_overall_blind`
#> <int> <int> <dbl> <int> <int> <dbl>
#> 1 1 7 4 4 5 4.5
#> 2 2 8 5 5 6 5.5
#> 3 3 9 6 6 7 6.5
I added two possibilities of naming the overall_blind column for each group, one more general but not perfect names (it duplicates the 1_ or 2_ for the data columns), and another that gives the names that you want but require knowing in advance the number of columns per group.
We can use split.default to split the data into list of datasets based on the column name pattern, then get the rowMeans and bind with the original data
library(dplyr)
library(purrr)
library(stringr)
df %>%
split.default(readr::parse_number(names(.))) %>%
map_dfc(rowMeans) %>%
set_names(str_c(names(.), "_overall_blind")) %>%
bind_cols(df, .)
# A tibble: 3 x 6
# `1_x_blind` `1_y_blind` `2_x_blind` `2_y_blind` `1_overall_blind` `2_overall_blind`
# <int> <int> <int> <int> <dbl> <dbl>
#1 1 7 4 5 4 4.5
#2 2 8 5 6 5 5.5
#3 3 9 6 7 6 6.5
This question is similar to R: How to extract a list from a dataframe?
But I could not implement it to my question in an easy way.
weird_df <- data_frame(col1 =c('hello', 'world', 'again'),col_weird = list(list(12,23), list(23,24), NA),col_weird2 = list(list(0,45), list(4,45),list(45,45.45,23)))
weird_df
# A tibble: 3 x 3
col1 col_weird col_weird2
<chr> <list> <list>
1 hello <list [2]> <list [2]>
2 world <list [2]> <list [2]>
3 again <lgl [1]> <list [3]>
>
I want in the columns col_weirdand col_weird2 to only display the first value of the current list.
col1 col_weird col_weird2
1 hello 12 0
2 world 23 4
3 again NA 45
My real problem has a lot of columns.I tried this (altered acceptend answer in posted link)
library(tidyr)
library(purrr)
weird_df %>%
mutate(col_weird = map(c(col_weird,col_weird2), toString ) ) %>%
separate(col_weird, into = c("col1"), convert = TRUE) %>%
separate(col_weird2, into = c("col2",convert = T)
One solution would be to write a simple function that extracts the first value from each list in a vector of lists . This you can then apply to the relevant columns in your data frame.
library(tibble)
#create data
weird_df <- tibble(col1 =c('hello', 'world', 'again'),
col_weird = list(list(12,23), list(23,24), NA),
col_weird2 = list(list(0,45), list(4,45), list(45,45.45,23)))
#function to extract first values from a vector of lists
fnc <- function(x) {
sapply(x, FUN = function(y) {y[[1]]})
}
#apply function to the relevant columns
weird_df[,2:3] <- apply(weird_df[,2:3], MARGIN = 2, FUN = fnc)
weird_df
# A tibble: 3 x 3
col1 col_weird col_weird2
<chr> <dbl> <dbl>
1 hello 12 0
2 world 23 4
3 again NA 45
Here is a dplyr solution
library(dplyr)
weird_df %>% mutate(across(c(col_weird, col_weird2), ~vapply(., `[[`, numeric(1L), 1L)))
Output
# A tibble: 3 x 3
col1 col_weird col_weird2
<chr> <dbl> <dbl>
1 hello 12 0
2 world 23 4
3 again NA 45
For example, I have a tidy data frame like this:
df <- tibble(id=1:2,
ctn=list(list(a="x",b=1),
list(a="y",b=2)))
# A tibble: 2 x 2
id ctn
<int> <list>
1 1 <list [2]>
2 2 <list [2]>
How could I unnest ctn column to the right so that the data frame will be like this:
# A tibble: 2 x 3
id a b
<int> <chr> <dbl>
1 1 x 1
2 2 y 2
With dplyr and purrr
df %>%
mutate(ctn = map(ctn, as_tibble)) %>%
unnest()
# A tibble: 2 x 3
id a b
<int> <chr> <dbl>
1 1 x 1
2 2 y 2
One option is
library(data.table)
setDT(df)[, unlist(ctn, recursive = FALSE), id]
# id a b
#1: 1 x 1
#2: 2 y 2
Or with tidyr
library(tidyverse)
df$ctn %>%
setNames(., df$id) %>%
bind_rows(., .id = 'id')
# A tibble: 2 x 3
# id a b
# <chr> <chr> <dbl>
#1 1 x 1
#2 2 y 2
In a tidy way we can now (dplyr 1.0.2 and above) do this using rowwise():
df %>% rowwise() %>% mutate(as_tibble(ctn))
# A tibble: 2 x 4
# Rowwise:
id ctn a b
<int> <list> <chr> <dbl>
1 1 <named list [2]> x 1
2 2 <named list [2]> y 2
And sticking to purrr we can also:
df %>% mutate(map_dfr(ctn, as_tibble))
# A tibble: 2 x 4
id ctn a b
<int> <list> <chr> <dbl>
1 1 <named list [2]> x 1
2 2 <named list [2]> y 2
I'm joining data frames (tibbles) that have duplicated columns that I do not want to join. Example below is what I would usually do (joining by i, but not a or b):
library(dplyr)
df1 <- tibble(i = letters[1:3], a = 1:3, b = 4:6)
df2 <- tibble(i = letters[1:3], a = 11:13, b = 14:16)
d <- full_join(df1, df2, by ="i")
d
#> # A tibble: 3 × 5
#> i a.x b.x a.y b.y
#> <chr> <int> <int> <int> <int>
#> 1 a 1 4 11 14
#> 2 b 2 5 12 15
#> 3 c 3 6 13 16
I want these duplicated variables to be returned as nested lists such as the output created below:
tibble(
i = letters[1:3],
a = list(c(1, 11), c(2, 12), c(3, 13)),
b = list(c(4, 14), c(5, 15), c(6, 16))
)
#> # A tibble: 3 × 3
#> i a b
#> <chr> <list> <list>
#> 1 a <dbl [2]> <dbl [2]>
#> 2 b <dbl [2]> <dbl [2]>
#> 3 c <dbl [2]> <dbl [2]>
Is there a simple way to do such a thing?
Aside, I've been playing around (unsuccessfully) with various stringr and tidyr methods. Here's an example that throws an error:
library(stringr)
library(tidyr)
# Find any variables with .x or .y
dup_var <- d %>% select(matches("\\.[xy]")) %>% names()
# Condense to the stems (original names) of these variables
dup_var_stems <- dup_var %>% str_replace("(\\.[x|y])+", "") %>% unique()
# For each stem, try to nest relevant data into a single variable
for (stem in dup_var_stems) {
d <- d %>% nest_(key_col = stem, nest_cols = names(d)[str_detect(names(d), paste0(stem, "[$|\\.]"))])
}
UPDATE
After answers from #Sotos and #conor, I'll mention that the solution needs to generalise to multiple joining and duplicated columns over many data frames. Below is an example where joining is done on five data frames by two columns (i and j). This creates five duplicated versions of columns a and b, with plenty of unique columns too c:g. One problem is that duplicating over so many data frames results in duplicated versions having no suffix, .x, .x.x, and so on. So simple regex match for .x|.y will miss the no-suffix version of the column.
library(dplyr)
library(purrr)
id_cols <- tibble(i = c("x", "x", "y", "y"),
j = c(1, 2, 1, 2))
df1 <- id_cols %>% cbind(tibble(a = 1:4, b = 5:8, c = 21:24))
df2 <- id_cols %>% cbind(tibble(a = 2:5, b = 6:9, d = 31:34))
df3 <- id_cols %>% cbind(tibble(a = 2:5, b = 6:9, e = 31:34))
df4 <- id_cols %>% cbind(tibble(a = 2:5, b = 6:9, f = 31:34))
df5 <- id_cols %>% cbind(tibble(a = 2:5, b = 6:9, g = 31:34))
datalist <- list(df1, df2, df3, df4, df5)
d <- reduce(datalist, full_join, by = c("i", "j"))
d
#> i j a.x b.x c a.y b.y d a.x.x b.x.x e a.y.y b.y.y f a b g
#> 1 x 1 1 5 21 2 6 31 2 6 31 2 6 31 2 6 31
#> 2 x 2 2 6 22 3 7 32 3 7 32 3 7 32 3 7 32
#> 3 y 1 3 7 23 4 8 33 4 8 33 4 8 33 4 8 33
#> 4 y 2 4 8 24 5 9 34 5 9 34 5 9 34 5 9 34
Here is one attempt,
library(dplyr)
library(tidyr)
melt(d, id.vars = 'i') %>%
group_by(a = sub('\\..*', '', variable), i) %>%
summarise(new = list(value)) %>%
spread(a, new)
# A tibble: 3 × 3
# i a b
#* <chr> <list> <list>
#1 a <int [2]> <int [2]>
#2 b <int [2]> <int [2]>
#3 c <int [2]> <int [2]>
#With structure
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 3 obs. of 3 variables:
$ i: chr "a" "b" "c"
$ a:List of 3
..$ : int 1 11
..$ : int 2 12
..$ : int 3 13
$ b:List of 3
..$ : int 4 14
..$ : int 5 15
..$ : int 6 16
#Or via reshape2 package
library(dplyr)
library(reshape2)
d1 <- melt(d, id.vars = 'i') %>%
group_by(a = sub('\\..*', '', variable), i) %>%
summarise(new = list(value))
d2 <- dcast(d1, i ~ a, value.var = 'new')
#d2
# i a b
#1 a 1, 11 4, 14
#2 b 2, 12 5, 15
#3 c 3, 13 6, 16
#with structure:
str(d2)
'data.frame': 3 obs. of 3 variables:
$ i: chr "a" "b" "c"
$ a:List of 3
..$ : int 1 11
..$ : int 2 12
..$ : int 3 13
$ b:List of 3
..$ : int 4 14
..$ : int 5 15
..$ : int 6 16
EDIT
To follow your thought,
library(dplyr)
library(reshape2)
library(purrr)
library(tidyr)
df <- melt(d, id.vars = c(names(d)[!grepl('a|b', names(d))]))
dots <- names(df)[!grepl('value', names(df))] %>% map(as.symbol)
df %>% mutate(variable = sub('\\..*', '', variable)) %>%
group_by_(.dots = dots) %>%
summarise(new = list(value)) %>%
spread(variable, new) %>%
ungroup()
# A tibble: 4 × 9
# i j c d e f g a b
#* <chr> <dbl> <int> <int> <int> <int> <int> <list> <list>
#1 x 1 21 31 31 31 31 <int [5]> <int [5]>
#2 x 2 22 32 32 32 32 <int [5]> <int [5]>
#3 y 1 23 33 33 33 33 <int [5]> <int [5]>
#4 y 2 24 34 34 34 34 <int [5]> <int [5]>
Slightly more verbose than Sotos answer, but this will also work.
library(dplyr)
library(tidyr)
library(stringr)
d_tidy <- gather(d, col, val, a.x:b.y, -i)
d_tidy$col <- str_replace(d_tidy$col, ".x|.y", "")
d_tidy %>% group_by(i, col) %>%
summarise(val = list(val)) %>%
spread(col, val) %>%
ungroup()
i a b
<fctr> <list> <list>
1 a <int [2]> <int [2]>
2 b <int [2]> <int [2]>
3 c <int [2]> <int [2]>
If you want to use nest to create lists of dataframes you can do this instead
d_tidy <- gather(d, col, val, a.x:b.y, -i)
d_tidy$col <- str_replace(d_tidy$col, ".x|.y", "")
d_tidy %>%
group_by(i, col) %>%
nest(col) %>%
spread(col, data)
i a b
<fctr> <list> <list>
1 a <tbl_df [2,0]> <tbl_df [2,0]>
2 b <tbl_df [2,0]> <tbl_df [2,0]>
3 c <tbl_df [2,0]> <tbl_df [2,0]>
After updating the question, I arrived at the following based on the melt() solution provided by #Sotos (so please upvote that solution too if you think this works).
The following is a function that should take a data frame like the ones described, and nest duplicated columns. See comments throughout for explanation.
Create the problem data frame:
library(dplyr)
library(purrr)
id_cols <- tibble(i = c("x", "x", "y", "y"),
j = c(1, 2, 1, 2))
df1 <- id_cols %>% cbind(tibble(a = 1:4, b = 5:8, c = 21:24))
df2 <- id_cols %>% cbind(tibble(a = 2:5, b = 6:9, d = 31:34))
df3 <- id_cols %>% cbind(tibble(a = 2:5, b = 6:9, e = 31:34))
df4 <- id_cols %>% cbind(tibble(a = 2:5, b = 6:9, f = 31:34))
df5 <- id_cols %>% cbind(tibble(a = 2:5, b = 6:9, g = 31:34))
datalist <- list(df1, df2, df3, df4, df5)
d <- reduce(datalist, full_join, by = c("i", "j"))
d
#> i j a.x b.x c a.y b.y d a.x.x b.x.x e a.y.y b.y.y f a b g
#> 1 x 1 1 5 21 2 6 31 2 6 31 2 6 31 2 6 31
#> 2 x 2 2 6 22 3 7 32 3 7 32 3 7 32 3 7 32
#> 3 y 1 3 7 23 4 8 33 4 8 33 4 8 33 4 8 33
#> 4 y 2 4 8 24 5 9 34 5 9 34 5 9 34 5 9 34
Create function nest_duplicates()
# Function to nest duplicated columns after joining multiple data frames
#
# Args:
# df Data frame of joined data frames with duplicated columns.
# suffixes Character string to match suffixes. E.g., the default "\\.[xy]"
# finds any columns ending with .x or .y
#
# Depends on: dplyr, tidyr, purrr, stringr
nest_duplicated <- function(df, suffixes = "\\.[xy]") {
# Search string to match any duplicated variables
search_string <- df %>%
dplyr::select(dplyr::matches(suffixes)) %>%
names() %>%
stringr::str_replace_all(suffixes, "") %>%
unique() %>%
stringr::str_c(collapse = "|") %>%
stringr::str_c("(", ., ")($|", suffixes, ")")
# Gather duplicated variables and convert names to stems
df <- df %>%
tidyr::gather(variable, value, dplyr::matches(search_string)) %>%
dplyr::mutate(variable = stringr::str_replace_all(variable, suffixes, ""))
# Group by all columns except value to convert duplicated rows into list, then
# spread by variable (var)
dots <- names(df)[!stringr::str_detect(names(df), "value")] %>% purrr::map(as.symbol)
df %>%
dplyr::group_by_(.dots = dots) %>%
dplyr::summarise(new = list(value)) %>%
tidyr::spread(variable, new) %>%
dplyr::ungroup()
}
Apply nest_duplicates():
nest_duplicated(d)
#> # A tibble: 4 × 9
#> i j c d e f g a b
#> * <chr> <dbl> <int> <int> <int> <int> <int> <list> <list>
#> 1 x 1 21 31 31 31 31 <int [5]> <int [5]>
#> 2 x 2 22 32 32 32 32 <int [5]> <int [5]>
#> 3 y 1 23 33 33 33 33 <int [5]> <int [5]>
#> 4 y 2 24 34 34 34 34 <int [5]> <int [5]>
Updates/improvements welcome!