I have this tibble, which has a list column with a vector in each element:
# Example data: a grouping column plus a list column holding one numeric
# vector per row. tibble() replaces the deprecated data_frame().
df <- tibble(
  grp = c("A", "A", "B", "B"),
  x = rep(list(c(1, 2, 3), c(4, 5, 6)), 2)
)
What I would like to do (preferably within tidyverse) is to perform element wise addition of the vectors inside the lists,
essentially:
c(1,2,3) + c(4,5,6)
# [1] 5 7 9
This:
# A tibble: 4 × 2
grp x
<chr> <list>
A list(c(1,2,3))
A list(c(4,5,6))
B list(c(1,2,3))
B list(c(4,5,6))
Becomes:
# A tibble: 2 × 2
grp y
<chr> <list>
A list(c(5,7,9))
B list(c(5,7,9))
What might be a good approach?
The following should also get you what you need:
# Add the vectors of each group element-wise with Reduce("+", ...). The result
# is wrapped in list() so that each group keeps a single list-column entry.
df %>%
  group_by(grp) %>%
  summarise(x = list(Reduce("+", x))) %>%
  ungroup()
I hope this helps.
We can try
library(dplyr)
library(tidyr)
# Length of the vectors stored in x (taken from the first element; the
# approach relies on every vector having this same length).
r1 <- lengths(df$x)[1]
unnest(df, cols = x) %>%
  group_by(grp) %>%
  # Position of each value inside its original vector. length.out = n() makes
  # this work for any number of vectors per group, not only two.
  mutate(grp1 = rep(seq_len(r1), length.out = n())) %>%
  group_by(grp1, .add = TRUE) %>%
  # Sum the values that share a position within a group.
  summarise(x = sum(x)) %>%
  group_by(grp) %>%
  # Collect the per-position sums back into one vector per group.
  summarise(x = list(x))
# A tibble: 2 × 2
# grp x
# <chr> <list>
#1 A <dbl [3]>
#2 B <dbl [3]>
Related
I am using purrr and want to use map (or a similar function) on a list column list1 within a tibble my_tibble, in combination with an external data frame my_dataframe. The idea is for my_dataframe to be merged into every item of list1:
library(dplyr)
library(purrr)
df1 <- tibble(X = 1:3, A = c("a", "b", "c"))
df2 <- tibble(X = 1:3, A = c("d", "e", "f"))
df3 <- tibble(X = 1:3, A = c("x", "y", "z"))
my_tibble <- tibble (list1 = list(df1, df2, df3), list2 = list(df1, df2, df3))
my_dataframe <- tibble(D = 1:9, A = c("d", "e", "f","a", "b", "c","x", "y", "z"))
# Left-join my_dataframe onto every tibble stored in list1. No `by` is given,
# so the join uses the columns the two tables have in common.
my_tibble <- my_tibble %>%
  mutate(list1 = map(list1, ~ left_join(.x, my_dataframe)))
The example actually works so no answer is needed
Actually, you need to mutate every column of my_tibble (if I understand you correctly) by left-joining each of its elements with my_dataframe. So you have to use mutate(across(...)) along with map, as follows:
# Left-join my_dataframe onto every nested tibble of every list column.
# "A" is the only column the nested tibbles share with my_dataframe, so it is
# the join key; this yields the X, A, D columns shown in the printed result.
my_tibble %>%
  mutate(across(
    everything(),
    ~ map(.x, function(tbl) left_join(tbl, my_dataframe, by = "A"))
  ))
# A tibble: 3 x 2
list1 list2
<list> <list>
1 <tibble[,3] [3 x 3]> <tibble[,3] [3 x 3]>
2 <tibble[,3] [3 x 3]> <tibble[,3] [3 x 3]>
3 <tibble[,3] [3 x 3]> <tibble[,3] [3 x 3]>
Check
# Same join as before, stored in `result` so one nested tibble can be
# inspected. The key is the shared column "A", which gives columns X, A and D.
result <- my_tibble %>%
  mutate(across(
    everything(),
    ~ map(.x, function(tbl) left_join(tbl, my_dataframe, by = "A"))
  ))
result$list1[[1]]
# A tibble: 3 x 3
X A D
<int> <chr> <int>
1 1 a 4
2 2 b 5
3 3 c 6
This will work as lambda function too
# Lambda (formula) variant. The outer .x is the list column handed over by
# across(); the inner .x is a single nested tibble, joined on the shared
# column "A".
my_tibble %>%
  mutate(across(
    everything(),
    ~ map(.x, ~ left_join(.x, my_dataframe, by = "A"))
  ))
This can also be done without purrr package function, as it looked pretty straightforward to me and thought you might be interested:
library(dplyr)
library(tidyr)
# Join my_dataframe onto the tibbles in list1 without purrr: flatten list1,
# join once on the flat table, then rebuild the nested column.
my_tibble %>%
# Tag each row so the flattened rows can be regrouped afterwards.
mutate(id = row_number()) %>%
# Expand list1 into ordinary rows (its columns X and A become top-level).
unnest(list1) %>%
# Attach D from my_dataframe via the shared column A.
left_join(my_dataframe, by = "A") %>%
group_by(id) %>%
# Pack X, A and D back into one nested tibble per id.
nest(data = c(X, A, D)) %>%
rename(list1 = data) %>%
ungroup() %>%
select(-id) %>%
# Put list1 back in front of list2.
relocate(list1)
# A tibble: 3 x 2
list1 list2
<list> <list>
1 <tibble [3 x 3]> <tibble [3 x 2]>
2 <tibble [3 x 3]> <tibble [3 x 2]>
3 <tibble [3 x 3]> <tibble [3 x 2]>
I am unable to get a grouped sum in a single step using nest(); I can only do it in two steps. How can I use map to loop over the data column in the output of nest()? Also, please suggest a way to include the output as a column in the existing data frame.
suppressWarnings(library(tidyverse))
# Toy data: two groups of five rows each.
tmp_df <- data.frame(
  group = rep(c(2L, 1L), each = 5),
  b = rep(c(-1, 1), each = 5)
)
# Step 1: nest the non-grouping columns into one tibble per group.
tmp_df1 <- tmp_df %>%
  group_by(group) %>%
  nest()
# Step 2: sum each nested tibble.
map(tmp_df1$data, sum)
#> [[1]]
#> [1] -5
#>
#> [[2]]
#> [1] 5
I know how to get the sum using group_by.
suppressWarnings(library(tidyverse))
tmp_df <-
data.frame(group = rep(c(2L, 1L), each = 5), b = rep(c(-1, 1), each = 5))
tmp_df %>%
group_by(group) %>%
summarise(sum = sum(b))
#> `summarise()` ungrouping output (override with `.groups` argument)
#> # A tibble: 2 x 2
#> group sum
#> <int> <dbl>
#> 1 1 5
#> 2 2 -5
Created on 2020-08-04 by the reprex package (v0.3.0)
We can use c_across
library(dplyr)
tmp_df %>%
nest_by(group) %>%
mutate(sum = sum(c_across(data)))
-output
# A tibble: 2 x 3
# Rowwise: group
# group data sum
# <int> <list<tbl_df[,1]>> <dbl>
#1 1 [5 × 1] 5
#2 2 [5 × 1] -5
Or just
tmp_df %>%
nest_by(group) %>%
mutate(sum = sum(data))
If you want to use nest you can try map_dbl :
library(tidyverse)
tmp_df %>%
group_by(group) %>%
nest() %>%
mutate(sum = map_dbl(data, ~.x %>% sum))
# group data sum
# <int> <list> <dbl>
#1 2 <tibble [5 × 1]> -5
#2 1 <tibble [5 × 1]> 5
I have a dataset like so:
df<-data.frame(x=c("A","A","A","A", "B","B","B","B","B",
"C","C","C","C","C","D","D","D","D","D"),
y= as.factor(c(rep("Eoissp2",4),rep("Eoissp1",5),"Eoissp1","Eoisp4","Automerissp1","Automerissp2","Acharias",rep("Eoissp2",3),rep("Eoissp1",2))))
I want to identify, for each subset of x, whether the corresponding levels in y are entirely duplicates containing the expression Eois. Therefore, A, B, and D will be returned in a vector because every level of A, B, and D contains the expression Eois, while C consists of various unique levels (e.g. Eois, Automeris and Acharias). For this example the output would be:
output<- c("A", "B", "D")
Using new df:
> df %>% filter(str_detect(y,"Eois")) %>% group_by(x) %>% distinct(y) %>%
count() %>% filter(n==1) %>% select(x)
# A tibble: 2 x 1
# Groups: x [2]
x
<fct>
1 A
2 B
(Answer below uses the original df posted by the question author.)
Using the pipe function in magrittr & functions from dplyr:
> df %>% group_by(x) %>% distinct(y)
# A tibble: 7 x 2
# Groups: x [3]
x y
<fct> <fct>
1 A plant1a
2 B plant1b
3 C plant1a
4 C plant2a
5 C plant3a
6 C plant4a
7 C plant5a
Then you can roll up the results like this:
> results <- df %>% group_by(x) %>% distinct(y) %>%
count() %>% filter(n==1) %>% select(x)
> results
# A tibble: 2 x 1
# Groups: x [2]
x
<fct>
1 A
2 B
If you know your original data frame is always going to come with the x's in order, you can drop the group_by part.
A dplyr based solution could be as:
library(dplyr)
# Per group of x: keep the rows whose y matches "Eoiss", strip the first run
# of digits from y, and retain the group only if it still has more than one
# row and exactly one distinct y value. Finally return the distinct x values.
df %>% group_by(x) %>%
filter(grepl("Eoiss", y)) %>%
mutate(y = sub("\\d+", "", y)) %>%
filter(n() >1 & length(unique(y)) == 1) %>%
select(x) %>% unique(.)
# A tibble: 3 x 1
# Groups: x [3]
# x
# <fctr>
#1 A
#2 B
#3 D
Data
df<-data.frame(x=c("A","A","A","A", "B","B","B","B","B",
"C","C","C","C","C","D","D","D","D","D"),
y= as.factor(c(rep("Eoissp2",4),
rep("Eoissp1",5),"Eoissp1","Eoisp4","Automerissp1","Automerissp2",
"Acharias",rep("Eoissp2",3),rep("Eoissp1",2))))
Using the tidyverse a lot, I often face the challenge of turning named vectors into a data.frame/tibble with the columns being the names of the vector.
What is the preferred/tidyverse-y way of doing this?
EDIT: This is related to: this and this github-issue
So i want:
# library() stops with an error if the package is missing, whereas require()
# only returns FALSE and lets the script fail later.
library(tidyverse)
vec <- c("a" = 1, "b" = 2)
to become this:
# A tibble: 1 × 2
a b
<dbl> <dbl>
1 1 2
I can do this via e.g.:
vec %>% enframe %>% spread(name, value)
vec %>% t %>% as_tibble
Usecase example:
# Attach packages with library() so a missing package fails immediately.
library(tidyverse)
library(rvest)
# Two XML nodes with partly different attribute sets.
txt <- c('<node a="1" b="2"></node>',
         '<node a="1" c="3"></node>')
# Parse each node, extract its attributes as a named vector, and turn every
# vector into a one-row tibble; map_df() row-binds them, filling gaps with NA.
txt %>% map(read_xml) %>% map(xml_attrs) %>% map_df(~t(.) %>% as_tibble)
Which gives
# A tibble: 2 × 3
a b c
<chr> <chr> <chr>
1 1 2 <NA>
2 1 <NA> 3
This is now directly supported using bind_rows (introduced in dplyr 0.7.0):
library(tidyverse)
vec <- c("a" = 1, "b" = 2)
# bind_rows() treats a named vector as one row; the names become the columns.
bind_rows(vec)
#> # A tibble: 1 x 2
#> a b
#> <dbl> <dbl>
#> 1 1 2
This quote from https://cran.r-project.org/web/packages/dplyr/news.html explains the change:
bind_rows() and bind_cols() now accept vectors. They are treated as rows by the former and columns by the latter. Rows require inner names like c(col1 = 1, col2 = 2), while columns require outer names: col1 = c(1, 2). Lists are still treated as data frames but can be spliced explicitly with !!!, e.g. bind_rows(!!! x) (#1676).
With this change, it means that the following line in the use case example:
txt %>% map(read_xml) %>% map(xml_attrs) %>% map_df(~t(.) %>% as_tibble)
can be rewritten as
txt %>% map(read_xml) %>% map(xml_attrs) %>% map_df(bind_rows)
which is also equivalent to
txt %>% map(read_xml) %>% map(xml_attrs) %>% { bind_rows(!!! .) }
The equivalence of the different approaches is demonstrated in the following example:
library(tidyverse)
library(rvest)
txt <- c('<node a="1" b="2"></node>',
'<node a="1" c="3"></node>')
temp <- txt %>% map(read_xml) %>% map(xml_attrs)
# x, y, and z are identical
x <- temp %>% map_df(~t(.) %>% as_tibble)
y <- temp %>% map_df(bind_rows)
z <- bind_rows(!!! temp)
identical(x, y)
#> [1] TRUE
identical(y, z)
#> [1] TRUE
z
#> # A tibble: 2 x 3
#> a b c
#> <chr> <chr> <chr>
#> 1 1 2 <NA>
#> 2 1 <NA> 3
The idiomatic way would be to splice the vector with !!! within a tibble() call so the named vector elements become column definitions :
library(tibble)
vec <- c("a" = 1, "b" = 2)
tibble(!!!vec)
#> # A tibble: 1 x 2
#> a b
#> <dbl> <dbl>
#> 1 1 2
Created on 2019-09-14 by the reprex package (v0.3.0)
This works for me: c("a" = 1, "b" = 2) %>% t() %>% tbl_df()
Interestingly you can use the as_tibble() method for lists to do this in one call. Note that this isn't best practice since this isn't an exported method.
tibble:::as_tibble.list(vec)
as_tibble(as.list(c(a=1, b=2)))
Here is a simple example. The variables are only three, but could be many more. I would like a replacement for every c(X1,X2,X3) but can't find one.
library(dplyr)
library(MASS)
df <- data.frame(expand.grid(data.frame(matrix(rep(1:7,3),ncol=3))))
df1 <- df %>%
rowwise() %>%
filter(length(unique(c(X1,X2,X3)))==3)
df1 %>%
rowwise() %>%
filter(max(c(X1,X2,X3))- min(c(X1,X2,X3)) == 2) %>%
ungroup() %>%
summarise(res = n()/ nrow(df1)) %>%
unlist %>%
as.fractions
It really seems like everything() (newly fully exported) should do the trick, but it doesn't. Especially if you're going to be doing a lot of operations on all your columns, it may be worth it to make a list column with a vector of each row, on which you can easily call unique, max, etc. Here assembled with purrr, though you could do the same with apply(df, 1, list) %>% lapply(unlist):
library(purrr)
df1 <- df %>%
mutate(data = df %>% transpose() %>% map(unlist)) %>%
rowwise() %>%
filter(length(unique(data)) == 3)
df1
# Source: local data frame [210 x 4]
# Groups: <by row>
#
# X1 X2 X3 data
# <int> <int> <int> <list>
# 1 3 2 1 <int [3]>
# 2 4 2 1 <int [3]>
# 3 5 2 1 <int [3]>
# 4 6 2 1 <int [3]>
# 5 7 2 1 <int [3]>
# 6 2 3 1 <int [3]>
# 7 4 3 1 <int [3]>
# 8 5 3 1 <int [3]>
# 9 6 3 1 <int [3]>
# 10 7 3 1 <int [3]>
# .. ... ... ... ...
df1 %>%
rowwise() %>%
filter(max(data) - min(data) == 2) %>%
ungroup() %>%
summarise(res = n() / nrow(df1)) %>%
unlist %>%
as.fractions()
# res
# 1/7
We can do this also with data.table
library(data.table)
res <- setDT(df)[df[ ,uniqueN(unlist(.SD))==3 , 1:nrow(df)]$V1][,
sum(do.call(pmax, .SD)- do.call(pmin, .SD) ==2)/.N]
as.fractions(res)
#[1] 1/7
If we need to use dplyr
library(dplyr)
df1 <- df %>%
rowwise() %>%
do(data.frame(.,i1= n_distinct(unlist(.))==3)) %>%
filter(i1) %>%
dplyr::select(-i1)
df1 %>%
do(data.frame(., i2 = do.call(pmax, .) - do.call(pmin, .) == 2)) %>%
filter(i2) %>%
ungroup() %>%
summarise(n = n()/nrow(df1)) %>%
unlist %>%
as.fractions
# n
#1/7