Unfortunately, I have spotted a weird inconsistency in the colnames when cbind different 2 particular objects: tibbles that has been by_group()ed and matrix. I writing this here because I would understand what is going on under the hood with the cbind operation and these 2 objects.
Consider the following objects:
Simple tibble
library(tidyverse)
tbl <- tibble(tbl_name = seq(1,8))
# # A tibble: 8 x 1
# tbl_name
# <int>
# 1 1
# 2 2
# 3 3
# 4 4
# 5 5
# 6 6
# 7 7
# 8 8
Simple data.frame
df <- data.frame(df_name = seq(1,8))
df
# df_name
# 1 1
# 2 2
# 3 3
# 4 4
# 5 5
# 6 6
# 7 7
# 8 8
Simple matrix
mtx <- matrix(seq(1,8), nrow = 8)
colnames(mtx) <- "mtx_name"
# mtx_name
# [1,] 1
# [2,] 2
# [3,] 3
# [4,] 4
# [5,] 5
# [6,] 6
# [7,] 7
# [8,] 8
by_grouped tibble
tb2 <- tibble(tbl2_name = seq(1,8),
tbl_group_by = c("a","b","b","c","d","d","d","d"))
tb2 <- tb2 %>%
group_by(tbl_group_by) %>%
mutate(N_by_group = n())
# A tibble: 8 x 3
# Groups: tbl_group_by [4]
# tbl2_name tbl_group_by N_by_group
# <int> <chr> <int>
# 1 1 a 1
# 2 2 b 2
# 3 3 b 2
# 4 4 c 1
# 5 5 d 4
# 6 6 d 4
# 7 7 d 4
# 8 8 d 4
When cbind them:
>This works (a.k.a: keeps the correct names)
# Comparison
# tibble & data.frame: OK
cbind(tbl,df)
# tbl_name df_name
# 1 1 1
# 2 2 2
# 3 3 3
# 4 4 4
# 5 5 5
# 6 6 6
# 7 7 7
# 8 8 8
# matrix & data.frame: OK
cbind(mtx,df)
# mtx_name df_name
# 1 1 1
# 2 2 2
# 3 3 3
# 4 4 4
# 5 5 5
# 6 6 6
# 7 7 7
# 8 8 8
# tibble & matrix: OK
cbind(tbl,mtx)
# tbl_name mtx_name
# 1 1 1
# 2 2 2
# 3 3 3
# 4 4 4
# 5 5 5
# 6 6 6
# 7 7 7
# 8 8 8
This doesn't work (a.k.a: destroyed the colname of the matrix)
# tibble(group_by()) & matrix: oops!!!!
cbind(tb2,mtx)
# New names:
# * NA -> ...4
# # A tibble: 8 x 4
# # Groups: tbl_group_by [4]
# tbl2_name tbl_group_by N_by_group ...4[,"mtx_name"]
# <int> <chr> <int> <int>
# 1 1 a 1 1
# 2 2 b 2 2
# 3 3 b 2 3
# 4 4 c 1 4
# 5 5 d 4 5
# 6 6 d 4 6
# 7 7 d 4 7
# 8 8 d 4 8
Any intuition of what's happening or how to prevent it, is very welcome. Thank you in advance.
We can remove the group attributes with ungroup and now cbind should work
library(dplyr)
cbind(ungroup(tb2), mtx)
-output
# tbl2_name tbl_group_by N_by_group mtx_name
#1 1 a 1 1
#2 2 b 2 2
#3 3 b 2 3
#4 4 c 1 4
#5 5 d 4 5
#6 6 d 4 6
#7 7 d 4 7
#8 8 d 4 8
Or specifically use cbind.data.frame because by default it may use cbind.matrix
cbind.data.frame(tb2, mtx)
When we create the 'tb2', after grouping, make sure to ungroup to prevent this kind of issues
tb2 <- tb2 %>%
group_by(tbl_group_by) %>%
mutate(N_by_group = n()) %>%
ungroup
Or make use of is_grouped_df to find if the data is grouped or not and then ungroup
f1 <- function(dat) {
if(dplyr::is_grouped_df(dat)) {
dat <- ungroup(dat)
}
dat
}
cbind(f1(tb2), mtx)
Related
Hello I have a data frame of 245 columns but to add some sets and generate new columns try to do it recursively as follows
cl1<-sample(1:4,10,replace=TRUE)
cl2<-sample(1:4,10,replace=TRUE)
cl3<-sample(1:4,10,replace=TRUE)
cl4<-sample(1:4,10,replace=TRUE)
cl5<-sample(1:4,10,replace=TRUE)
cl6<-sample(1:4,10,replace=TRUE)
dat<-data.frame(cl1,cl2,cl3,cl4,cl5,cl6)
my intention is to add column 1 with column 3 and 5, likewise column 2 with 4 and 6 and in the end obtain a dataframe with two columns
and you should pay me something like that
I have programmed the following code
revisar<- function(a){
todos = list()
i=1
j=3
l=5
k=1
while(i<=2 ){
cl<-a[,i]
cl2<-a[,j]
cl3<-a[,l]
cl[is.na(cl)] <- 0
cl2[is.na(cl2)] <- 0
cl3[is.na(cl3)] <- 0
colu<-cl+cl2+cl3
col<-cbind(colu,colu)
i<-i+1
j<-j+1
l<-l+1
k<-k+1
}
return(col)
}
it turns out that it only returns column 2 repeated twice and I must replicate the same thing to join those 245 columns.7
I would like to know what is failing the example
base R
Literal programming:
with(dat, data.frame(s1 = cl1+cl3+cl5, s2 = cl2+cl4+cl6))
# s1 s2
# 1 7 11
# 2 7 7
# 3 4 11
# 4 4 10
# 5 9 8
# 6 12 5
# 7 7 6
# 8 7 10
# 9 4 9
# 10 6 5
Programmatically,
L <- list(s1 = c(1,3,5), s2 = c(2,4,6))
out <- data.frame(lapply(L, function(z) do.call(rowSums, list(as.matrix(dat[,z])))))
out
# s1 s2
# 1 7 11
# 2 7 7
# 3 4 11
# 4 4 10
# 5 9 8
# 6 12 5
# 7 7 6
# 8 7 10
# 9 4 9
# 10 6 5
dplyr
library(dplyr)
dat %>%
transmute(
s1 = rowSums(cbind(cl1, cl3, cl5)),
s2 = rowSums(cbind(cl2, cl4, cl6))
)
or programmatically using purrr:
purrr::map_dfc(L, ~ rowSums(dat[, .]))
Data
set.seed(42)
# your `dat` above
Here is an alternative general approach:
Here we sum all uneven columns -> s1 and
all even columns -> s2:
library(dplyr)
dat %>%
rowwise() %>%
mutate(s1 = sum(c_across(seq(1,ncol(dat),2)), na.rm = TRUE),
s2 = sum(c_across(seq(2,ncol(dat),2)), na.rm = TRUE))
cl1 cl2 cl3 cl4 cl5 cl6 s1 s2
<int> <int> <int> <int> <int> <int> <int> <int>
1 1 1 3 2 3 2 7 5
2 2 4 1 4 2 3 5 11
3 2 2 2 2 1 3 5 7
4 2 4 4 3 1 4 7 11
5 2 4 4 3 2 2 8 9
6 3 3 3 2 2 2 8 7
7 2 1 1 2 1 4 4 7
8 2 4 1 3 2 3 5 10
9 3 1 1 2 3 4 7 7
10 2 4 1 3 4 4 7 11
xx is a sample data. it contains variables dep1,dep2,dep3,bet1,bet2,bet3. I want to select all possible 2 columns combinations but not the ones with the same name (except,number) . In this examples there are
9 such combos {dep1:bet1,dep1:bet2,dep1:bet3,dep2:bet1...........}
Below is code which I want to run for all combinations ( I did it just for one) also in last line
I added a code to keep track which variables were included in calculations. I believe the regex will help
to understand. help appreciated !
xx<-data.frame(id=1:10,
category=c(rep("A",5),rep("B",5)),
dep1=sample(1:5,10,replace = T),
dep2=sample(1:5,10,replace = T),
dep3=sample(1:5,10,replace = T),
bet1=sample(1:5,10,replace = T),
bet2=sample(1:5,10,replace = T),
bet3=sample(1:5,10,replace = T))
xx%>%select(2,dep1,bet1)%>%
mutate(vdep=if_else(dep1>3,1,0),
vbet=if_else(bet1>3,1,0))%>%
group_by(category)%>%
summarise(vdep=mean(vdep),
vbet=mean(vbet))%>%ungroup()%>%
gather(variable,value,-category)%>%
mutate(variable=as.factor(variable))%>%
unite(variable,category,col = "new")%>%
spread(new,value)%>%
mutate(first="dep1",second="bet1")
If I understand you correctly, something like the following should do it:
# the data
xx<-data.frame(id=1:10,
category=c(rep("A",5),rep("B",5)),
dep1=sample(1:5,10,replace = T),
dep2=sample(1:5,10,replace = T),
dep3=sample(1:5,10,replace = T),
bet1=sample(1:5,10,replace = T),
bet2=sample(1:5,10,replace = T),
bet3=sample(1:5,10,replace = T))
# Getting the column names with "dep" or "bet"
cols = names(xx)[grepl("dep|bet", names(xx))]
deps = cols[grepl("dep", cols)]
bets = cols[grepl("bet", cols)]
# Getting all possible combinations of these columns
comb = expand.grid(deps, bets)
comb
# Var1 Var2
# 1 dep1 bet1
# 2 dep2 bet1
# 3 dep3 bet1
# 4 dep1 bet2
# 5 dep2 bet2
# 6 dep3 bet2
# 7 dep1 bet3
# 8 dep2 bet3
# 9 dep3 bet3
# Transposing the dataframe containing these combinations, so that
# we can directly use sapply / lapply on the columns latter
comb = data.frame(t(comb), stringsAsFactors = FALSE)
# For each combination, subset the dataframe xx
result = sapply(comb, function(x){
xx[, x]
}, simplify = FALSE)
result
# $X1
# dep1 bet1
# 1 1 5
# 2 1 5
# 3 2 2
# 4 2 2
# 5 1 5
# 6 3 3
# 7 1 1
# 8 2 2
# 9 3 2
# 10 1 5
#
# $X2
# dep2 bet1
# 1 1 5
# 2 2 5
# 3 4 2
# 4 5 2
# 5 1 5
# 6 5 3
# 7 2 1
# 8 1 2
# 9 4 2
# 10 4 5
#
# $X3
# dep3 bet1
# 1 3 5
# 2 2 5
# 3 4 2
# 4 3 2
# 5 3 5
# 6 2 3
# 7 1 1
# 8 4 2
# 9 5 2
# 10 5 5
#
# $X4
# dep1 bet2
# 1 1 5
# 2 1 1
# 3 2 1
# 4 2 2
# 5 1 2
# 6 3 2
# 7 1 3
# 8 2 3
# 9 3 5
# 10 1 1
#
# $X5
# dep2 bet2
# 1 1 5
# 2 2 1
# 3 4 1
# 4 5 2
# 5 1 2
# 6 5 2
# 7 2 3
# 8 1 3
# 9 4 5
# 10 4 1
#
# $X6
# dep3 bet2
# 1 3 5
# 2 2 1
# 3 4 1
# 4 3 2
# 5 3 2
# 6 2 2
# 7 1 3
# 8 4 3
# 9 5 5
# 10 5 1
#
# $X7
# dep1 bet3
# 1 1 3
# 2 1 2
# 3 2 5
# 4 2 1
# 5 1 3
# 6 3 2
# 7 1 4
# 8 2 1
# 9 3 1
# 10 1 3
#
# $X8
# dep2 bet3
# 1 1 3
# 2 2 2
# 3 4 5
# 4 5 1
# 5 1 3
# 6 5 2
# 7 2 4
# 8 1 1
# 9 4 1
# 10 4 3
#
# $X9
# dep3 bet3
# 1 3 3
# 2 2 2
# 3 4 5
# 4 3 1
# 5 3 3
# 6 2 2
# 7 1 4
# 8 4 1
# 9 5 1
# 10 5 3
Is there an R function which can pass all the elements of a list as the arguments of a function?
library(tidyr)
a <- c(1,2,3)
b <- c(4,5,6)
c <- c(7,8,9)
d <- list(a,b,c)
crossing(d[[1]],d[[2]],d[[3]])
Instead of specifying d[[1]],d[[2]],d[[3]], i'd like to just include d
Expected result:
> crossing(d[[1]],d[[2]],d[[3]])
# A tibble: 27 x 3
`d[[1]]` `d[[2]]` `d[[3]]`
<dbl> <dbl> <dbl>
1 1 4 7
2 1 4 8
3 1 4 9
4 1 5 7
5 1 5 8
6 1 5 9
7 1 6 7
8 1 6 8
9 1 6 9
10 2 4 7
# ... with 17 more rows
You can use do.call to executes a function call and a list of arguments to be passed to it.
c(d[[1]],d[[2]],d[[3]])
#[1] 1 2 3 4 5 6 7 8 9
do.call("c", d)
#[1] 1 2 3 4 5 6 7 8 9
And for crossing, which needs not duplicated Column names:
library(tidyr)
names(d) <- seq_along(d)
do.call(crossing, d)
## A tibble: 27 x 3
# `1` `2` `3`
# <dbl> <dbl> <dbl>
# 1 1 4 7
# 2 1 4 8
# 3 1 4 9
# 4 1 5 7
# 5 1 5 8
# 6 1 5 9
# 7 1 6 7
# 8 1 6 8
# 9 1 6 9
#10 2 4 7
## … with 17 more rows
I have two lists. Each of them with many vectors (around 500) of different lengths and I would like to get a tibble data frame with three columns.
My reproducible example is the following:
> a
[[1]]
[1] 1 3 6
[[2]]
[1] 5 4
> b
[[1]]
[1] 3 4
[[2]]
[1] 5 6 7
I would like to get the following tibble data frame:
name index value
a 1 1
a 1 3
a 1 6
a 2 5
a 2 4
b 1 3
b 1 4
b 2 5
b 2 6
b 2 7
I would be grateful if someone could help me with this issue
using Base R:
transform(stack(c(a=a,b=b)),name=substr(ind,1,1),ind=substr(ind,2,2))
values ind name
1 1 1 a
2 2 1 a
3 3 1 a
4 5 2 a
5 6 2 a
6 3 1 b
7 4 1 b
8 5 2 b
9 6 2 b
10 7 2 b
using tidyverse:
library(tidyverse)
list(a=a,b=b)%>%map(~stack(setNames(.x,1:length(.x))))%>%bind_rows(.id = "name")
name values ind
1 a 1 1
2 a 2 1
3 a 3 1
4 a 5 2
5 a 6 2
6 b 3 1
7 b 4 1
8 b 5 2
9 b 6 2
10 b 7 2
Here is one option with tidyverse
library(tidyverse)
list(a= a, b = b) %>%
map_df(enframe, name = "index", .id = 'name') %>%
unnest
# A tibble: 10 x 3
# name index value
# <chr> <int> <dbl>
# 1 a 1 1
# 2 a 1 3
# 3 a 1 6
# 4 a 2 5
# 5 a 2 4
# 6 b 1 3
# 7 b 1 4
# 8 b 2 5
# 9 b 2 6
#10 b 2 7
data
a <- list(c(1, 3, 6), c(5, 4))
b <- list(c(3, 4), c(5, 6, 7))
What is the best way to batch rename columns using a lookup data frame?
Can I do it as part of a pipe?
library(tidyverse)
df <- data_frame(
a = seq(1, 10)
, b = seq(10, 1)
, c = rep(1, 10)
)
df_lookup <- data_frame(
old_name = c("b", "c", "a")
, new_name = c("y", "z", "x")
)
I know how to do it manually
df %>%
rename(x = a
, y = b
, z = c)
I am seeking a solution in tidyverse / dplyr packages.
Use rlang; Firstly build up a list of names using syms, and then splice the arguments to rename with UQS or !!! operator:
library(rlang); library(dplyr)
df %>% rename(!!!syms(with(df_lookup, setNames(old_name, new_name))))
# A tibble: 10 x 3
# x y z
# <int> <int> <dbl>
# 1 1 10 1
# 2 2 9 1
# 3 3 8 1
# 4 4 7 1
# 5 5 6 1
# 6 6 5 1
# 7 7 4 1
# 8 8 3 1
# 9 9 2 1
#10 10 1 1
You could write your own helper to make it easier
rename_to <- function(data, old, new) {
data %>% rename_at(old, function(x) new[old==x])
}
df %>% rename_to(df_lookup$old_name, df_lookup$new_name)
In base-R:
names(df)[match(df_lookup$old_name,names(df))] <- df_lookup$new_name
# # A tibble: 10 x 3
# x y z
# <int> <int> <dbl>
# 1 1 10 1
# 2 2 9 1
# 3 3 8 1
# 4 4 7 1
# 5 5 6 1
# 6 6 5 1
# 7 7 4 1
# 8 8 3 1
# 9 9 2 1
# 10 10 1 1
Using data.table:
library(data.table)
setnames(setDT(df), old = df_lookup$old_name, new = df_lookup$new_name)
# x y z
# 1: 1 10 1
# 2: 2 9 1
# 3: 3 8 1
# 4: 4 7 1
# 5: 5 6 1
# 6: 6 5 1
# 7: 7 4 1
# 8: 8 3 1
# 9: 9 2 1
# 10: 10 1 1