append or bind columns iteratively through a dataframe - r

I have a dataframe with 900 columns. I want to use tidyverse to append/bind columns in multiples of three (or another number). For example, append columns 2:3 to 1; columns 5:6 to 4, columns 8:9 to 7, and so on for the entire dataframe. Thus at the end I will have 300 columns, while keeping the name of the main column (where other columns have been appended to).
How do I do this? Thank you very much :)

A tidyverse approach:
library(tidyverse)
# data
df = data.frame(matrix(1:27, ncol=9))
names(df) <- paste('Int', rep(1:3, each=3), 'A', rep(1:3, 3), sep='_')
n = 3
df %>%
# split the data frame into three data frames
split.default(rep(1:n, ncol(df) / n)) %>%
# rename and row bind the three data frames together
map_df(
~ set_names(.x, names(df)[c(T, rep(F, n - 1))]) %>%
tibble::rownames_to_column('gene')
)
# gene Int_1_A_1 Int_2_A_1 Int_3_A_1
#1 1 1 10 19
#2 2 2 11 20
#3 3 3 12 21
#4 1 4 13 22
#5 2 5 14 23
#6 3 6 15 24
#7 1 7 16 25
#8 2 8 17 26
#9 3 9 18 27
More notes on set_names: c(T, rep(F, n - 1)) first creates the vector c(T, F, F, ...), so names(df)[c(T, rep(F, n - 1))] picks up every n-th name (starting with the first) because R recycles the logical index along names(df).
Or if you start from a matrix, you can reshape it with array function and desired shape:
m = matrix(1:27, ncol=9)
m
# [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
#[1,] 1 4 7 10 13 16 19 22 25
#[2,] 2 5 8 11 14 17 20 23 26
#[3,] 3 6 9 12 15 18 21 24 27
array(m, c(nrow(m) * 3, ncol(m) / 3))
# [,1] [,2] [,3]
# [1,] 1 10 19
# [2,] 2 11 20
# [3,] 3 12 21
# [4,] 4 13 22
# [5,] 5 14 23
# [6,] 6 15 24
# [7,] 7 16 25
# [8,] 8 17 26
# [9,] 9 18 27
To keep the names, you can use data.table::melt:
library(data.table)
Sample Data:
df = data.frame(matrix(1:27, ncol=9))
names(df) <- paste('Int', rep(1:3, each=3), 'A', rep(1:3, 3), sep='_')
df
# Int_1_A_1 Int_1_A_2 Int_1_A_3 Int_2_A_1 Int_2_A_2 Int_2_A_3 Int_3_A_1 Int_3_A_2 Int_3_A_3
#1 1 4 7 10 13 16 19 22 25
#2 2 5 8 11 14 17 20 23 26
#3 3 6 9 12 15 18 21 24 27
# create the patterns that group data frames
cols <- paste0('Int_', seq_len(ncol(df) / 3), '_A')
# melt the data.table based on the column patterns and here you also get an id column telling
# you where the data comes from the 1st, 2nd or 3rd ..
setNames(melt(setDT(df), measure=patterns(cols)), c('id', cols))
# id Int_1_A Int_2_A Int_3_A
#1: 1 1 10 19
#2: 1 2 11 20
#3: 1 3 12 21
#4: 2 4 13 22
#5: 2 5 14 23
#6: 2 6 15 24
#7: 3 7 16 25
#8: 3 8 17 26
#9: 3 9 18 27

A solution can be achieved using tidyr::unite and tidyr::separate_rows. The approach is to first unite columns in group of 3 and then use tidyr::separate_rows function to expand those in rows.
I have used the data created by @Psidom in his answer. I should also mention that the data.table::melt-based solution is the most appropriate one for this problem, but one can explore other ideas using a different approach.
library(tidyverse)
# data
df = data.frame(matrix(1:27, ncol=9))
names(df) <- paste('Int', rep(1:3, each=3), 'A', rep(1:3, 3), sep='_')
# Number of adjacent columns that are stacked into one output column.
grp_size <- 3
# Group the column names into consecutive runs of `grp_size`. The group index
# is computed from the column position, so the number of groups follows
# ncol(df) (e.g. 300 groups for 900 columns) instead of being fixed at 3.
# Within each group the columns are united into one comma-separated column
# named "<first>:<last>"; the united columns are then bound side by side and
# expanded back into rows.
# NOTE(review): unite_() is one of tidyr's deprecated standard-evaluation
# verbs; it is kept here because the new column name is built as a string.
lapply(split(names(df), ceiling(seq_along(df) / grp_size)),
       function(x) {
         # df[x] (not df[, x]) so a one-column group stays a data frame
         unite_(df[x], paste(x[1], x[length(x)], sep = ":"), x, sep = ",",
                remove = TRUE)
       }) %>%
  bind_cols() %>%
  separate_rows(., seq_len(ncol(.)), sep = ",")
# Int_1_A_1:Int_1_A_3 Int_2_A_1:Int_2_A_3 Int_3_A_1:Int_3_A_3
# 1 1 10 19
# 2 4 13 22
# 3 7 16 25
# 4 2 11 20
# 5 5 14 23
# 6 8 17 26
# 7 3 12 21
# 8 6 15 24
# 9 9 18 27

A base R solution:
df <- head(mtcars)[-1:-2] # 9 cols
df[(seq(df)-1) %% 3 == 0] <-
lapply(split(seq(df), (seq(df)-1) %/% 3),
function(x) apply(df[x], 1, paste, collapse="_"))
df <- df[(seq(df)-1) %% 3 == 0]
df
# disp wt am
# Mazda RX4 160_110_3.9 2.62_16.46_0 1_4_4
# Mazda RX4 Wag 160_110_3.9 2.875_17.02_0 1_4_4
# Datsun 710 108_93_3.85 2.32_18.61_1 1_4_1
# Hornet 4 Drive 258_110_3.08 3.215_19.44_1 0_3_1
# Hornet Sportabout 360_175_3.15 3.44_17.02_0 0_3_2
# Valiant 225_105_2.76 3.46_20.22_1 0_3_1

Related

How to split integer vector in R

I have a vector of integers that I want to split into chunks of 3; then I have to order the split parts and put them back into an integer vector.
as.integer(c(16,9,2,17,10,3,18,11,4,19,12,5,20,13,6,21,14,7,22,15,8))
First step - split like this:
16,9,2
17,10,3
18,11,4
19,12,5
20,13,6
21,14,7
22,15,8
Second step - order:
2,9,16
3,10,17
4,11,18
5,12,19
6,13,20
7,14,21
8,15,22
Third step - put back into integer vector:
2,9,16,3,10,17,4,11,18,5,12,19,6,13,20,7,14,21,8,15,22
With matrix + sort:
x <- as.integer(c(16,9,2,17,10,3,18,11,4,19,12,5,20,13,6,21,14,7,22,15,8))
c(apply(matrix(x, ncol = 3, byrow = T), 1, sort))
#[1] 2 9 16 3 10 17 4 11 18 5 12 19 6 13 20 7 14 21 8 15 22
Or with split + gl:
unlist(lapply(split(x, gl(length(x) / 3, 3)), sort))
Another shorter approach with split + rev (only works if rev and sort are the same):
c(do.call(rbind, rev(split(x, 1:3))))
#[1] 2 9 16 3 10 17 4 11 18 5 12 19 6 13 20 7 14 21 8 15 22
No {dplyr} required here.
x <- as.integer(c(16,9,2,17,10,3,18,11,4,19,12,5,20,13,6,21,14,7,22,15,8))
# Chunk number of every element: positions 1-3 -> 1, 4-6 -> 2, ...
chunk_id <- ceiling(seq_along(x) / 3)
# Split into chunks and sort each chunk in one go
spl.x <- lapply(split(x, chunk_id), sort)
# Concatenate the sorted chunks back into a single vector
Reduce(c, spl.x)
Second line (splitting) is from this answer: https://stackoverflow.com/a/3321659/2433233
This also works if the length of your original vector is no multiple of 3. The last list element is shorter in this case.
Here is one way to do steps in order:
vector=as.integer(c(16,9,2,17,10,3,18,11,4,19,12,5,20,13,6,21,14,7,22,15,8))
chunk <- 3
n <- length(vector)
r <- rep(1:ceiling(n/chunk),each=chunk)[1:n]
list_of3 <- split(vector,r)
# > list_of3
# $`1`
# [1] 16 9 2
#
# $`2`
# [1] 17 10 3
#
# $`3`
# [1] 18 11 4
#
# $`4`
# [1] 19 12 5
#
# $`5`
# [1] 20 13 6
#
# $`6`
# [1] 21 14 7
#
# $`7`
# [1] 22 15 8
sorted_list<- lapply(list_of3, function(x)sort(x))
final_vector <- unname(unlist(sorted_list))
final_vector
# > final_vector
# [1] 2 9 16 3 10 17 4 11 18 5 12 19 6 13 20 7 14 21 8 15 22
Here is one way to do it:
v <- as.integer(c(16,9,2,17,10,3,18,11,4,19,12,5,20,13,6,21,14,7,22,15,8))
res <- split(v, 0:(length(v)-1) %/%3)
unlist(lapply(res, sort), use.names = FALSE)
You can put your data into a 3 column matrix by row, sort rowwise, transpose and convert back to vector:
v <- as.integer(c(16,9,2,17,10,3,18,11,4,19,12,5,20,13,6,21,14,7,22,15,8))
m <- matrix(v, ncol = 3, byrow = TRUE)
c(t(matrix(m[order(row(m), m)], nrow(m), byrow = TRUE)))
# [1] 2 9 16 3 10 17 4 11 18 5 12 19 6 13 20 7 14 21 8 15 22
Something like this goes through every step:
v <- as.integer(c(16,9,2,17,10,3,18,11,4,19,12,5,20,13,6,21,14,7,22,15,8))
# Step 1 - split: fill a 3-column matrix by row so each row is one chunk of 3.
v2 <- matrix(v, ncol = 3, byrow = TRUE)
# [,1] [,2] [,3]
# [1,] 16 9 2
# [2,] 17 10 3
# [3,] 18 11 4
# [4,] 19 12 5
# [5,] 20 13 6
# [6,] 21 14 7
# [7,] 22 15 8
# Step 2 - order: sort every row on its own (reversing the columns would only
# work while each chunk happens to be in descending order). apply() returns
# one sorted chunk per column, so transpose to get one chunk per row again.
v3 <- t(apply(v2, 1, sort))
# [,1] [,2] [,3]
# [1,] 2 9 16
# [2,] 3 10 17
# [3,] 4 11 18
# [4,] 5 12 19
# [5,] 6 13 20
# [6,] 7 14 21
# [7,] 8 15 22
# Step 3 - put back: as.vector() reads a matrix column by column, so transpose
# first in order to read the chunks row by row.
v4 <- as.vector(t(v3))
# [1] 2 9 16 3 10 17 4 11 18 5 12 19 6 13 20 7 14 21 8 15 22

How to split one column in to multiple columns with pattern in R

we know a column x with a vector of like 21 numbers:
x
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
If I want to get multiple columns with flexible pattern like :
can set numbers in advance (n could be 3 or 4 or ...):
n=3,n1=2,n2=3,n3=2,.... the total number of columns is determined by number n.
column n=3, for column1:row=n*n1 and column2: row= n*n2, column3:row=n*n3 (Here, the number could be variables)
Final output is:(this is n=3 case, but my final goal is n could be 4,5...)
1 7 16
2 8 17
3 9 18
4 10 19
5 11 20
6 12 21
13
14
15
If n set as n=2,n1=3,n2=4. The one column number would become 14 c(1:14). (The real practice is I do not know how many columns needed to be created in advance. The column number is input by users).
Then what I want to get is n = 2 columns:
1 7
2 8
3 9
4 10
5 11
6 12
13
14
I am trying to make the columns created automatically in advance with variables.
Many thanks.
We can create a grouping variable with rep and split
split(df1$x, rep(1:3, c(6, 9, 6)))
#$`1`
#[1] 1 2 3 4 5 6
#$`2`
#[1] 7 8 9 10 11 12 13 14 15
#$`3`
#[1] 16 17 18 19 20 21
A function can be created with arguments, 'n', and additional arguments with ...
# Split the first column of `dat` into `n` consecutive groups.
#   dat : data frame (or list) whose first element holds the values
#   n   : number of groups; also the factor every multiplier is scaled by
#   ... : one multiplier per group; group i receives n * multiplier_i values
# Returns a list named "1", "2", ... with one vector per group. Only the
# first sum(n * multipliers) values of the column are used.
f1 <- function(dat, n, ...) {
  group_sizes <- n * c(...)
  used_values <- dat[[1]][seq_len(sum(group_sizes))]
  group_of_value <- rep(seq_len(n), group_sizes)
  split(used_values, group_of_value)
}
f1(df1, 2, 3, 4)
#$`1`
#[1] 1 2 3 4 5 6
#$`2`
#[1] 7 8 9 10 11 12 13 14
f1(df1, 3, 2, 3, 2)
#$`1`
#[1] 1 2 3 4 5 6
#$`2`
#[1] 7 8 9 10 11 12 13 14 15
#$`3`
#[1] 16 17 18 19 20 21
If the user submits a vector and we don't have n, then get the n from the length of the vector
# Variant of f1 that takes the multipliers as a single vector.
#   dat : data frame (or list) whose first element holds the values
#   vec : multipliers, one per group; the number of groups is length(vec)
#         and group i receives length(vec) * vec[i] values
# Returns a list named "1", "2", ... with one vector per group.
f1 <- function(dat, vec) {
  n_groups <- length(vec)
  sizes <- n_groups * vec
  keep <- seq_len(sum(sizes))
  split(dat[[1]][keep], rep(seq_len(n_groups), times = sizes))
}
f1(df1, 3:4)
If the user input 'n1', 'n2', we can use ...
# Variant of f1 that takes the multipliers as separate arguments.
#   dat : data frame (or list) whose first element holds the values
#   ... : multipliers n1, n2, ...; the number of groups is the number of
#         multipliers, and group i receives (number of groups) * n_i values
# Returns a list named "1", "2", ... with one vector per group.
f1 <- function(dat, ...) {
  multipliers <- c(...)
  group_ids <- seq_len(length(multipliers))
  group_sizes <- length(multipliers) * multipliers
  column <- dat[[1]]
  split(column[seq_len(sum(group_sizes))], rep(group_ids, group_sizes))
}
f1(df1, 3, 4)
data
df1 <- structure(list(x = 1:21), class = "data.frame", row.names = c(NA,
-21L))

ggplot: 3D matrix: plot multiple lines in a graph and multiple graphs on the same page

I have a 3D data matrix (df) of the shape[1:1000,1:221,1:2],
a reproducible example is the following:
d <- as.data.frame( matrix( 1:(5*2*3), 10, 3))
df = array( unlist(d), dim=c(5, 2, 3))
df
, , 1
[,1] [,2]
[1,] 1 6
[2,] 2 7
[3,] 3 8
[4,] 4 9
[5,] 5 10
, , 2
[,1] [,2]
[1,] 11 16
[2,] 12 17
[3,] 13 18
[4,] 14 19
[5,] 15 20
, , 3
[,1] [,2]
[1,] 21 26
[2,] 22 27
[3,] 23 28
[4,] 24 29
[5,] 25 30
the first dimension is trails, and the second dimension is outcomes, and the third dimension is people.
For each person, I want to get a graph like the following (a excel plot for the first person, df[,,1])
I want to have such a plot for each person displayed on the same page, but I am stuck on how to achieve this using ggplot.
Using your data, you can first re-organize your array in a dataframe (there are maybe easier ways to achieve this part):
# Stack the per-person slices of the 3D array `df` (trail x outcome x person)
# into one data frame with one row per (person, trail) pair.
n_trail <- dim(df)[1]
nb_person <- dim(df)[3]  # read from the array instead of being hard-coded
# Move the person axis next to the trail axis; collapsing those two axes then
# places df[, , 1], df[, , 2], ... underneath each other, which is what
# rbind-ing the slices one by one in a loop would produce, without growing an
# object on every iteration.
stacked <- aperm(df, c(1, 3, 2))
dim(stacked) <- c(n_trail * nb_person, dim(df)[2])
final_df <- data.frame(stacked)
colnames(final_df) <- c("start", "end")
# Trail index restarts for every person; person index is constant per slice.
trail <- rep(seq_len(n_trail), times = nb_person)
person <- rep(seq_len(nb_person), each = n_trail)
final_df$trail <- trail
final_df$person <- person
start end trail person
1 1 6 1 1
2 2 7 2 1
3 3 8 3 1
4 4 9 4 1
5 5 10 5 1
6 11 16 1 2
7 12 17 2 2
8 13 18 3 2
9 14 19 4 2
10 15 20 5 2
11 21 26 1 3
12 22 27 2 3
13 23 28 3 3
14 24 29 4 3
15 25 30 5 3
Then, you can reshape it using pivot_longer function from the package tidyr (if you install and load tidyverse, both tidyr and ggplot2 will be installed and loaded).
library(tidyverse)
final_df_reshaped <- final_df %>% pivot_longer(., -c(trail,person),names_to = "Variable",values_to = "value")
# A tibble: 30 x 4
trail person Variable value
<int> <int> <chr> <int>
1 1 1 start 1
2 1 1 end 6
3 2 1 start 2
4 2 1 end 7
5 3 1 start 3
6 3 1 end 8
7 4 1 start 4
8 4 1 end 9
9 5 1 start 5
10 5 1 end 10
# … with 20 more rows
Alternative using gather for older versions of tidyr
If you have an older version of tidyr (below 1.0.0), you should use gather instead of pivot_longer. (more information here: https://cmdlinetips.com/2019/09/pivot_longer-and-pivot_wider-in-tidyr/)
final_df_reshaped <- final_df %>% gather(., -c(trail,person), key = "Variable",value = "value")
And plot it using this code:
ggplot(final_df_reshaped, aes(x = Variable, y = value, group = as.factor(trail), color = as.factor(trail)))+
geom_point()+
geom_line() +
facet_grid(.~person)+
scale_x_discrete(limits = c("start","end"))
Does it answer your question ?
If you have to do that for 220 different persons, I'm not sure it will make a really readable plot. Maybe you should think of another way to plot it or to extract the useful information.

R add arrays by column name

This seems really basic but I can't figure it out. How do you add two arrays together in R by column name? For example:
a<-matrix(1:9,ncol=3)
colnames(a)<-c("A","B","C")
a
# A B C
#[1,] 1 4 7
#[2,] 2 5 8
#[3,] 3 6 9
b <-matrix(10:18,ncol=3)
colnames(b)<-c("C","B","D")
b
# C B D
#[1,] 10 13 16
#[2,] 11 14 17
#[3,] 12 15 18
I would like to add them together in such a way to yield:
# A B C D
#[1,] 1 17 17 16
#[2,] 2 19 19 17
#[3,] 3 21 21 18
I suppose I could add extra columns to both matrices but it seems like there would be a one line command to accomplish this. Thanks!
Using xtabs , after melting a combined table to a long data.frame:
xtabs(Freq ~ ., data=as.data.frame.table(cbind(a,b)))
# Var2
#Var1 A B C D
# A 1 17 17 16
# B 2 19 19 17
# C 3 21 21 18
The rownames will just be cycling through LETTERS
We could use melt/acast from reshape2 after cbinding both the 'a' and 'b' matrices (inspired by @thelatemail's post).
library(reshape2)
acast(melt(cbind(a,b)), Var1~Var2, value.var='value', sum)
# A B C D
#1 1 17 17 16
#2 2 19 19 17
#3 3 21 21 18
Or we find the column names that are common to both matrices by using intersect, and the column names that are found in one matrix but not in the other with setdiff. We subset both matrices with the common names and add them together, then cbind that sum with the columns that occur only in 'a' or only in 'b', based on the setdiff output.
# Columns present in both matrices (these are summed) ...
nm1 <- intersect(colnames(a), colnames(b))
# ... and columns present in only one of them (these are copied as they are).
nm2 <- setdiff(colnames(a), colnames(b))
nm3 <- setdiff(colnames(b), colnames(a))
# drop = FALSE on every subset keeps each piece a matrix with its column
# names, even when a name set holds a single column; without it a lone shared
# column would collapse to a vector and lose its name in the result.
cbind(a[, nm2, drop = FALSE],
      a[, nm1, drop = FALSE] + b[, nm1, drop = FALSE],
      b[, nm3, drop = FALSE])
# A B C D
#[1,] 1 17 17 16
#[2,] 2 19 19 17
#[3,] 3 21 21 18
Another option would be create another matrix with all the unique columns in 'a' and 'b', and then replace the values in that
nm <- union(colnames(a), colnames(b))
m1 <- matrix(0, ncol=length(nm), nrow=nrow(a), dimnames=list(NULL, nm))
m1[,colnames(a)] <- a
m1[,colnames(b)] <- m1[,colnames(b)] +b
m1
# A B C D
#[1,] 1 17 17 16
#[2,] 2 19 19 17
#[3,] 3 21 21 18
We could also cbind both the matrices and use tapply to get the sum after grouping with column and row indices
m2 <- cbind(a, b)
t(tapply(m2,list(colnames(m2)[col(m2)], row(m2)), FUN=sum))
Or we loop through 'nm' and get the sum
sapply(nm, function(i) rowSums(m2[,colnames(m2) ==i, drop=FALSE]))

get pairwise sums of multiple columns in dataframe

I have a dataframe that looks like this:
x<-data.frame(a=6, b=5:1, c=7, d=10:6)
> x
a b c d
1 6 5 7 10
2 6 4 7 9
3 6 3 7 8
4 6 2 7 7
5 6 1 7 6
I am trying to get the sums of columns a & b and c&d in another data frame that should look like:
> new
ab cd
1 11 17
2 10 16
3 9 15
4 8 14
5 7 13
I've tried the rowSums() function but it returns the sum of ALL the columns per row, and I tried rowSums(x[c(1,2), c(3,4)]) but nothing works. Please help!!
You can use rowSums on a column subset.
As a data frame:
data.frame(ab = rowSums(x[c("a", "b")]), cd = rowSums(x[c("c", "d")]))
# ab cd
# 1 11 17
# 2 10 16
# 3 9 15
# 4 8 14
# 5 7 13
As a matrix:
cbind(ab = rowSums(x[1:2]), cd = rowSums(x[3:4]))
For a wider data frame, you can also use sapply over a list of column subsets.
sapply(list(1:2, 3:4), function(y) rowSums(x[y]))
For all pairwise column combinations:
y <- combn(ncol(x), 2L, function(y) rowSums(x[y]))
colnames(y) <- combn(names(x), 2L, paste, collapse = "")
y
# ab ac ad bc bd cd
# [1,] 11 13 16 12 15 17
# [2,] 10 13 15 11 13 16
# [3,] 9 13 14 10 11 15
# [4,] 8 13 13 9 9 14
# [5,] 7 13 12 8 7 13
Here's another option:
> sapply(split.default(x, 0:(length(x)-1) %/% 2), rowSums)
0 1
[1,] 11 17
[2,] 10 16
[3,] 9 15
[4,] 8 14
[5,] 7 13
The 0:(length(x)-1) %/% 2 step creates a sequence of groups of 2 that can be used with split. It will also handle odd numbers of columns (treating the final column as a group of its own). Since there's a different default split "method" for data.frames that splits by rows, you need to specify split.default to split the columns into groups.

Resources