Condense a matrix in R

I have loaded a table of integer data with 2,200 columns. What I'd like to do is condense the data down by averaging the values in every 5 columns and placing that in a new column in a new table.
For example, if I had:
Col1 | Col2 | Col3 | Col4 | Col5 | Col6 | Col7 | Col8 | Col9 | Col10
   2 |    4 |    6 |    8 |   10 |   12 |   14 |   16 |   18 |    20
I would get:
Col1 | Col2
   6 |   16
Which is just the average of the values in columns 1-5 from the original table in Col1 and the average of the values in columns 6-10 in Col2.
I haven't quite wrapped my head around R syntax, so any help would be appreciated.

Here's one approach that's applicable if the number of elements to be grouped is divisible by n (5, in your case):
x <- 1:100
n <- 5
tapply(x, rep(seq(1, length(x), n), each=n), mean)
# 1 6 11 16 21 26 31 36 41 46 51 56 61 66 71 76 81 86 91 96
# 3 8 13 18 23 28 33 38 43 48 53 58 63 68 73 78 83 88 93 98
The first row of output contains element names, and the second row contains means of successive groups of n elements.
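If the group names get in the way, one optional extra is to strip them with unname:
# optional: drop the group names from the tapply result
unname(tapply(x, rep(seq(1, length(x), n), each=n), mean))
# [1]  3  8 13 18 23 28 33 38 43 48 53 58 63 68 73 78 83 88 93 98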
To apply this to all rows of a matrix or data.frame, you can do, e.g.:
m <- matrix(1:1000, ncol=100)
apply(m, 1, function(x) tapply(x, rep(seq(1, length(x), n), each=n), mean))
EDIT
This alternative approach will give you some performance gains due to vectorisation with rowMeans:
t(mapply(function(x, y) rowMeans(m[, x:y]),
         seq(1, ncol(m), n), seq(n, ncol(m), n)))

Oops, I see this is @user20650's comment on @jbaums' answer. The rowsum function splits the rows of a matrix by a grouping factor and sums each column within each group. So for
m <- matrix(1:1000, ncol=100)
n <- 5
we have
rowsum(t(m), rep(seq_len(ncol(m) / n), each=n)) / n
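As a tiny, self-contained illustration of what rowsum does (toy data, not the question's matrix):
toy <- matrix(1:6, nrow = 3)            # 3 rows, 2 columns
rowsum(toy, group = c("a", "a", "b"))   # sums rows 1-2 and row 3, column by column
#   [,1] [,2]
# a    3    9
# b    3    6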
This is fast, if that's important
library(microbenchmark)
f0 = function(m, n) rowsum(t(m), rep(seq_len(ncol(m) / n), each=n)) / n
f1 = function(m, n)
  apply(m, 1, function(x) tapply(x, rep(seq(1, length(x), n), each=n), mean))
f2 = function(m, n)
  t(mapply(function(x, y) rowMeans(m[, x:y]),
           seq(1, ncol(m), n), seq(n, ncol(m), n)))
all.equal(f0(m, n), f1(m, n), check.attributes=FALSE)
## [1] TRUE
all.equal(f0(m, n), f2(m, n), check.attributes=FALSE)
## [1] TRUE
microbenchmark(f0(m, n), f1(m, n), f2(m, n))
## Unit: microseconds
##      expr      min        lq   median        uq      max neval
##  f0(m, n)  164.351  170.1675  176.730  187.8570  237.419   100
##  f1(m, n) 8060.639 8513.3035 8696.742 8908.5190 9771.019   100
##  f2(m, n)  540.894  588.3820  603.787  634.1615  732.209   100

Here's another approach using a loop and rowMeans, in case you prefer a loop here. It works for matrices but needs adjustment for vectors.
# example data
dat <- as.data.frame( matrix(1:20,ncol=10,byrow=TRUE) )
# pick range
range <- 5
ind <- seq(1,ncol(dat),range)
newdat <- NULL
for(i in ind){
  newcol <- rowMeans(dat[,i:(i+range-1)])
  newdat <- cbind(newdat, newcol)
}
Will result in:
> newdat
     newcol newcol
[1,]      3      8
[2,]     13     18
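To get column names like those in the question rather than the repeated newcol, one small optional addition:
colnames(newdat) <- paste0("Col", seq_along(ind))
newdat
#      Col1 Col2
# [1,]    3    8
# [2,]   13   18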

@jbaums' answer looks pretty good. Since I had already started this answer, I thought I would post my solution as well.
#Make some fake data
require(data.table)
data <- data.table(t(iris[,1:4]))
#Transpose since rows are easier to deal with than columns
data <- data.table(t(data))
data[ , row := .I]
#Sum by every 5 rows
data <- data[ , lapply(.SD,sum), by=cut(row,seq(0,nrow(data),5))]
#Transpose back to original results
result <- data.table(t(data))
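The question asks for averages rather than sums; here is a sketch of the same idea using mean, building the 150 x 4 table directly and using .SDcols to be explicit about which columns get aggregated:
library(data.table)
dt <- as.data.table(iris[, 1:4])   # same fake data, without the double transpose
dt[ , row := .I]
cols <- setdiff(names(dt), "row")  # aggregate only the measurement columns
dt[ , lapply(.SD, mean), by=cut(row, seq(0, nrow(dt), 5)), .SDcols=cols]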

If you wanted to get the means of the elements from col1-col5, col6-col10, etc.
m1 <- matrix(c(rep(1:100, 2), 1:20), ncol=22)
n <- 5
p1 <- prod(dim(m1))
n1 <- nrow(m1)*n
n2 <- p1-p1%%n1
c(rowMeans(matrix(m1[1:n2], nrow=p1%/%n1, byrow=TRUE)), mean(m1[(n2+1):p1]))
#[1] 25.5 75.5 25.5 75.5 10.5
Or
sapply(seq(1,ncol(m1), by=n), function(i) mean(m1[,i:(min(c(i+n-1), ncol(m1)))]) )
#[1] 25.5 75.5 25.5 75.5 10.5
With some labels
indx <- seq(1,n2/nrow(m1), by=n)
indx1 <- paste("Col",paste(indx, indx+4, sep="-"),sep="_")
indx2 <- paste("Col", paste(seq(p1%%n1+1, ncol(m1)),collapse="-"), sep="_")
c(rowMeans(matrix(m1[1:n2], nrow=p1%/%n1, byrow=TRUE, dimnames=list(indx1, NULL))), setNames(mean(m1[(n2+1):p1]), indx2))
# Col_1-5 Col_6-10 Col_11-15 Col_16-20 Col_21-22
# 25.5 75.5 25.5 75.5 10.5
Update
I realized that you wanted the rowMeans by splitting up columns 1:5, 6:10, 11:15 etc. If that is the case:
res1 <- cbind(colMeans(aperm(array(m1[1:n2], dim=c(nrow(m1), n, p1%/%n1)), c(2,1,3))),
              rowMeans(m1[,(ncol(m1)-ncol(m1)%%n+1):ncol(m1)]))
which is equal to manually splitting the columns:
res2 <- cbind(rowMeans(m1[,1:5]), rowMeans(m1[,6:10]), rowMeans(m1[,11:15]),
              rowMeans(m1[,16:20]), rowMeans(m1[,21:22]))
identical(res1,res2)
#[1] TRUE
colnames(res1) <- c(indx1,indx2)
res1
# Col_1-5 Col_6-10 Col_11-15 Col_16-20 Col_21-22
#[1,] 21 71 21 71 6
#[2,] 22 72 22 72 7
#[3,] 23 73 23 73 8
#[4,] 24 74 24 74 9
#[5,] 25 75 25 75 10
#[6,] 26 76 26 76 11
#[7,] 27 77 27 77 12
#[8,] 28 78 28 78 13
#[9,] 29 79 29 79 14
#[10,] 30 80 30 80 15

Related

Find pattern and filter the start location

I would like to find the location of a pattern AND filter the location.
I'm looking for a function to return the start location of the pattern "gaaa" between 30 and 34 for each row.
To explain, here is what I currently get with the function str_locate_all:
library(stringr)
Sequence <- data.frame(All = c("ggcgaagcagugcucccaguguuuuagagcuagaaauagcaaguuaaaauaaggcuaguccguuaucaacuugaaaaaguggcaccgagucggugcuu",
"aggacaacucgcuccacggccguuuuagagcuagaaauagcaaguuaaaauaaggcuaguccguuaucaacuugaaaaaguggcaccgagucggugcuu",
"cugaaauggcagcagaaacguuuuagagcuagaaauagcaaguuaaaauaaggcuaguccguuaucaacuugaaaaaguggcaccgagucggugcaacaaa",
"ggucaaagaggaggagcucguuuuagagcuagaaauagcaaguuaaaauaaggcuaguccguuaucaacuugaaaaaguggcaccgagucggugcuu"))
str_locate_all(pattern = 'gaaa', Sequence$All)
[[1]]
start end
[1,] 33 36
[2,] 73 76
[[2]]
start end
[1,] 34 37
[2,] 74 77
[[3]]
start end
[1,] 3 6
[2,] 15 18
[3,] 32 35
[4,] 72 75
[[4]]
start end
[1,] 32 35
[2,] 72 75
Here is what I would like to have as a result:
start
1 33
2 34
3 32
4 32
Thank you!
Here is a way. It uses the output of the str_locate_all instruction in the question and filters it in a lapply loop.
found <- str_locate_all(pattern = 'gaaa', Sequence$All)
found <- lapply(found, function(x){
  y <- x[, 'start']
  data.frame(start = y[y >= 30 & y <= 34])
})
do.call(rbind, found)
# start
#1 33
#2 34
#3 32
#4 32
Here is another way. It searches only a sub string of the original string.
first <- 30
last <- 34
tmp <- substr(Sequence$All, first, last + nchar('gaaa') - 1)
data.frame(start = str_locate(pattern = 'gaaa', tmp)[, 1] + first - 1)
Timings
Here are the timings of the 3 answers available so far: r2evans's, mine, and tmfmnk's.
I only post the results for the larger input, since that's where the timings matter.
library(stringr)
library(dplyr)
library(purrr)
r2evans <- function(){
  Sequence$start <-
    sapply(str_locate_all(pattern = 'gaaa', Sequence$All),
           function(z) { ind <- which(30 <= z[,1] & z[,1] <= 34); if (length(ind)) z[ind[1],1] else NA })
  Sequence[,2,drop=FALSE]
}
rui <- function(){
  first <- 30
  last <- 34
  tmp <- substr(Sequence$All, first, last + nchar('gaaa') - 1)
  data.frame(start = str_locate(pattern = 'gaaa', tmp)[, 1] + first - 1)
}
tmfmnk <- function(){
  map_dfr(.x = str_locate_all(pattern = "gaaa", Sequence$All),
          ~ as.data.frame(.x) %>%
            filter(start %in% c(30:34)),
          .id = "ID")
}
library(microbenchmark)
for(i in 1:8) Sequence <- rbind(Sequence, Sequence)
dim(Sequence)
#[1] 1024 1
mb <- microbenchmark(
  r2evans = r2evans(),
  rui = rui(),
  tmfmnk = tmfmnk()
)
print(mb, unit = 'relative', order = 'median')
#Unit: relative
#    expr        min         lq       mean     median         uq       max neval
#     rui    1.00000    1.00000    1.00000    1.00000    1.00000   1.00000   100
# r2evans   19.66135   17.52724   16.28008   15.47317   16.20747   5.60779   100
#  tmfmnk 1529.51644 1235.86285 1079.56958 1073.49131 1072.39265 317.95638   100
For reference, here is r2evans' approach on its own: keep the first match whose start falls between 30 and 34, or NA if there is none.
Sequence$start <-
  sapply(str_locate_all(pattern = 'gaaa', Sequence$All),
         function(z) { ind <- which(30 <= z[,1] & z[,1] <= 34); if (length(ind)) z[ind[1],1] else NA })
Sequence[,2,drop=FALSE]
# start
# 1 33
# 2 34
# 3 32
# 4 32
One dplyr and purrr solution could be:
map_dfr(.x = str_locate_all(pattern = "gaaa", Sequence$All),
        ~ as.data.frame(.x) %>%
          filter(start %in% c(30:34)),
        .id = "ID")
ID start end
1 1 33 36
2 2 34 37
3 3 32 35
4 4 32 35
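If only the start column is wanted, as in the desired output, one possible tweak of the same pipeline is to drop end with select (the ID column is kept here):
map_dfr(.x = str_locate_all(pattern = "gaaa", Sequence$All),
        ~ as.data.frame(.x) %>%
          filter(start %in% c(30:34)) %>%
          select(start),
        .id = "ID")
  ID start
1  1    33
2  2    34
3  3    32
4  4    32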

Identify sequences between identical values

I have a large matrix:
  id v1 v2 v3 v4 v5 v6 v7 v8
1001 37 15 30 37  4 11 35 37
2111 44 31 44 30 24 39 44 18
3121 43 49 39 34 44 43 26 24
4532 45 31 26 33 12 47 37 15
5234 23 27 34 23 30 34 23  4
6345  9 46 39 34  8 43 26 24
For each row (id), I would like to identify intervals of numbers in column v1 to v8. An interval is here defined as a sequence of numbers which starts and ends with the same number.
For example, in the first row, there are two sequences which both start and end with 37: from column 1 to column 4 (37, 15, 30, 37) and from column 4 to column 8 (37, 4, 11, 35, 37).
The focal value should only occur in start and end positions. For example, in the first row, the sequence from 37 at V1, to 37 at V8 is not included, because 37 also occurs in V4.
For each interval, I want the index of the start and end columns, the focal start and end value, and the sequence of numbers in between.
Desired output:
1001 [v1] to [v4] 37 to 37: 15,30
1001 [v4] to [v8] 37 to 37: 4, 11, 35
2111 [v1] to [v3] 44 to 44: 31
2111 [v3] to [v7] 44 to 44: 30, 24, 39
Any suggestions? Algorithm?
I managed to compute the indices for a vector, but not for a matrix:
a <- which(x == 37)
from <- a[!(a-1) %in% a]
to <- a[!(a+1) %in% a]
rbind(from, to)
A very brute-force method: get the unique elements of each row, keep those that occur more than once but not only side-by-side, then lapply through each, extracting the elements of the row x between successive occurrences.
apply(m, 1, function(x) {
  u <- unique(x)
  u <- u[sapply(u, function(u) any(diff(which(x == u)) > 1))]
  lapply(setNames(u, u), function(u){
    ind <- which(x == u)
    lapply(seq(length(ind) - 1),
           function(i) x[seq(ind[i] + 1, ind[i + 1] - 1)])
  })
})
Output:
# [[1]]
# [[1]]$`37`
# [[1]]$`37`[[1]]
# [1] 15 30
#
# [[1]]$`37`[[2]]
# [1] 4 11 35
#
#
#
# [[2]]
# [[2]]$`44`
# [[2]]$`44`[[1]]
# [1] 31
#
# [[2]]$`44`[[2]]
# [1] 30 24 39
#
#
#
# [[3]]
# [[3]]$`43`
# [[3]]$`43`[[1]]
# [1] 49 39 34 44
#
#
#
# [[4]]
# named list()
#
# [[5]]
# [[5]]$`23`
# [[5]]$`23`[[1]]
# [1] 27 34
#
# [[5]]$`23`[[2]]
# [1] 30 34
#
#
# [[5]]$`34`
# [[5]]$`34`[[1]]
# [1] 23 30
#
#
#
# [[6]]
# named list()
Edit: Henrik's answer inspired me to do a join-based version
library(data.table)
library(magrittr)
d <- melt(as.data.table(m), "id", variable.name = 'ci')[, ci := rowid(id)]
setorder(d, id)
options(datatable.nomatch = 0)
d[d, on = .(id, value, ci > ci),
  .(id, value, i.ci, x.ci),
  mult = 'first'] %>%
  .[d, on = .(id, i.ci < ci, x.ci > ci),
    .(id, value, from_ci = x.i.ci, to_ci = x.x.ci, i.value)] %>%
  .[, .(val = .(i.value)),
    by = setdiff(names(.), 'i.value')]
# id value from_ci to_ci val
# 1: 1001 37 1 4 15,30
# 2: 1001 37 4 8 4,11,35
# 3: 2111 44 1 3 31
# 4: 2111 44 3 7 30,24,39
# 5: 3121 43 1 6 49,39,34,44
# 6: 5234 23 1 4 27,34
# 7: 5234 34 3 6 23,30
# 8: 5234 23 4 7 30,34
Here's a data.table alternative.
Convert matrix to data.table and melt to long format. Create a column index 'ci' to keep track of the original columns (rowid(id)). Order by 'id'.
For each 'id' and 'value' (by = .(id, value)), check if number of rows is larger than one (if(.N > 1)), i.e. if there is at least one sequence. If so, grab the row index (.I) of the sequences and their column indexes (in the original data). For each sequence, grab the corresponding values between start and end index. Wrap in list twice (.(.() to create a list column.
library(data.table)
d <- melt(as.data.table(m), id.vars = "id")
d[ , `:=`(
  ci = rowid(id),
  variable = NULL)]
setorder(d, id)
d2 <- d[ , if(.N > 1){
  .(from = .I[-.N], to = .I[-1],
    from_ci = ci[-.N], to_ci = ci[-1])
}, by = .(id, value)]
d2[ , val := .(.(d$value[seq(from + 1, to - 1)])), by = 1:nrow(d2)]
d2[ , `:=`(from = NULL, to = NULL)]
# id value from_ci to_ci val
# 1: 1001 37 1 4 15,30
# 2: 1001 37 4 8 4,11,35
# 3: 2111 44 1 3 31
# 4: 2111 44 3 7 30,24,39
# 5: 3121 43 1 6 49,39,34,44
# 6: 5234 23 1 4 27,34
# 7: 5234 23 4 7 30,34
# 8: 5234 34 3 6 23,30

Add number to vector repeatedly and duplicate vector

I have two values, 3 and 12, and I make a vector:
num1 <- 3
num2 <- 12
a <- c(num1, num2)
I want to repeatedly add the number 12 to vector "a" and append the results, making a new vector like this:
3,12, 15,24, 27,36, 39,48 ....
The repeat count "n" is 6.
I don't have any idea how to do this.
Here are two methods in base R.
With outer, you could do:
c(outer(c(3, 12), (12 * 0:4), "+"))
[1] 3 12 15 24 27 36 39 48 51 60
or with sapply, you can explicitly loop through and calculate the pairs of sums.
c(sapply(0:4, function(i) c(3, 12) + (12 * i)))
[1] 3 12 15 24 27 36 39 48 51 60
outer returns a matrix in which every pair of elements from the two vectors has been added together; c is used to flatten it to a vector. sapply loops through 0:4 and calculates the element-wise sums; it also returns a matrix in this instance, so c is again used to flatten the result.
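To see why c gives the interleaved order, this is the intermediate matrix produced by the outer call above (columns correspond to the shifts 0, 12, 24, 36, 48):
outer(c(3, 12), 12 * 0:4, "+")
#      [,1] [,2] [,3] [,4] [,5]
# [1,]    3   15   27   39   51
# [2,]   12   24   36   48   60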
Here is a somewhat generic function that takes as input your original vector a, the number to add (12), and n:
f1 <- function(vec, x, n){
  len1 <- length(vec)
  v1 <- sapply(seq(n/len1), function(i) x*i)
  v2 <- rep(v1, each = n/length(v1))
  v3 <- rep(vec, n/len1)
  return(c(vec, v3 + v2))
}
f1(a, 12, 6)
#[1] 3 12 15 24 27 36 39 48
f1(a, 11, 12)
#[1] 3 12 14 23 25 34 36 45 47 56 58 67 69 78
f1(a, 3, 2)
#[1] 3 12 6 15
EDIT
If by n=6 you mean repeating the whole vector 6 times, then:
f1 <- function(vec, x, n){
  len1 <- length(vec)
  v1 <- sapply(seq(n), function(i) x*i)
  v2 <- rep(v1, each = len1)
  v3 <- rep(vec, n)
  return(c(vec, v3 + v2))
}
f1(a, 12, 6)
#[1] 3 12 15 24 27 36 39 48 51 60 63 72 75 84
Using rep for repeating and cumsum for the addition:
n = 6
rep(a, n) + cumsum(rep(c(12, 0), n))
# [1] 15 24 27 36 39 48 51 60 63 72 75 84
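If the leading 3, 12 should also appear, as in the sequence shown in the question, one option is to prepend a:
c(a, rep(a, n) + cumsum(rep(c(12, 0), n)))
# [1]  3 12 15 24 27 36 39 48 51 60 63 72 75 84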

R - List of combinations with outer() and expand.grid()

I have a list of prime numbers which I multiply using outer() and upper.tri() to get a unique set of numbers.
primes <- c(2, 3, 5, 7, 11, 13, 17, 19, 23, 29)
m <- outer(primes, primes, "*")
unq <- m[which(upper.tri(m))]
> unq
6 10 15 14 21 35 22 33 55 77 26 39 65 91 143 34 51 85 119 187 221 38 57 95 133 209 247 323 46 69 115 161 253 299 391 437 58 87 145 203 319 377 493 551 667
Each of the original prime numbers represents a set of two numbers:
a2 <- c(1,1)
a3 <- c(1,2)
a5 <- c(2,2)
a7 <- c(1,3)
a11 <- c(1,4)
a13 <- c(2,3)
a17 <- c(2,4)
a19 <- c(3,3)
a23 <- c(3,4)
a29 <- c(4,4)
The combination of the two sets of two numbers produces 4 numbers
expand.grid(a2,a3)
  Var1 Var2
1    1    1
2    1    1
3    1    2
4    1    2
So what I would like to do is have a kind of a list of lists, with each prime number having all 4 possible combinations.
I tried something like this, but I am missing some fundamentals here:
outer(a ,a , "expand.grid")
So the result would look something like this for the first prime:
6 c(11, 11, 12, 12)
I'm not sure I understand correctly, but I hope this helps:
#function to `outer`
fun <- function(x, y)
{
  a1 <- get(paste0("a", x))
  a2 <- get(paste0("a", y))
  res <- apply(expand.grid(a1, a2), 1, paste, collapse = "")
  res2 <- paste(res, collapse = ";")
  return(res2)
}
#`outer` a vectorized `fun`
m2 <- outer(primes, primes, Vectorize(fun))
#select `upper.tri`
unq2 <- m2[upper.tri(m2)]
#combine to a list
myls <- lapply(as.list(unq2), function(x) as.numeric(unlist(strsplit(x, ";"))))
names(myls) <- unq
myls
#$`6`
#[1] 11 11 12 12
#$`10`
#[1] 12 12 12 12
#$`15`
#[1] 12 22 12 22
#$`14`
#[1] 11 11 13 13
#...
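Individual entries can then be looked up by their prime product, e.g.:
myls[["15"]]
#[1] 12 22 12 22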

Move cells to new column at each 48th row

I have a list with names in A1:A144 and I want to move A49:A96 to B1:B48 and A97:A144 to C1:C48.
So for each 48th row, I want the next 48 rows moved to a new column.
How to do that?
If you want to consider a VBA alternative then:
Sub MoveData()
    nF = 1
    nL = 48
    nSize = Cells(Rows.Count, "A").End(xlUp).Row
    nBlock = nSize / nL
    For k = 1 To nBlock
        nF = nF + 48
        nL = nL + 48
        Range("A" & nF & ":A" & nL).Copy Cells(1, k + 1)
        Range("A" & nF & ":A" & nL).ClearContents
    Next k
End Sub
Not sure how scalable this solution is, but it does work.
First let's pretend your names are x and you want the solution to be in new.df
number.shifts <- ceiling(length(x) / 48) # work out how many columns we need
# create an empty (NA) matrix with the dimensions we need: 48 rows, one column per block
new.df <- matrix(data = NA, nrow = 48, ncol = number.shifts)
# run a for-loop over x, moving to the next column after every 48th element
j <- 1
for (i in 1:length(x)){
  new.df[i - (j - 1) * 48, j] <- x[i]
  if (i %% 48 == 0) {j <- j + 1}
}
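A quick check with a made-up placeholder vector (the names here are invented, not the asker's data):
x <- paste0("name", 1:144)               # placeholder names
number.shifts <- ceiling(length(x) / 48)
new.df <- matrix(data = NA, nrow = 48, ncol = number.shifts)
j <- 1
for (i in 1:length(x)){
  new.df[i - (j - 1) * 48, j] <- x[i]
  if (i %% 48 == 0) {j <- j + 1}
}
dim(new.df)
# [1] 48  3
new.df[1, ]
# [1] "name1"  "name49" "name97"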
I think you have to elaborate on your question a little more. Do you have the data in R or in Excel and do you want the output to be in R or in Excel?
That being said, if x is your vector indicating clusters
x <- rep(1:3, each = 48)
and y is the variable containing names or whatever that you want to distribute over columns A:C (each having 48 rows),
y <- sample(letters, 3 * 48, replace = TRUE)
you can do this:
y.wide <- do.call(cbind, split(y, x))
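A quick check of the resulting shape, using the x and y defined above:
dim(y.wide)
# [1] 48  3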
Just as there is stack in R to create a very long representation of a group of columns, there is unstack to take a long column and make it into a wide form.
Here's a basic example:
mydf <- data.frame(A = 1:144)
mydf$groups <- paste0("A", gl(n=3, k=48)) ## One of many ways to create groups
mydf2 <- unstack(mydf)
head(mydf2)
# A1 A2 A3
# 1 1 49 97
# 2 2 50 98
# 3 3 51 99
# 4 4 52 100
# 5 5 53 101
# 6 6 54 102
tail(mydf2)
# A1 A2 A3
# 43 43 91 139
# 44 44 92 140
# 45 45 93 141
# 46 46 94 142
# 47 47 95 143
# 48 48 96 144
