I have a list of chromosomes chromosomes <- c(1:2, "X", "Y") that I am iterating over to generate random data n times for each chromosome.
I am doing this first by iterating over the chromosomes and generating the data using generateData() and then adding these to a list which I then combine into a data frame outside of the loop using bp_data <- as.data.frame(do.call(rbind, simByChrom)):
chromosomes <- c(1:2, "X", "Y")
simByChrom <- list()
for (c in chromosomes){
n <- sample(1:5,1)
cat(paste("Simulating", n, "breakpoints on chromosome", c), "\n")
bp_data <- generateData(c, n)
simByChrom[[c]] <- bp_data
}
bp_data <- as.data.frame(do.call(rbind, simByChrom))
rownames(bp_data) <- NULL
# generate dummy data
generateData <- function(c, n){
df <- data.frame(chrom = rep(c, n),
pos= sample(1:10000, n))
return(df)
}
chrom pos
1 1 7545
2 2 5798
3 2 3863
4 3 4036
5 3 9347
6 3 4749
I would like to iterate over this multiple times and record the iteration number in bp_data$iteration, to produce a data frame that looks like this:
chrom pos iteration
1 7215 1
1 4606 1
2 8282 1
2 3501 1
2 4350 1
2 6044 1
X 2467 1
Y 2816 1
Y 8848 1
Y 2304 1
Y 4235 1
1 3760 2
1 8205 2
1 4735 2
2 3061 2
X 56 2
X 1722 2
X 2430 2
X 6749 2
X 2081 2
Y 9646 2
However, I'm unsure how to do this. I've tried:
iterations <- 2
for (i in (1:iterations)){
cat("Running iteration", i, "\n")
simByChrom <- list()
for (c in chromosomes){
n <- sample(1:5,1)
cat(paste("Simulating", n, "breakpoints on chromosome", c), "\n")
bp_data <- generateData(c, n)
bp_data$iteration <- i
simByChrom[[c]] <- bp_data
# or
# simByChrom[[c]][[i]] <- bp_data
# or
# simByChrom[[c]] <- bp_data
# simByChrom[[c]]$iteration <- i
}
bp_data <- as.data.frame(do.call(rbind, simByChrom))
rownames(bp_data) <- NULL
}
But this results in only the last iteration being recorded.
Can anyone suggest how I can achieve my desired result?
The reason you are only seeing the last iteration in your result is because bp_data is being over-written each time through the for loop. You need to make sure you save each iteration result separately and then combine them together at the end.
I believe just a few minor adjustments to what you already have will do the trick:
iterations <- 2
#create empty list to store each iteration result
bp_data <- list()
#run each iteration
for (i in 1:iterations){
cat("Running iteration", i, "\n")
simByChrom <- list()
for (c in chromosomes){
n <- sample(1:5,1)
cat(paste("Simulating", n, "breakpoints on chromosome", c), "\n")
aa <- generateData(c, n)
aa$iteration <- i
simByChrom[[c]] <- aa
}
result <- as.data.frame(do.call(rbind, simByChrom))
rownames(result) <- NULL
bp_data[[i]] <- result
}
#combine each iteration into one data frame
final <- as.data.frame(do.call(rbind, bp_data))
Related
I need to run this function like 6000 times with all of its iterations. I have 6 arguments in total for the function. The first 3 of them go hand in hand and number 75. The next argument has 9 values. And the last 2 arguments have 3 values.
#require dplyr
#data is history as list
matchloop <- function(data, data2, x, a, b, c) {
#history as list
split <- data
#history for reference
fh <- FullHistory
#start counter
n<-1
#end counter
m<-a
tempdf0.3 <- fh
#set condition for loop
while(nrow(tempdf0.3) > 1 && m <= (nrow(data2))*b) {
#put history into a variable
tempdf0.0 <- split
#put fh into a variable
tempdf0.5 <- fh
#put test path into variable from row n to m
tempdf0.1 <- as.data.frame(data2[n:m,], stringsAsFactors = FALSE)
#change column name of test path
colnames(tempdf0.1) <- "directions"
#put row n to m of history into variable
tempdf0.2 <- lapply(tempdf0.0, function(df) df[n:m,])
#put output into output
tempdf0.3 <- orderedDistancespos(tempdf0.2, tempdf0.1,
"allPaths","directions")
#add to output routeID based on reference from fh-the test path ID
tempdf0.3 <- mutate(tempdf0.3, routeID = (subset(tempdf0.5, routeID
!= x)$routeID))
#reduce output based on the matched threshold
tempdf0.3 <- subset(tempdf0.3, dists >= a*c)
#create new history based on the IDs remaining in output
split <- split[as.character(tempdf0.3$routeID)]
#create new history for reference based on the IDs remaining in
output
fh <- subset(fh, routeID %in% tempdf0.3$routeID)
#increase loop counter
n <- n+a
#increase loop counter
m <- n+(a-1)
}
#show output
mylist <- list(tempdf0.3, nrow(tempdf0.3))
return(mylist)
}
I tried putting the 3 arguments with 75 elements in them to their own lists and use mapply. This works. But even at this level I still have to run the code 81 times to cover all the variables because as far as I understand mapply recycles based on the length of the longest argument.
mapply(matchloop, mylist2,mylist3,mylist4, MoreArgs = list(a=a, b=b, c=c))
data is a list of dataframes
data2 is a dataframe
x, a, b, c are all numerical.
Right now I'm trying to streamline my output so that its in just 1 line. So if possible I would like 1 single csv output with all 6000+ lines.
You can combine mapply and apply function to cycle through all possible combination of a, b and c variables. To create all possible combinations you can use expand.grid. Finally you can contatenate list of rows into a data.frame with the help of do.call and rbind functions as follows:
matchloop_stub <- matchloop <- function(data, data2, x, a, b, c) {
# stub
c(d = sum(data), d2 = sum(data2), x = sum(x), a = a, b = b, c = c, r = a + b + c)
}
set.seed(123)
mylist2 <- replicate(75, data.frame(rnorm(1)))
mylist3 <- replicate(75, data.frame(rnorm(2)))
mylist4 <- replicate(75, data.frame(rnorm(3)))
a <- 1:9
b <- 1:3
c <- 1:3
abc <- expand.grid(a, b, c)
names(abc) <- c("a", "b", "c")
xs <- apply(abc, 1, function(x) (mapply(matchloop_stub, mylist2, mylist3, mylist4, x[1], x[2], x[3], SIMPLIFY = FALSE)))
df <- do.call(rbind, do.call(rbind, xs))
write.csv(df, file = "temp.csv")
res <- read.csv("temp.csv")
nrow(res)
# [1] 6075
head(res)
# X d d2 x a b c r
# 1 1 -0.5604756 0.7407984 -1.362065 1 1 1 3
# 2 2 -0.5604756 0.7407984 -1.362065 2 1 1 4
# 3 3 -0.5604756 0.7407984 -1.362065 3 1 1 5
# 4 4 -0.5604756 0.7407984 -1.362065 4 1 1 6
# 5 5 -0.5604756 0.7407984 -1.362065 5 1 1 7
# 6 6 -0.5604756 0.7407984 -1.362065 6 1 1 8
some other languages have this:
i=1
x&i=3
Then you will get a variable x1=3
How to realize this in R?
please don't use assign(paste0('x',1),3).
Because I want to iterate i, for example:
x1=c()
for(i in 1:100){
x1=c(x1,2*i)}
And I want x1,x2...xn. assign(paste) only generates variables once and doesn't have adding functions.
So the grammar x&i is the core problem.
Thanks for help.
Try this:
e <- .GlobalEnv
i <- 1
xi.name <- paste0("x", i)
# assign
e[[xi.name]] <- 3
# add
e[[xi.name]] <- e[[xi.name]] + 1
# display
e[[xi.name]]
## [1] 4
or using assign and get the above could be done like this:
i <- 1
xi.name <- paste0("x", i)
# assign
assign(xi.name, 3)
# add
assign(xi.name, get(xi.name) + 1)
# display
get(xi.name)
## [1] 4
Note that normally one does not generate dynamic variables but rather puts them into a list.
L <- list()
i <- 1
xi.name <- paste0("x", i)
# assign
L[[xi.name]] <- 3
# add
L[[xi.name]] <- L[[xi.name]] + 1
# display
L[[xi.name]]
## [1] 4
or simply:
L <- list()
i <- 1
# assign
L[[i]] <- 3
# add
L[[i]] <- L[[i]] + 1
# display
L[[i]]
## [1] 4
Note
e <- .GlobalEnv
i <- 1
xi.name <- paste0("x", i)
x1 <- 3
e[[xi.name]] <- c(e[[xi.name]], 99)
x1
## [1] 3 99
e <- .GlobalEnv
i <- 1
xi.name <- paste0("x", i)
x1 <- 3
assign(xi.name, c(get(xi.name), 99))
x1
## [1] 3 99
Given a vector v of F non-negative integers, I want to create, one by one, all possible sets of K vectors with size F whose sum is v. I call C the matrix of these K vectors; the row sum of C gives v.
For instance, the vector (1,2) of size F=2, if we set K=2, can be decomposed in:
# all sets of K vectors such that their sum is (1,2)
C_1 = 1,0 C_2 = 1,0 C_3 = 1,0 C_4 = 0,1 C_5 = 0,1 C_6 = 0,1
2,0 1,1 0,2 2,0 1,1 0,2
The goal is to apply some function to each possible C. Currently, I use this code, where I pre-compute all possible C and then go through them.
library(partitions)
K <- 3
F <- 5
v <- 1:F
partitions <- list()
for(f in 1:F){
partitions[[f]] <- compositions(n=v[f],m=K)
}
# Each v[f] has multiple partitions. Now we create an index to consider
# all possible combinations of partitions for the whole vector v.
npartitions <- sapply(partitions, ncol)
indices <- lapply(npartitions, function(x) 1:x)
grid <- as.matrix(do.call(expand.grid, indices)) # breaks if too big
for(n in 1:nrow(grid)){
selected <- c(grid[n,])
C <- t(sapply(1:F, function(f) partitions[[f]][,selected[f]]))
# Do something with C
#...
print(C)
}
However, when the dimensions are too big, of F, K are large, then the number of combinations explodes and expand.grid can't deal with that.
I know that, for a given position v[f], I can create a partition at a time
partition <- firstcomposition(n=v[f],m=K)
nextcomposition(partition, v[f],m=K)
But how can I use this to generate all possible C as in the above code?
npartitions <- ......
indices <- lapply(npartitions, function(x) 1:x)
grid <- as.matrix(do.call(expand.grid, indices))
You can avoid the generation of grid and successively generate its rows thanks to a Cantor expansion.
Here is the function returning the Cantor expansion of the integer n:
aryExpansion <- function(n, sizes){
l <- c(1, cumprod(sizes))
nmax <- tail(l,1)-1
if(n > nmax){
stop(sprintf("n cannot exceed %d", nmax))
}
epsilon <- numeric(length(sizes))
while(n>0){
k <- which.min(l<=n)
e <- floor(n/l[k-1])
epsilon[k-1] <- e
n <- n-e*l[k-1]
}
return(epsilon)
}
For example:
expand.grid(1:2, 1:3)
## Var1 Var2
## 1 1 1
## 2 2 1
## 3 1 2
## 4 2 2
## 5 1 3
## 6 2 3
aryExpansion(0, sizes = c(2,3)) + 1
## [1] 1 1
aryExpansion(1, sizes = c(2,3)) + 1
## [1] 2 1
aryExpansion(2, sizes = c(2,3)) + 1
## [1] 1 2
aryExpansion(3, sizes = c(2,3)) + 1
## [1] 2 2
aryExpansion(4, sizes = c(2,3)) + 1
## [1] 1 3
aryExpansion(5, sizes = c(2,3)) + 1
## [1] 2 3
So, instead of generating grid:
npartitions <- ......
indices <- lapply(npartitions, function(x) 1:x)
grid <- as.matrix(do.call(expand.grid, indices))
for(n in 1:nrow(grid)){
selected <- grid[n,]
......
}
you can do:
npartitions <- ......
for(n in seq_len(prod(npartitions))){
selected <- 1 + aryExpansion(n-1, sizes = npartitions)
......
}
I have two data frames of same number of columns (but not rows) df1 and df2. For each row in df2, I was able to find the best (and second best) matching rows from df1 in terms of hamming distance, in my previous post. In that post, we have been using the following example data:
set.seed(0)
df1 <- as.data.frame(matrix(sample(1:10), ncol = 2)) ## 5 rows 2 cols
df2 <- as.data.frame(matrix(sample(1:6), ncol = 2)) ## 3 rows 2 cols
I now need to compute the number of bits equal to 1 for:
each row in df2
the best matching rows in df1
the second matching rows in df1
The number of bits equal to 1 of an integer a maybe computed as
sum(as.integer(intToBits(a)))
And I have applied this to #ZheyuanLi's original function, so I have got item 1>. However I'm unable to apply the same logic to get item 2> and 3>, by simple modification of #ZheyuanLi's function.
Below are the functions from #ZheyuanLi's with modification:
hmd <- function(x,y) {
rawx <- intToBits(x)
rawy <- intToBits(y)
nx <- length(rawx)
ny <- length(rawy)
if (nx == ny) {
## quick return
return (sum(as.logical(xor(rawx,rawy))))
} else if (nx < ny) {
## pivoting
tmp <- rawx; rawx <- rawy; rawy <- tmp
tmp <- nx; nx <- ny; ny <- tmp
}
if (nx %% ny) stop("unconformable length!") else {
nc <- nx / ny ## number of cycles
return(unname(tapply(as.logical(xor(rawx,rawy)), rep(1:nc, each=ny), sum)))
}
}
foo <- function(df1, df2, p = 2) {
## check p
if (p > nrow(df2)) p <- nrow(df2)
## transpose for CPU cache friendly code
xt <- t(as.matrix(df1))
yt <- t(as.matrix(df2))
## after transpose, we compute hamming distance column by column
## a for loop is decent; no performance gain from apply family
n <- ncol(yt)
id <- integer(n * p)
d <- numeric(n * p)
sb <- integer(n)
k <- 1:p
for (i in 1:n) {
set.bits <- sum(as.integer(intToBits(yt[,i])))
distance <- hmd(xt, yt[,i])
minp <- order(distance)[1:p]
id[k] <- minp
d[k] <- distance[minp]
sb[i] <- set.bits
k <- k + p
}
## recode "id", "d" and "sb" into data frame and return
id <- as.data.frame(matrix(id, ncol = p, byrow = TRUE))
colnames(id) <- paste0("min.", 1:p)
d <- as.data.frame(matrix(d, ncol = p, byrow = TRUE))
colnames(d) <- paste0("mindist.", 1:p)
sb <- as.data.frame(matrix(sb, ncol = 1)) ## no need for byrow as you have only 1 column
colnames(sb) <- "set.bits.1"
list(id = id, d = d, sb = sb)
}
Running these gives:
> foo(df1, df2)
$id
min1 min2 ## row id for best/second best match in df1
1 1 4
2 2 3
3 5 2
$d
mindist.1 mindist.2 ## minimum 2 hamming distance
1 2 2
2 1 3
3 1 3
$sb
set.bits.1 ## number of bits equal to 1 for each row of df2
1 3
2 2
3 4
OK, after reading through while re-editing your question (many times!), I think I know what you want. Essentially we need change nothing to hmd(). Your required items 1>, 2>, 3> can all be computed after the for loop in foo().
To get item 1>, which you called sb, we can use a tapply(). However, your computation of sb along the for loop is fine, so I will not change it. In the following, I will demonstrate the basic procedure to get item 2> and item 3>.
The id vector inside foo() stores all matching rows in df1:
id <- c(1, 4, 2, 3, 5, 2)
so we can simply extract those rows of df1 (actually, columns of xt), to compute the number of bits equal to 1. As you can see, there are lots of duplicity in id, so we can only computes on unique(id):
id0 <- sort(unique(id))
## [1] 1 2 3 4 5
We now extract those subset columns of xt:
sub_xt <- xt[, id0]
## [,1] [,2] [,3] [,4] [,5]
## V1 9 3 10 5 6
## V2 2 4 8 7 1
To compute the number of bits equal to 1 for each column of sub_xt, we again use tapply() and vectorized approach.
rawbits <- as.integer(intToBits(as.numeric(sub_xt))) ## convert sub_xt to binary
sbxt0 <- unname(tapply(X = rawbits,
INDEX = rep(1:length(id0), each = length(rawbits) / length(id0)),
FUN = sum))
## [1] 3 3 3 5 3
Now we need to map sbxt0 to sbxt:
sbxt <- sbxt0[match(id, id0)]
## [1] 3 5 3 3 3 3
Then we can convert sbxt to a data frame sb1:
sb1 <- as.data.frame(matrix(sbxt, ncol = p, byrow = TRUE))
colnames(sb1) <- paste(paste0("min.", 1:p), "set.bits.1", sep = ".")
## min.1.set.bits.1 min.2.set.bits.1
## 1 3 5
## 2 3 3
## 3 3 3
Finally we can assemble these things up:
foo <- function(df1, df2, p = 2) {
## check p
if (p > nrow(df2)) p <- nrow(df2)
## transpose for CPU cache friendly code
xt <- t(as.matrix(df1))
yt <- t(as.matrix(df2))
## after transpose, we compute hamming distance column by column
## a for loop is decent; no performance gain from apply family
n <- ncol(yt)
id <- integer(n * p)
d <- numeric(n * p)
sb2 <- integer(n)
k <- 1:p
for (i in 1:n) {
set.bits <- sum(as.integer(intToBits(yt[,i])))
distance <- hmd(xt, yt[,i])
minp <- order(distance)[1:p]
id[k] <- minp
d[k] <- distance[minp]
sb2[i] <- set.bits
k <- k + p
}
## compute "sb1"
id0 <- sort(unique(id))
sub_xt <- xt[, id0]
rawbits <- as.integer(intToBits(as.numeric(sub_xt))) ## convert sub_xt to binary
sbxt0 <- unname(tapply(X = rawbits,
INDEX = rep(1:length(id0), each = length(rawbits) / length(id0)),
FUN = sum))
sbxt <- sbxt0[match(id, id0)]
sb1 <- as.data.frame(matrix(sbxt, ncol = p, byrow = TRUE))
colnames(sb1) <- paste(paste0("min.", 1:p), "set.bits.1", sep = ".")
## recode "id", "d" and "sb2" into data frame and return
id <- as.data.frame(matrix(id, ncol = p, byrow = TRUE))
colnames(id) <- paste0("min.", 1:p)
d <- as.data.frame(matrix(d, ncol = p, byrow = TRUE))
colnames(d) <- paste0("mindist.", 1:p)
sb2 <- as.data.frame(matrix(sb2, ncol = 1)) ## no need for byrow as you have only 1 column
colnames(sb2) <- "set.bits.1"
list(id = id, d = d, sb1 = sb1, sb2 = sb2)
}
Now, running foo(df1, df2) gives:
> foo(df1,df2)
$id
min.1 min.2
1 1 4
2 2 3
3 5 2
$d
mindist.1 mindist.2
1 2 2
2 1 3
3 1 3
$sb1
min.1.set.bits.1 min.2.set.bits.1
1 3 5
2 3 3
3 3 3
$sb2
set.bits.1
1 3
2 2
3 4
Note that I have renamed the sb you used to sb2.
N <- c(1,3,4,6)
a <- c(3,4,5,6)
b <- c(4,5,6,7)
w <- c(5,6,7,6)
dat1 <- data.frame(N,May = a, April = b,June = w)
N May April June
1 1 3 4 5
2 3 4 5 6
3 4 5 6 7
4 6 6 7 6
I need a data frame, where each value is sd of N value and row value
sd(c(1,3) sd(c(1,4) sd(c(1,5) # for 1st row
sd(c(3,4) sd(c(3,5) sd(c(3,6) # for second and so on.
Try this:
The data:
Norm <- c(1,3,4,6)
a <- c(3,4,5,6)
b <- c(4,5,6,7)
w <- c(5,6,7,6)
mydata <- data.frame(Norm=Norm,May = a, April = b,June = w)
Solution:
finaldata <- do.call('cbind',lapply(names(mydata)[2:4], function(x) apply(mydata[c("Norm",x)],1,sd)))
I hope it helps.
Piece of advice:
Please refrain from using names like data and norm for your variable names. They can easily conflict with things that are native to R. For example norm is a function in R, and so is data.
I think I got it
x=matrix(data=NA, nrow=4, ncol=3)
for(j in 1:3){
for(i in 1:4){
x[i, j] <- sd(data[i, c(i,(j+1))])
x
}
}