N <- c(1,3,4,6)
a <- c(3,4,5,6)
b <- c(4,5,6,7)
w <- c(5,6,7,6)
dat1 <- data.frame(N,May = a, April = b,June = w)
N May April June
1 1 3 4 5
2 3 4 5 6
3 4 5 6 7
4 6 6 7 6
I need a data frame, where each value is sd of N value and row value
sd(c(1,3) sd(c(1,4) sd(c(1,5) # for 1st row
sd(c(3,4) sd(c(3,5) sd(c(3,6) # for second and so on.
Try this:
The data:
Norm <- c(1,3,4,6)
a <- c(3,4,5,6)
b <- c(4,5,6,7)
w <- c(5,6,7,6)
mydata <- data.frame(Norm=Norm,May = a, April = b,June = w)
Solution:
finaldata <- do.call('cbind',lapply(names(mydata)[2:4], function(x) apply(mydata[c("Norm",x)],1,sd)))
I hope it helps.
Piece of advice:
Please refrain from using names like data and norm for your variable names. They can easily conflict with things that are native to R. For example norm is a function in R, and so is data.
I think I got it
x=matrix(data=NA, nrow=4, ncol=3)
for(j in 1:3){
for(i in 1:4){
x[i, j] <- sd(data[i, c(i,(j+1))])
x
}
}
Related
I have two data frames of same number of columns (but not rows) df1 and df2. For each row in df2, I was able to find the best (and second best) matching rows from df1 in terms of hamming distance, in my previous post. In that post, we have been using the following example data:
set.seed(0)
df1 <- as.data.frame(matrix(sample(1:10), ncol = 2)) ## 5 rows 2 cols
df2 <- as.data.frame(matrix(sample(1:6), ncol = 2)) ## 3 rows 2 cols
I now need to compute the number of bits equal to 1 for:
each row in df2
the best matching rows in df1
the second matching rows in df1
The number of bits equal to 1 of an integer a maybe computed as
sum(as.integer(intToBits(a)))
And I have applied this to #ZheyuanLi's original function, so I have got item 1>. However I'm unable to apply the same logic to get item 2> and 3>, by simple modification of #ZheyuanLi's function.
Below are the functions from #ZheyuanLi's with modification:
hmd <- function(x,y) {
rawx <- intToBits(x)
rawy <- intToBits(y)
nx <- length(rawx)
ny <- length(rawy)
if (nx == ny) {
## quick return
return (sum(as.logical(xor(rawx,rawy))))
} else if (nx < ny) {
## pivoting
tmp <- rawx; rawx <- rawy; rawy <- tmp
tmp <- nx; nx <- ny; ny <- tmp
}
if (nx %% ny) stop("unconformable length!") else {
nc <- nx / ny ## number of cycles
return(unname(tapply(as.logical(xor(rawx,rawy)), rep(1:nc, each=ny), sum)))
}
}
foo <- function(df1, df2, p = 2) {
## check p
if (p > nrow(df2)) p <- nrow(df2)
## transpose for CPU cache friendly code
xt <- t(as.matrix(df1))
yt <- t(as.matrix(df2))
## after transpose, we compute hamming distance column by column
## a for loop is decent; no performance gain from apply family
n <- ncol(yt)
id <- integer(n * p)
d <- numeric(n * p)
sb <- integer(n)
k <- 1:p
for (i in 1:n) {
set.bits <- sum(as.integer(intToBits(yt[,i])))
distance <- hmd(xt, yt[,i])
minp <- order(distance)[1:p]
id[k] <- minp
d[k] <- distance[minp]
sb[i] <- set.bits
k <- k + p
}
## recode "id", "d" and "sb" into data frame and return
id <- as.data.frame(matrix(id, ncol = p, byrow = TRUE))
colnames(id) <- paste0("min.", 1:p)
d <- as.data.frame(matrix(d, ncol = p, byrow = TRUE))
colnames(d) <- paste0("mindist.", 1:p)
sb <- as.data.frame(matrix(sb, ncol = 1)) ## no need for byrow as you have only 1 column
colnames(sb) <- "set.bits.1"
list(id = id, d = d, sb = sb)
}
Running these gives:
> foo(df1, df2)
$id
min1 min2 ## row id for best/second best match in df1
1 1 4
2 2 3
3 5 2
$d
mindist.1 mindist.2 ## minimum 2 hamming distance
1 2 2
2 1 3
3 1 3
$sb
set.bits.1 ## number of bits equal to 1 for each row of df2
1 3
2 2
3 4
OK, after reading through while re-editing your question (many times!), I think I know what you want. Essentially we need change nothing to hmd(). Your required items 1>, 2>, 3> can all be computed after the for loop in foo().
To get item 1>, which you called sb, we can use a tapply(). However, your computation of sb along the for loop is fine, so I will not change it. In the following, I will demonstrate the basic procedure to get item 2> and item 3>.
The id vector inside foo() stores all matching rows in df1:
id <- c(1, 4, 2, 3, 5, 2)
so we can simply extract those rows of df1 (actually, columns of xt), to compute the number of bits equal to 1. As you can see, there are lots of duplicity in id, so we can only computes on unique(id):
id0 <- sort(unique(id))
## [1] 1 2 3 4 5
We now extract those subset columns of xt:
sub_xt <- xt[, id0]
## [,1] [,2] [,3] [,4] [,5]
## V1 9 3 10 5 6
## V2 2 4 8 7 1
To compute the number of bits equal to 1 for each column of sub_xt, we again use tapply() and vectorized approach.
rawbits <- as.integer(intToBits(as.numeric(sub_xt))) ## convert sub_xt to binary
sbxt0 <- unname(tapply(X = rawbits,
INDEX = rep(1:length(id0), each = length(rawbits) / length(id0)),
FUN = sum))
## [1] 3 3 3 5 3
Now we need to map sbxt0 to sbxt:
sbxt <- sbxt0[match(id, id0)]
## [1] 3 5 3 3 3 3
Then we can convert sbxt to a data frame sb1:
sb1 <- as.data.frame(matrix(sbxt, ncol = p, byrow = TRUE))
colnames(sb1) <- paste(paste0("min.", 1:p), "set.bits.1", sep = ".")
## min.1.set.bits.1 min.2.set.bits.1
## 1 3 5
## 2 3 3
## 3 3 3
Finally we can assemble these things up:
foo <- function(df1, df2, p = 2) {
## check p
if (p > nrow(df2)) p <- nrow(df2)
## transpose for CPU cache friendly code
xt <- t(as.matrix(df1))
yt <- t(as.matrix(df2))
## after transpose, we compute hamming distance column by column
## a for loop is decent; no performance gain from apply family
n <- ncol(yt)
id <- integer(n * p)
d <- numeric(n * p)
sb2 <- integer(n)
k <- 1:p
for (i in 1:n) {
set.bits <- sum(as.integer(intToBits(yt[,i])))
distance <- hmd(xt, yt[,i])
minp <- order(distance)[1:p]
id[k] <- minp
d[k] <- distance[minp]
sb2[i] <- set.bits
k <- k + p
}
## compute "sb1"
id0 <- sort(unique(id))
sub_xt <- xt[, id0]
rawbits <- as.integer(intToBits(as.numeric(sub_xt))) ## convert sub_xt to binary
sbxt0 <- unname(tapply(X = rawbits,
INDEX = rep(1:length(id0), each = length(rawbits) / length(id0)),
FUN = sum))
sbxt <- sbxt0[match(id, id0)]
sb1 <- as.data.frame(matrix(sbxt, ncol = p, byrow = TRUE))
colnames(sb1) <- paste(paste0("min.", 1:p), "set.bits.1", sep = ".")
## recode "id", "d" and "sb2" into data frame and return
id <- as.data.frame(matrix(id, ncol = p, byrow = TRUE))
colnames(id) <- paste0("min.", 1:p)
d <- as.data.frame(matrix(d, ncol = p, byrow = TRUE))
colnames(d) <- paste0("mindist.", 1:p)
sb2 <- as.data.frame(matrix(sb2, ncol = 1)) ## no need for byrow as you have only 1 column
colnames(sb2) <- "set.bits.1"
list(id = id, d = d, sb1 = sb1, sb2 = sb2)
}
Now, running foo(df1, df2) gives:
> foo(df1,df2)
$id
min.1 min.2
1 1 4
2 2 3
3 5 2
$d
mindist.1 mindist.2
1 2 2
2 1 3
3 1 3
$sb1
min.1.set.bits.1 min.2.set.bits.1
1 3 5
2 3 3
3 3 3
$sb2
set.bits.1
1 3
2 2
3 4
Note that I have renamed the sb you used to sb2.
I'm still getting to grips with R and have been set the task of specifically writing a function where if x and y are vectors:
x <- c(3,7,9)
y <- 20
...then all of x and multiples of x which are less than y need to be output in the form of a vector, e.g.:
v1 <- c(3,6,7,9,12,14,15,18)
But then within the function it needs to sum up all the numbers in the vector v1 - (3+6+...+15+18).
I've had a go at it but I can never really get my head around if else statements, so could anyone help me out and explain so I know for future reference?
No loops needed. Figure out how many times each x value goes into y, then generate a list of the unique numbers:
x <- c(3,7,9)
y <- 20
possible <- y %/% x
#[1] 6 2 2
out <- unique(sequence(possible) * rep(x,possible))
# or alternatively
# out <- unique(unlist(Map(function(a,b) sequence(a) * b, possible, x)))
out
#[1] 3 6 9 12 15 18 7 14
sum(out)
#[1] 84
Here's an example using basic loops and if else branching in R.
x <- c(3,7,9)
y1 <- 20
v1 <- numeric()
for(i in x){
nex <- i
counter <- 1
repeat{
if(!(nex %in% v1)){
v1 <- c(v1, nex)
}
counter <- counter + 1
nex <- i*counter
if(nex >= y1){
break
}
}
}
v1 <- sort(v1)
v1.sum <- sum(v1)
v1
## 3 6 7 9 12 14 15 18
v1.sum
## 84
Good morning,
I have the following problem.
My Data.frame "data" has the format:
Type amount
1 2
2 0
3 3
I would like to create a vector with the format:
1
1
3
3
3
This means I would like to transform my data.
I created a vector and wrote the following code for my transformation in R:
vector <- numeric(5)
for (i in 1:3){
k <- 1
while (k <= data[i,2]){
vector[k] <- data[i,1]
k <- k+1
}
}
The problem is, I get the following results and I have no Idea at which part I go wrong…
3
3
3
0
0
There might be many different ways in solving this particular problem in R but I am curious why my solution doesn't work. I am thankful for alternatives, but really would like to know what my mistake is.
Thank's for your help!
Try this solution:
df <- data.frame(type = c(1, 2, 3), amount = c(2, 0, 3))
result <- unlist(mapply(function(x, y) rep.int(x, y), df[, "type"], df[, "amount"]))
result
Output is following:
# [1] 1 1 3 3 3
Exaclty your code is buggy. Correct code should looks following:
df <- data.frame(type = c(1, 2, 3), amount = c(2, 0, 3))
vector <- numeric(5)
k <- 1
for (i in 1:3) {
j <- 1
while (j <= df[i, 2]) {
vector[k] <- df[i, 1]
k <- k + 1
j <- j + 1
}
}
vector
# [1] 1 1 3 3 3
Probably the fastest and most elegant way to obtain this result has been posted before in a comment by #akrun:
with(data, rep(Type, amount))
[1] 1 1 3 3 3
However, if you want to do this with for/while loops, it could be helpful to use a list for such cases, where the number of entries is not known at the beginning.
Here is an example with minimal modifications of your code:
my_list <- vector("list", 3)
for (i in 1:3) {
k <- 1
while (k <= data[i,2]){
my_list[[i]][k] <- data[i,1]
k <- k + 1
}
}
vector <- unlist(my_list)
#> vector
#[1] 1 1 3 3 3
The reason why your code didn't work was essentially that you were trying to put too much information into a single variable, k. It cannot serve as both, an index of your output vector, and as a counter for the individual entries in the first column of data; a counter which is reset to 1 each time the while loop has finished.
I would like to calculate name number for a set of given names.
Name number is calculated by summing the value assigned to each alphabet. The values are given below:
a=i=j=q=y=1
b=k=r=2
c=g=l=s=3
d=m=t=4
h=e=n=x=5
u=v=w=6
o=z=7
p=f=8
Example: Name number of David can be calculated as follows:
D+a+v+i+d
4+1+6+1+4
16=1+6=7
Name number of David is 7.
I would like to write a function in R for doing this.
I am thankful for any directions or tips or package suggestions that I should look into.
This code snippet will accomplish what you want:
# Name for which the number should be computed.
name <- "David"
# Prepare letter scores array. In this case, the score for each letter will be the array position of the string it occurs in.
val <- c("aijqy", "bkr", "cgls", "dmt", "henx", "uvw", "oz", "pf")
# Convert name to lowercase.
lName <- tolower(name)
# Compute the sum of letter scores.
s <- sum(sapply(unlist(strsplit(lName,"")), function(x) grep(x, val)))
# Compute the "number" for the sum of letter scores. This is a recursive operation, which can be shortened to taking the mod by 9, with a small correction in case the sum is 9.
n <- (s %% 9)
n <- ifelse(n==0, 9, n)
'n' is the result that you want for any 'name'
You will want to create a vector of values, in alphabetical order, then use match to get their indices. Something like this:
a <- i <- j <- q <- y <- 1
b <- k <- r <- 2
c <- g <- l <- s <- 3
d <- m <- t <- 4
h <- e <- n <- x <- 5
u <- v <- w <- 6
o <- z <- 7
p <- f <- 8
vals <- c(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z)
sum(vals[match(c("d","a","v","i","d"), letters)])
I'm sure there are several ways to do this, but here's an approach using a named vector:
x <- c(
"a"=1,"i"=1,"j"=1,"q"=1,"y"=1,
"b"=2,"k"=2,"r"=2,
"c"=3,"g"=3,"l"=3,"s"=3,
"d"=4,"m"=4,"t"=4,
"h"=5,"e"=5,"n"=5,"x"=5,
"u"=6,"v"=6,"w"=6,
"o"=7,"z"=7,
"p"=8,"f"=8)
##
name_val <- function(Name, mapping=x){
split <- tolower(unlist(strsplit(Name,"")))
total <-sum(mapping[split])
##
sum(as.numeric(unlist(strsplit(as.character(total),split=""))))
}
##
Names <- c("David","Betty","joe")
##
R> name_val("David")
[1] 7
R> sapply(Names,name_val)
David Betty joe
7 7 4
I would like to aggregate the rows of a matrix by adding the values in rows that have the same rowname. My current approach is as follows:
> M
a b c d
1 1 1 2 0
1 2 3 4 2
2 3 0 1 2
3 4 2 5 2
> index <- as.numeric(rownames(M))
> M <- cbind(M,index)
> Dfmat <- data.frame(M)
> Dfmat <- aggregate(. ~ index, data = Dfmat, sum)
> M <- as.matrix(Dfmat)
> rownames(M) <- M[,"index"]
> M <- subset(M, select= -index)
> M
a b c d
1 3 4 6 2
2 3 0 1 2
3 4 2 5 2
The problem of this appraoch is that i need to apply it to a number of very large matrices (up to 1.000 rows and 30.000 columns). In these cases the computation time is very high (Same problem when using ddply). Is there a more eficcient to come up with the solution? Does it help that the original input matrices are DocumentTermMatrix from the tm package? As far as I know they are stored in a sparse matrix format.
Here's a solution using by and colSums, but requires some fiddling due to the default output of by.
M <- matrix(1:9,3)
rownames(M) <- c(1,1,2)
t(sapply(by(M,rownames(M),colSums),identity))
V1 V2 V3
1 3 9 15
2 3 6 9
There is now an aggregate function in Matrix.utils. This can accomplish what you want with a single line of code and is about 10x faster than the combineByRow solution and 100x faster than the by solution:
N <- 10000
m <- matrix( runif(N*100), nrow=N)
rownames(m) <- sample(1:(N/2),N,replace=T)
> microbenchmark(a<-t(sapply(by(m,rownames(m),colSums),identity)),b<-combineByRow(m),c<-aggregate.Matrix(m,row.names(m)),times = 10)
Unit: milliseconds
expr min lq mean median uq max neval
a <- t(sapply(by(m, rownames(m), colSums), identity)) 6000.26552 6173.70391 6660.19820 6419.07778 7093.25002 7723.61642 10
b <- combineByRow(m) 634.96542 689.54724 759.87833 732.37424 866.22673 923.15491 10
c <- aggregate.Matrix(m, row.names(m)) 42.26674 44.60195 53.62292 48.59943 67.40071 70.40842 10
> identical(as.vector(a),as.vector(c))
[1] TRUE
EDIT: Frank is right, rowsum is somewhat faster than any of these solutions. You would want to consider using another one of these other functions only if you were using a Matrix, especially a sparse one, or if you were performing an aggregation besides sum.
The answer by James work as expected, but is quite slow for large matrices. Here is a version that avoids creating of new objects:
combineByRow <- function(m) {
m <- m[ order(rownames(m)), ]
## keep track of previous row name
prev <- rownames(m)[1]
i.start <- 1
i.end <- 1
## cache the rownames -- profiling shows that it takes
## forever to look at them
m.rownames <- rownames(m)
stopifnot(all(!is.na(m.rownames)))
## go through matrix in a loop, as we need to combine some unknown
## set of rows
for (i in 2:(1+nrow(m))) {
curr <- m.rownames[i]
## if we found a new row name (or are at the end of the matrix),
## combine all rows and mark invalid rows
if (prev != curr || is.na(curr)) {
if (i.start < i.end) {
m[i.start,] <- apply(m[i.start:i.end,], 2, max)
m.rownames[(1+i.start):i.end] <- NA
}
prev <- curr
i.start <- i
} else {
i.end <- i
}
}
m[ which(!is.na(m.rownames)),]
}
Testing it shows that is about 10x faster than the answer using by (2 vs. 20 seconds in this example):
N <- 10000
m <- matrix( runif(N*100), nrow=N)
rownames(m) <- sample(1:(N/2),N,replace=T)
start <- proc.time()
m1 <- combineByRow(m)
print(proc.time()-start)
start <- proc.time()
m2 <- t(sapply(by(m,rownames(m),function(x) apply(x, 2, max)),identity))
print(proc.time()-start)
all(m1 == m2)