Related
What I would like to do is for the red points find the nearest equivalent blue dot on the other side of the abline (i.e. 1,5 find 5,1).
Data:
https://1drv.ms/f/s!Asb7WztvacfOuesIq4evh0jjvejZ4Q
Edit: to open data do readRDS("path/to/data")
So what I have tried is to find the difference between the x and y coordinates, rank them and then find the min value going down the ranks for both x and y. The results and pretty bad. The thing I'm struggling with is finding a way to find nearest match of tuples.
My attempt:
find_nearest <- function(query, subject){
weight_df <- data.frame(ID=query$ID)
#find difference of first, then second, rank and find match in both going from top to bottom
tmp_df <- query
for(i in 1:nrow(subject)){
first_order <- order(abs(query$mean_score_n-subject$mean_score_n[i]))
second_order <- order(abs(query$mean_score_p-subject$mean_score_p[i]))
tmp_df$order_1[first_order] <- seq(1, nrow(tmp_df))
tmp_df$order_2[second_order] <- seq(1, nrow(tmp_df))
weight_df[,i+1] <- tmp_df$order_1 + tmp_df$order_2
}
rownames(weight_df) <- weight_df$ID
weight_df$ID <- NULL
print(dim(weight_df))
nearest_match <- list()
count <- 1
subject_ids <- NA
query_ids <- NA
while(ncol(weight_df) > 0 & count <= ncol(weight_df)){
pos <- which(weight_df == min(weight_df, na.rm = TRUE), arr.ind = TRUE)
if(length(unique(rownames(pos))) > 1){
for(i in nrow(pos)){
#if subject/query already used then mask and find another
if(subject$ID[pos[i,2]] %in% subject_ids){
weight_df[pos[i,1],pos[i,2]] <- NA
}else if(query$ID[pos[i,1]] %in% query_ids){
weight_df[pos[i,1],pos[i,2]] <- NA
}else{
subject_ids <- c(subject_ids, subject$ID[pos[i,2]])
query_ids <- c(query_ids, query$ID[pos[i,1]])
nearest_match[[count]] <- data.frame(query=query[pos[i,1],]$ID, subject=subject[pos[i,2],]$ID)
#mask
weight_df[pos[i,1],pos[i,2]] <- NA
count <- count + 1
}
}
}else if(nrow(pos) > 1){
#if subject/query already used then mask and find another
if(subject$ID[pos[1,2]] %in% subject_ids){
weight_df[pos[1,1],pos[1,2]] <- NA
}else if(query$ID[pos[1,1]] %in% query_ids){
weight_df[pos[1,1],pos[1,2]] <- NA
}else{
subject_ids <- c(subject_ids, subject$ID[pos[1,1]])
query_ids <- c(query_ids, query$ID[pos[1,1]])
nearest_match[[count]] <- data.frame(query=query[pos[1,1],]$ID, subject=subject[pos[1,2],]$ID)
#mask
weight_df[pos[1,1],pos[1,2]] <- NA
count <- count + 1
}
}else{
#if subject/query already used then mask and find another
if(subject$ID[pos[2]] %in% subject_ids){
weight_df[pos[1],pos[2]] <- NA
}else if(query$ID[pos[1]] %in% query_ids){
weight_df[pos[1],pos[2]] <- NA
}else{
subject_ids <- c(subject_ids, subject$ID[pos[2]])
query_ids <- c(query_ids, query$ID[pos[1]])
nearest_match[[count]] <- data.frame(query=query[pos[1],]$ID, subject=subject[pos[2],]$ID)
#mask
weight_df[pos[1],pos[2]] <- NA
count <- count + 1
}
}
}
out <- plyr::ldply(nearest_match, rbind)
out <- merge(out, data.frame(subject=subject$ID,
mean_score_p_n=subject$mean_score_p,
mean_score_n_n= subject$mean_score_n), by="subject", all.x=TRUE)
out <- merge(out, data.frame(query=query$ID,
mean_score_p_p=query$mean_score_p,
mean_score_n_p= query$mean_score_n), by="query", all.x=TRUE)
return(out)
}
Edit: is this what the solution looks like for you?
ggplot() +
geom_point(data=B[out,], aes(x=mean_score_p, y= mean_score_n, color="red")) +
geom_point(data=A, aes(x=mean_score_p, y=mean_score_n, color="blue")) +
geom_abline(intercept = 0, slope = 1)
Let
query <- readRDS("query.dms")
subject <- readRDS("subject.dms")
kA <- nrow(subject)
kB <- nrow(query)
A <- as.matrix(subject[, 2:3])
B <- as.matrix(query[, 2:3])
where we want to find the closest "reverse" point (row) in B to each point in A.
Solution permitting non-unique results
Then, assuming that you are using the Euclidean distance,
D <- as.matrix(dist(rbind(A, B[, 2:1])))[(1 + kA):(kA + kB), 1:kA]
unname(apply(D, 2, which.min))
# [1] 268 183 350 284 21 360 132 287 100 298 58 56 170 70 47 305 353
# [18] 43 266 198 58 215 198 389 412 321 255 181 79 340 292 268 198 54
# [35] 390 38 376 47 19 94 244 18 168 201 160 194 114 247 287 273 182
# [52] 87 94 87 192 63 160 244 101 298 62
are the corresponding row numbers in B. The trick was to switch the coordinates of the points in B by using B[, 2:1].
Solution with unique results
out <- vector("numeric", length = kA)
colnames(D) <- 1:ncol(D)
rownames(D) <- 1:nrow(D)
while(any(out == 0))
for(i in 1:nrow(D)) {
aux <- apply(D, 2, which.min)
if(i %in% aux) {
win <- which(aux == i)[which.min(D[i, aux == i])]
out[as.numeric(names(win))] <- as.numeric(rownames(D)[i])
D <- D[-i, -win, drop = FALSE]
}
}
out
# [1] 268 183 350 284 21 360 132 213 100 298 22 56 170 70 128 305 353
# [18] 43 266 198 58 215 294 389 412 321 255 181 79 340 292 20 347 54
# [35] 390 38 376 47 19 94 73 18 168 201 160 194 114 247 287 273 182
# [52] 87 365 158 192 63 211 244 101 68 62
whereas
all(table(res) == 1)
# [1] TRUE
confirms uniqueness. The solution is not the most efficient, but on your dataset it takes only a couple of seconds. It takes some time because it keeps going over all the available points in B checking if it is the closest one to any of the points in A. If so, the corresponding point in B is assigned to the closest one in A. Then both the point in A and the point in B are eliminated from the distance matrix. The loop goes until every point in A has some match in B.
I have a matrix that consists of two columns and a number (n) of rows, while each row represents a point with the coordinates x and y (the two columns).
This is what it looks (LINK):
V1 V2
146 17
151 19
153 24
156 30
158 36
163 39
168 42
173 44
...
now, I would like to use a subset of three consecutive points starting from 1 to do some fitting, save the values from this fit in another list, an den go on to the next 3 points, and the next three, ... till the list is finished. Something like this:
Data_Fit_Kasa_1 <- CircleFitByKasa(Data[1:3,])
Data_Fit_Kasa_2 <- CircleFitByKasa(Data[3:6,])
....
Data_Fit_Kasa_n <- CircleFitByKasa(Data[i:i+2,])
I have tried to construct a loop, but I can't make it work. R either tells me that there's an "unexpected '}' in "}" " or that the "subscript is out of bonds". This is what I've tried:
minimal runnable code
install.packages("conicfit")
library(conicfit)
CFKasa <- NULL
Data.Fit <- NULL
for (i in 1:length(Data)) {
row <- Data[i:(i+2),]
CFKasa <- CircleFitByKasa(row)
Data.Fit[i] <- CFKasa[3]
}
RStudio Version 0.99.902 – © 2009-2016 RStudio, Inc.; Win10 Edu.
The third element of the fitted circle (CFKasa[3]) represents the radius, which is what I am really interested in. I am really stuck here, please help.
Many thanks in advance!
Best, David
Turn your data into a 3D array and use apply:
DF <- read.table(text = "V1 V2
146 17
151 19
153 24
156 30
158 36
163 39", header = TRUE)
a <- t(DF)
dim(a) <-c(nrow(a), 3, ncol(a) / 3)
a <- aperm(a, c(2, 1, 3))
# , , 1
#
# [,1] [,2]
# [1,] 146 17
# [2,] 151 19
# [3,] 153 24
#
# , , 2
#
# [,1] [,2]
# [1,] 156 30
# [2,] 158 36
# [3,] 163 39
center <- function(m) c(mean(m[,1]), mean(m[,2]))
t(apply(a, 3, center))
# [,1] [,2]
#[1,] 150 20
#[2,] 159 35
center(DF[1:3,])
#[1] 150 20
I stumble upon the following thing. I read the reshape manual, but still lost.
Is there an efficient and more elegant way to reshape the matrix of even chunks?
the code to generate the matrix and reshaped matrix is below.
# current matrix
x <- matrix(sample(20*9), 20, 9)
colnames(x) <- c(paste("time",c(1:3),sep="_"),
paste("SGNL", 1, c(1:3), sep="_"),
paste("SGNL", 2, c(1:3), sep="_"))
# reshaped matrix
x.reshaped <- rbind( x[,c(1,4,7)], x[,c(2,5,8)], x[,c(3,6,9)] )
colnames(x.reshaped) <- sub("\\_1$", "", colnames(x.reshaped))
Thanks!
If you want to use an approach that is name-based and not position-based, then you should look at melt from "data.table":
library(data.table)
melt(as.data.table(x), measure.vars = patterns("time", "SGNL_1", "SGNL_2"))
Example output:
head(melt(as.data.table(x), measure.vars = patterns("time", "SGNL_1", "SGNL_2")))
# variable value1 value2 value3
# 1: 1 48 110 155
# 2: 1 67 35 140
# 3: 1 102 55 72
# 4: 1 161 39 66
# 5: 1 36 137 99
# 6: 1 158 169 85
Or, in base R:
patts <- c("time", "SGNL_1", "SGNL_2")
sapply(patts, function(y) c(x[, grep(y, colnames(x))]))
# time SGNL_1 SGNL_2
# [1,] 48 110 155
# [2,] 67 35 140
# [3,] 102 55 72
# [4,] 161 39 66
# [5,] 36 137 99
# .
# .
# .
# .
# [56,] 13 1 84
# [57,] 40 46 95
# [58,] 152 7 178
# [59,] 81 79 123
# [60,] 50 101 146
Data generated with set.seed(1).
We could create the subset of matrices (based on the index generated by the seq) in a list and then rbind it together.
do.call(rbind, lapply(1:3, function(i) x[,seq(i, length.out=3, by=3)]))
Or using a for loop
m2 <- c()
for(i in 1:3) { m2 <- rbind(m2, x[,seq(i, length.out=3, by=3)])}
x[,c(matrix(1:9, 3, byrow=TRUE))] # or shorter:
x[,matrix(1:9, 3, byrow=TRUE)]
I have an integer vector a
a=function(l) as.integer(runif(l,1,600))
a(100)
[1] 414 476 6 58 74 76 45 359 482 340 103 575 494 323 74 347 157 503 385 518 547 192 149 222 152 67 497 588 388 140 457 429 353
[34] 484 91 310 394 122 302 158 405 43 300 439 173 375 218 357 98 196 260 588 499 230 22 369 36 291 221 358 296 206 96 439 423 281
[67] 581 127 178 330 403 91 297 341 280 164 442 114 234 36 257 307 320 307 222 53 327 394 467 480 323 97 109 564 258 2 355 253 596
[100] 215
and an integer matrix B
B=function(c) matrix(as.integer(runif(5*c,1,600)),nrow=5)
B(10)
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
[1,] 250 411 181 345 4 519 167 395 130 388
[2,] 383 377 555 304 119 317 586 351 136 528
[3,] 238 262 513 476 579 145 461 191 262 302
[4,] 428 467 217 590 50 171 450 189 140 158
[5,] 178 14 31 148 285 365 515 64 166 584
and I would like to make a new boolean l x c matrix that shows whether or not each vector element in a is present in each specific column of matrix B.
I tried this with
ispresent1 = function (a,B) {
out = outer(a, B, FUN = "==" )
apply(out,c(1,3),FUN="any") }
or with
ispresent2 = function (a,B) t(sapply(1:length(a), function(i) apply(B,2,function(x) a[[i]] %in% x)))
but neither of these ways to do this are very fast:
a1=a(1000)
B1=B(20000)
system.time(ispresent1(a1,B1))
user system elapsed
76.63 1.08 77.84
system.time(ispresent2(a1,B1))
user system elapsed
218.10 0.00 230.00
(in my application matrix B would have about 500 000 - 2 million columns)
Probably this is something trivial, but what is the proper way to do this?
EDIT: proper syntax, as mentioned below, is ispresent = function (a,B) apply(B,2,function(x) { a %in% x } ), but the Rcpp solution below is still almost 2 times faster! Thanks for this!
After digging a little, and by curiosity about the Rcpp answer of #Backlin I did write a benchmark of orignal solution and our two solutions:
I had to change a little Backlin's function as inline didn't work on my windows box (sorry if I missed something with it, let me know if there's something to adapt)
Code used:
set.seed(123) # Fix the generator
a=function(l) as.integer(runif(l,1,600))
B=function(c) matrix(as.integer(runif(5*c,1,600)),nrow=5)
ispresent1 = function (a,B) {
out = outer(a, B, FUN = "==" )
apply(out,c(1,3),FUN="any") }
a1=a(1000)
B1=B(20000)
tensibai <- function(v,m) {
apply(m,2,function(x) { v %in% x })
}
library(Rcpp)
cppFunction("LogicalMatrix backlin(IntegerVector a,IntegerMatrix B) {
IntegerVector av(a);
IntegerMatrix Bm(B);
int i,j,k;
LogicalMatrix out(av.size(), Bm.ncol());
for(i = 0; i < av.size(); i++){
for(j = 0; j < Bm.ncol(); j++){
for(k = 0; k < Bm.nrow() && av[i] != Bm(k, j); k++);
if(k < Bm.nrow()) out(i, j) = true;
}
}
return(out);
}")
Validation:
> identical(ispresent1(a1,B1),tensibai(a1,B1))
[1] TRUE
> identical(ispresent1(a1,B1),backlin(a1,B1))
[1] TRUE
Benchmark:
> library(microbenchmark)
> microbenchmark(ispresent1(a1,B1),tensibai(a1,B1),backlin(a1,B1),times=3)
Unit: milliseconds
expr min lq mean median uq max neval
ispresent1(a1, B1) 36358.4633 36683.0932 37312.0568 37007.7231 37788.8536 38569.9840 3
tensibai(a1, B1) 603.6323 645.7884 802.0970 687.9445 901.3294 1114.7144 3
backlin(a1, B1) 471.5052 506.2873 528.3476 541.0694 556.7689 572.4684 3
Backlin's solution is slightly faster, proving again Rcpp is a good choice if you know cpp at first :)
Rcpp is awesome for problems like this. It is quite possible that there is some way to do it with data.table or with an existing function, but with the inline package it takes almost less time to write it yourself than to find out.
require(inline)
ispresent.cpp <- cxxfunction(signature(a="integer", B="integer"),
plugin="Rcpp", body='
IntegerVector av(a);
IntegerMatrix Bm(B);
int i,j,k;
LogicalMatrix out(av.size(), Bm.ncol());
for(i = 0; i < av.size(); i++){
for(j = 0; j < Bm.ncol(); j++){
for(k = 0; k < Bm.nrow() && av[i] != Bm(k, j); k++);
if(k < Bm.nrow()) out(i, j) = true;
}
}
return(out);
')
set.seed(123)
a1 <- a(1000)
B1 <- B(20000)
system.time(res.cpp <- ispresent.cpp(a1, B1))
user system elapsed
0.442 0.005 0.446
res1 <- ispresent1(a1,B1)
identical(res1, res.cpp)
[1] TRUE
a=function(l) as.integer(runif(l,1,600))
B=function(c) matrix(as.integer(runif(5*c,1,600)),nrow=5)
ispresent1 = function (a,B) {
out = outer(a, B, FUN = "==" )
apply(out,c(1,3),FUN="any") }
ispresent2 = function (a,B) t(sapply(1:length(a), function(i) apply(B,2,function(x) a[[i]] %in% x)))
ispresent3<-function(a,B){
tf<-matrix((B %in% a),nrow=5)
sapply(1:ncol(tf),function(x) a %in% B[,x][tf[,x]])
}
a1=a(1000)
B1=B(20000)
> system.time(ispresent1(a1,B1))
user system elapsed
29.91 0.48 30.44
> system.time(ispresent2(a1,B1))
user system elapsed
89.65 0.15 89.83
> system.time(ispresent3(a1,B1))
user system elapsed
0.83 0.00 0.86
res1<-ispresent1(a1,B1)
res3<-ispresent3(a1,B1)
> identical(res1,res3)
[1] TRUE
I would like to do 10 fold cross validation and then using MSE for model selection in R . I can divide the data into 10 groups, but I got the following error, how can I fix it?
crossvalind <- function(N, kfold) {
len.seg <- ceiling(N/kfold)
incomplete <- kfold*len.seg - N
complete <- kfold - incomplete
ind <- matrix(c(sample(1:N), rep(NA, incomplete)), nrow = len.seg, byrow = TRUE)
cvi <- lapply(as.data.frame(ind), function(x) c(na.omit(x))) # a list
return(cvi)
}
I am using logspline package for estimation of a density function.
library(logspline)
x = rnorm(300, 0, 1)
kfold <- 10
cvi <- crossvalind(N = 300, kfold = 10)
for (i in 1:length(cvi)) {
xc <- x[cvi[-i]] # x in training set
xt <- x[cvi[i]] # x in test set
fit <- logspline(xc)
f.pred <- dlogspline(xt, fit)
f.true <- dnorm(xt, 0, 1)
mse[i] <- mean((f.true - f.pred)^2)
}
Error in x[cvi[-i]] : invalid subscript type 'list'
cvi is a list object, so cvi[-1] and cvi[1] are list objects, and then you try and get x[cvi[-1]] which is subscripting using a list object, which doesn't make sense because list objects can be complex objects containing numbers, characters, dates and other lists.
Subscripting a list with single square brackets always returns a list. Use double square brackets to get the constituents, which in this case are vectors.
> cvi[1] # this is a list with one element
$V1
[1] 101 78 231 82 211 239 20 201 294 276 181 168 207 240 61 72 267 75 218
[20] 177 127 228 29 159 185 118 296 67 41 187
> cvi[[1]] # a length 30 vector:
[1] 101 78 231 82 211 239 20 201 294 276 181 168 207 240 61 72 267 75 218
[20] 177 127 228 29 159 185 118 296 67 41 187
so you can then get those elements of x:
> x[cvi[[1]]]
[1] 0.32751014 -1.13362827 -0.13286966 0.47774044 -0.63942372 0.37453378
[7] -1.09954301 -0.52806368 -0.27923480 -0.43530831 1.09462984 0.38454106
[13] -0.68283862 -1.23407793 1.60511404 0.93178122 0.47314510 -0.68034783
[19] 2.13496564 1.20117869 -0.44558321 -0.94099782 -0.19366673 0.26640705
[25] -0.96841548 -1.03443796 1.24849113 0.09258465 -0.32922472 0.83169736
this doesn't work with negative indexes:
> cvi[[-1]]
Error in cvi[[-1]] : attempt to select more than one element
So instead of subscripting x with the list elements you don't want, subscript it with the negative of the indexes you do want (since you are partitioning here):
> x[-cvi[[1]]]
will return the other 270 elements. Note I've used 1 here for the first pass through your loop, replace with i and insert in your code.