I have two lengthy data sets with several columns and different lengths. For this example, let's subset them to a few rows and just 3 columns:
Temp <- c(12.9423 ,12.9446 ,12.9412 ,12.9617 ,12.9742 ,12.9652 ,12.9463, 12.9847 ,12.9778,
12.9589, 12.9305, 12.9275 ,12.8569 ,12.8531 ,12.9092, 12.9471, 12.9298, 12.9266,
12.9374 ,12.9385, 12.9505, 12.9510, 12.9632 ,12.9621 ,12.9571, 12.9492 ,12.8988,
12.8895 ,12.8777, 12.8956, 12.8748 ,12.7850 ,12.7323, 12.7546 ,12.7375 ,12.7020,
12.7172, 12.7015, 12.6960, 12.6944, 12.6963, 12.6928, 12.6930 ,12.6883 ,12.6913)
Density <- c(26.38635 ,26.38531 ,26.38429, 26.38336, 26.38268 ,26.38242, 26.38265, 26.38343,
26.38486, 26.38697 ,26.38945, 26.39188, 26.39365, 26.39424 ,26.39376 ,26.39250,
26.39084 ,26.38912 ,26.38744 ,26.38587, 26.38456 ,26.38367, 26.38341 ,26.38398,
26.38547 ,26.38793 ,26.39120 ,26.39509, 26.39955 ,26.40455, 26.41002, 26.41578,
26.42126, 26.42593 ,26.42968, 26.43255 ,26.43463, 26.43603 ,26.43693 ,26.43750,
26.43787, 26.43815, 26.43841 ,26.43871 ,26.43904)
po4 <- c(0.4239840 ,0.4351156, 0.4456128, 0.4542392, 0.4608510, 0.4656445, 0.4690847,
0.4717291, 0.4742391 ,0.4774904 ,0.4831152, 0.4922122, 0.5029904, 0.5128720,
0.5190209, 0.5191368 ,0.5133212, 0.5027542 ,0.4905301 ,0.4796467 ,0.4708035,
0.4638879, 0.4578364 ,0.4519745, 0.4481336, 0.4483697, 0.4531310, 0.4622930,
0.4750474 ,0.4905152 ,0.5082183 ,0.5278212 ,0.5491580 ,0.5720519, 0.5961127,
0.6207716 ,0.6449603, 0.6675704 ,0.6878331 ,0.7051851,0.7195461, 0.7305200,
0.7359634 ,0.7343541, 0.7283988)
PP14 <- data.frame(Temp,Density,po4) ##df1
temp <- c(13.13875, 13.13477 ,13.12337 ,13.10662 ,13.09798 ,13.09542 ,13.08734 ,13.07616,
13.06671 ,13.05899, 13.05890 ,13.05293 ,13.03322, 13.01515, 13.02552 ,13.01668,
12.99829, 12.97075 ,12.95572 ,12.95045 ,12.94541 ,12.94365 ,12.94609 ,12.94256,
12.93565 ,12.93258 ,12.93489 ,12.93209 ,12.92219 ,12.90730 ,12.90416 ,12.89974,
12.89749 ,12.89626 ,12.89395, 12.89315 ,12.89274, 12.89276 ,12.89293 ,12.89302)
density <- c( 26.35897, 26.36274 ,26.36173 ,26.36401 ,26.36507 ,26.36662 ,26.36838,
26.36996,
26.37286 ,26.37452 ,26.37402, 26.37571 ,26.37776, 26.38008 ,26.37959 ,26.38178,
26.38642 ,26.39158 ,26.39350, 26.39467, 26.39601, 26.39601, 26.39596 ,26.39517,
26.39728 ,26.39766, 26.39774, 26.39699 ,26.40081 ,26.40328 ,26.40416, 26.40486,
26.40513 ,26.40474 ,26.40552 ,26.40584, 26.40613, 26.40602 ,26.40595 ,26.40498)
krho <- c( -9.999999e+06, -1.786843e+00, -9.142976e-01, -9.650734e-01, -2.532397e+00,
-3.760537e+00, -2.622484e+00, -1.776506e+00, -2.028391e+00, -2.225910e+00,
-3.486826e+00, -2.062341e-01, -3.010643e+00, -3.878437e+00, -3.796426e+00,
-3.227138e+00, -3.335446e+00, -3.738037e+00, -4.577778e+00, -3.818099e+00,
-3.891467e+00, -4.585045e+00 ,-3.150283e+00 ,-4.371089e+00 ,-3.902601e+00,
-4.546019e+00, -3.932538e+00, -4.331247e+00, -4.508137e+00, -4.789201e+00,
-4.383820e+00, -4.423486e+00, -4.334641e+00, -4.330544e+00, -4.838604e+00,
-4.729123e+00, -4.381797e+00, -4.207365e+00, -4.276804e+00, -4.001305e+00)
MS14 <- data.frame(temp,density,krho) ##df2
So now I would like to loop through both data sets and check whether MS14$density == PP14$Density; where it is TRUE, I would like to multiply the krho value in that row by the delta po4 that corresponds to the same density, i.e. diff(po4) for that row or range. Something like
#MS14$krho[i] * diff(PP14$po4)[i]
BUT when I run
PP14$Density == MS14$density
of course it is always FALSE, because with that many decimal places none of the values are exactly the same. I solved that by rounding the numbers, but there should be a way to build a tolerance into the code, e.g. density ± 0.005. Or just round to a couple of decimals like:
PP14$Density_round2 <- round(PP14$Density, digits = 2)
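For the ± tolerance idea, a minimal sketch (0.005 is just an assumed tolerance here):
tol <- 0.005
# TRUE where a PP14 density and an MS14 density agree within the tolerance
near <- outer(PP14$Density, MS14$density, function(x, y) abs(x - y) < tol)
which(near, arr.ind = TRUE)  # matching (PP14 row, MS14 row) pairs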
In any case I am not sure whether I should use a nested loop to check both columns and do the operations accordingly, or whether it would be better to create a new data.frame from the intersection of the two data frames:
common <- intersect(PP14$Density, MS14$density)
and then make calculations....(??)
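A rough sketch of that merge idea, assuming a rounded density works as the join key:
PP14$key <- round(PP14$Density, 2)
MS14$key <- round(MS14$density, 2)
PP14$dpo4 <- c(0, diff(PP14$po4))  # delta po4 aligned with each PP14 row
df3 <- merge(MS14[, c("key", "krho")], PP14[, c("key", "dpo4")], by = "key")
df3$prod <- df3$krho * df3$dpo4
head(df3)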
So I would probably need a nested loop like:
for (i in seq_len(nrow(PP14))) {
  for (j in seq_len(nrow(MS14))) {
    if (PP14$Density[i] == MS14$density[j]) {
      # multiply MS14$krho[j] by the po4 difference for that particular row,
      # e.g. diff(PP14$po4)[i], and collect the result in a new data.frame df3
    }
  }
}
So please feel free to suggest the best way to proceed; there might be several ways to do it.
Thank you so much in advance!!
PS: suggestions using MATLAB are also welcome.
Something like this?
compareDec <- function(x, y, digits = NULL, tol = .Machine$double.eps^0.5){
  if (is.null(digits)) {
    abs(x - y) < tol
  } else {
    round(x, digits = digits) == round(y, digits = digits)
  }
}
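A quick illustration of the two modes:
compareDec(26.3853, 26.3858, digits = 2)  # TRUE: equal after rounding to 2 decimals
compareDec(26.3853, 26.3858)              # FALSE: differ by more than the default tolerance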
icomp <- outer(MS14$density, PP14$Density, compareDec, digits = 2)
m <- outer(MS14$krho, c(0, diff(PP14$po4)))
new <- which(icomp, arr.ind = TRUE)
df3 <- cbind.data.frame(new, Prod = m[new])
head(df3)
# row col Prod
#1 17 1 0.00000000
#2 18 1 0.00000000
#3 19 1 0.00000000
#4 20 1 0.00000000
#5 17 2 -0.03712885
#6 18 2 -0.04161033
I need to replace 5% of the values in my matrix with the number 0.2 (guess in my code) IF they are below 0.2, and leave them alone if they are above 0.2.
Right now my code is changing all values less than 0.2 to 0.2.
This will be in a larger loop eventually to occur over multiple replications, but right now I am trying to get it to work for just 1.
Explanation:
gen.probs.2PLM is a matrix containing probabilities. guess is the value I have chosen as the replacement. perc is the percentage of the matrix I would like to look at and change IF the values are less than guess.
gen.probs.2PLM <- icc.2pl(gen.theta, a, b, N, TL)
perc <- 0.05 * N
guess <- 0.2
gen.probs.2PLM[gen.probs.2PLM < guess] <- guess
I expect only 5 percent of the values to be looked at and changed to 0.2 if they are below 0.2
gen.probs.2PLM is a 1000 x 45 matrix.
# dput(gen.probs.2PLM[1:20, 1:5])
structure(c(0.940298707380962, 0.848432615784556, 0.927423909103331,
0.850853479678874, 0.857217846940203, 0.437981231531586, 0.876146933879543,
0.735970164547576, 0.76296469377238, 0.640645338681073, 0.980212105400924,
0.45164925578322, 0.890102475061895, 0.593094353657132, 0.837401449711248,
0.867436194744775, 0.753637051722629, 0.64254277457268, 0.947783594375454,
0.956791049998361, 0.966059152820211, 0.896715435704569, 0.957247808046098,
0.898712615329071, 0.903924224222216, 0.474561641407715, 0.919080521405463,
0.795919510255144, 0.821437921281395, 0.700141602452725, 0.990657455188518,
0.490423165094245, 0.92990761183835, 0.649494291971471, 0.887513826127176,
0.912171225584296, 0.812707696992244, 0.702126169775785, 0.971012049724468,
0.976789027046465, 0.905046450670641, 0.81322870291296, 0.890539069545935,
0.81539882951241, 0.821148949083641, 0.494459368656066, 0.838675666691869,
0.719720365120414, 0.741166345529595, 0.646700411799437, 0.9578080044146,
0.504938867664858, 0.852068230044858, 0.611124165649146, 0.803451686558428,
0.830526582119632, 0.73370297276145, 0.648126933954648, 0.913887754151632,
0.925022099584059, 0.875712266966582, 0.762677615526032, 0.857390771477182,
0.765270669721981, 0.772159371696644, 0.418524844618452, 0.793318641931831,
0.65437308255825, 0.678633290218262, 0.574232080921638, 0.943851827968259,
0.428780249640693, 0.809653131485398, 0.536512513508941, 0.751041035436293,
0.783450103818893, 0.6701523432789, 0.575762279897951, 0.886965071394186,
0.901230746880145, 0.868181123535613, 0.688344765218149, 0.840795870494126,
0.69262216320168, 0.703982665712434, 0.215843106547112, 0.738775789107177,
0.513997187757334, 0.551803060188986, 0.397460216626274, 0.956693337996693,
0.225901690507801, 0.765409027208693, 0.347791079152411, 0.669156131912199,
0.72257632593578, 0.538474414984722, 0.399549159711904, 0.884405290470079,
0.904200878248468), .Dim = c(20L, 5L))
Here is a function that you can apply to a numeric matrix to replace 5% of the values below some threshold (e.g. .2 in your case) with the threshold:
replace_5pct <- function(d, threshold = .2){
  # get indices of cells below threshold, sample 5% of them
  cells_below <- which(d < threshold)
  cells_to_modify <- sample(cells_below, size = .05 * length(cells_below))
  # then replace values for sampled indices with threshold + return
  d[cells_to_modify] <- threshold
  return(d)
}
Here's an example of how it can be used (where dat would correspond to your matrix):
dat <- matrix(round(runif(1000), 1), ncol=10)
dat_5pct_replaced <- replace_5pct(dat, threshold=.2)
You can look at the data to confirm the result, or look at stats like these:
mean(dat < .2) # somewhere between .1 and .2 probably
sum(dat != dat_5pct_replaced) # about 5% of sum(dat < .2)
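Applied to the matrix from the question (this assumes gen.probs.2PLM and guess from your code are in the workspace):
set.seed(42)  # for a reproducible draw
gen.probs.mod <- replace_5pct(gen.probs.2PLM, threshold = guess)
sum(gen.probs.mod != gen.probs.2PLM)  # number of cells actually replaced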
P.S.: If you want to generalize the function, you could abstract over the 5% replacement too -- then you could replace e.g. 10% of values below some threshold, etc. And if you want to get fancy, you could abstract over "less than" as well and pass the comparison function as a parameter to the main function.
replace_func <- function(d, func, threshold, prop){
  cells <- which(func(d, threshold))
  cells_to_modify <- sample(cells, size = prop * length(cells))
  d[cells_to_modify] <- threshold
  return(d)
}
And then e.g. replace 10% of values above .5 with .5:
# (need to backtick infix functions like <, >, etc.)
replace_func(dat, func=`>`, threshold=.5, prop=.1)
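And a quick sanity check on the dat matrix from above:
dat_replaced <- replace_func(dat, func = `>`, threshold = .5, prop = .1)
sum(dat != dat_replaced) / sum(dat > .5)  # should be roughly 0.1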
I am trying to determine the distance between every point in one data set and every point in another data set in R. Each data set has an X and a Y parameter. I have been converting the data sets into data frames and then finding the distances. However, my current code builds one large matrix with both data sets as rows and columns, and I then have to pick out the specific block of that matrix I actually care about. Is there a way to put just DSA as the columns and DSB as the rows? That would cut the matrix to a quarter of its size, which, since my data sets contain thousands of points each, would really cut down the algorithm's run time.
Here is the code I am using:
tumor <- data.frame(DSA[, c("X_Parameter", "Y_Parameter")])
cells <- data.frame(DSB[, c("X_Parameter", "Y_Parameter")])
distances <- as.matrix(dist(rbind(tumor, cells)))
row.start <- nrow(tumor) + 1
row.end <- nrow(tumor) + nrow(cells)
col.start <- 1
col.end <- nrow(tumor)
d <- distances[row.start:row.end, col.start:col.end]
Try flexclust::dist2:
n_tumor = 2000
n_cells = 2000
tumor = matrix(runif(n_tumor * 2), n_tumor, 2)
cells = matrix(runif(n_cells * 2), n_cells, 2)
t_dist = system.time({
  distances <- as.matrix(dist(rbind(tumor, cells)))
  row.start <- nrow(tumor) + 1
  row.end <- nrow(tumor) + nrow(cells)
  col.start <- 1
  col.end <- nrow(tumor)
  d <- distances[row.start:row.end, col.start:col.end]
})[3]
require(flexclust)
t_dist2 = system.time({d2 = dist2(x = cells, y = tumor, method = "euclidean")})[3]
t_dist # 1.477
t_dist2 # 0.244
identical(unname(d), d2) # TRUE
EDIT:
Another alternative is proxy::dist.
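A minimal sketch of that, assuming the cells and tumor matrices from the timing example above:
d3 <- proxy::dist(cells, tumor, method = "Euclidean")  # nrow(cells) x nrow(tumor) cross-distances
all.equal(unname(as.matrix(d3)), unname(d2))           # should match the flexclust result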
This will compute only the portion of the matrix you need:
tumoridx <- rep(1:nrow(tumor), each=nrow(cells))
cellsidx <- rep(1:nrow(cells), nrow(tumor))
tcdist <- matrix(sqrt(rowSums((tumor[tumoridx, ] - cells[cellsidx, ])^2)),
nrow(cells), nrow(tumor))
I have an existing data frame with a variable "grade" indicating the type of row/observation. My goal is to select more rows of these types from another data frame without exceeding a maximum percentage for each grade type in my existing data frame. I have defined a named vector with the grade allocations:
gradeAllocation <- c("A" = 0, "B" = 0, "C" = .25, "D" = .40, "E" = .20, "F" = .10, "G" = .05)
This represents the maximum percent of each type of grade in my data frame. Now, let's say I want to select a mixture of grades from another data frame, but I don't want to select so many that, after the selection, any grade exceeds its maximum percentage. I would basically be doing this process in a loop for each new data set that becomes available, but I want to keep the maximum distribution given by the gradeAllocation vector.
Is there a package/function that can help here? Any thoughts for custom code?
Thanks, John
So as @Mr.Flick points out, there is no guarantee that this will be possible. In your gradeAllocation the sampling distribution sums to 1, so if your dataset has no "D", for example, it will not be possible to fill the 40% of the sample allocated to D, and the remaining caps (25% C, 20% E, 10% F, 5% G, and no A or B) only add up to 60% of the sample.
Also, because the sampling distribution sums to 1, if the sample size you want is N, then the number of samples of each grade must be given by N * gradeAllocation. Here is a method that takes advantage of that fact, starting with a dataset that has 700 rows uniformly distributed across the grades (100 of each), from which we extract a random sample of 100 with the distribution given by gradeAllocation.
# sample dataset: 700 observations, grade distribution is uniform
set.seed(1) # for reproducible example
data <- data.frame(grade=rep(LETTERS[1:7],each=100),x=rnorm(700))
# desired distribution in the sample
gradeAllocation <- c(A=0, B=0, C=.25, D=.40, E=.20, F=.10, G=.05)
# you start here...
N <- 100 # sample size
get.sample <- function(g) data[sample(which(data$grade == g), N * gradeAllocation[g]), ]
result <- do.call(rbind,lapply(LETTERS[1:7],get.sample))
# confirm distribution of grades in the sample
table(result$grade)
# A B C D E F G
# 0 0 25 40 20 10 5
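As a quick feasibility check in the spirit of the opening caveat (my addition, not part of the original answer), you can verify that each grade has enough rows before sampling:
all(table(factor(data$grade, levels = names(gradeAllocation))) >= N * gradeAllocation)  # TRUE if the requested sample is feasible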
Here's one approach
Generate some data
nOriginal <- 1000
df1 <- data.frame(grade=sample(c('A','B','C','D','E','F','G'),nOriginal,replace=TRUE),
                  indx=seq_len(nOriginal))
Get the rows that correspond to each grade
idx_a=which(df1$grade=='A')
idx_b=which(df1$grade=='B')
idx_c=which(df1$grade=='C')
idx_d=which(df1$grade=='D')
idx_e=which(df1$grade=='E')
idx_f=which(df1$grade=='F')
idx_g=which(df1$grade=='G')
Sample the rows based on the prescribed distribution which should sum to one.
location <- c("A" = 0, "B" = 0, "C" = .25, "D" = .40, "E" = .20, "F" = .10, "G" = .05)
nSamples = 200
samp_idx_a <- sample(idx_a,nSamples*location["A"])
samp_idx_b <- sample(idx_b,nSamples*location["B"])
samp_idx_c <- sample(idx_c,nSamples*location["C"])
samp_idx_d <- sample(idx_d,nSamples*location["D"])
samp_idx_e <- sample(idx_e,nSamples*location["E"])
samp_idx_f <- sample(idx_f,nSamples*location["F"])
samp_idx_g <- sample(idx_g,nSamples*location["G"])
df_2 <- df1[c(samp_idx_a, samp_idx_b, samp_idx_c, samp_idx_d,
samp_idx_e, samp_idx_f, samp_idx_g),]
Check the results
(percent_A = sum(df_2$grade=="A")/nrow(df_2)*100)
(percent_B = sum(df_2$grade=="B")/nrow(df_2)*100)
(percent_C = sum(df_2$grade=="C")/nrow(df_2)*100)
(percent_D = sum(df_2$grade=="D")/nrow(df_2)*100)
(percent_E = sum(df_2$grade=="E")/nrow(df_2)*100)
(percent_F = sum(df_2$grade=="F")/nrow(df_2)*100)
(percent_G = sum(df_2$grade=="G")/nrow(df_2)*100)
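The seven checks above can also be collapsed into one line:
round(100 * table(df_2$grade) / nrow(df_2), 1)  # percentage of each grade in the sample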