Related
Is there a way to extract just the estimates from nlsList()?
Sample data:
library(nlme)
dat<-read.table(text="time gluc starch solka
1 6.32 7.51 1.95
2 20.11 25.49 6.43
3 36.03 47.53 10.39
6 107.52 166.31 27.01
12 259.28 305.19 113.72
24 283.40 342.56 251.14
48 297.55 353.66 314.22", header = TRUE)
long <- tidyr::pivot_longer(dat, -1, values_to = "y")
long$name <- factor(long$name)
st0 <- list(Max = 200, k = 0.1, Lag = 0.5)
kinetics<-nlsList(y ~ (time > Lag) * Max * (1-exp(-k * (time - Lag))) | name, long, start = st0)
I would like to end up with a data frame like the image below of the samples and their estimates for Max, k, and Lag but cannot figure out how.
We could extract the coef and then loop over the 3d array with apply
library(nlme)
m1 <- apply(summary(kinetics)$coef, 3, function(x) x[,1])
dat <- transform(as.data.frame(m1), name = row.names(m1))[c(4, 1:3)]
row.names(dat) <- NULL
-output
dat
name Max k Lag
1 gluc 299.6637 0.16155846 2.426204
2 solka 337.5416 0.06583197 4.966971
3 starch 353.7206 0.18416048 2.276593
Here is a simple way, just coerce the coefficients appropriate dimension to class "data.frame".
cf_smry <- coef(summary(kinetics))[, 1, ]
as.data.frame(cf_smry)
# Max k Lag
#gluc 299.6637 0.16155846 2.426204
#solka 337.5416 0.06583197 4.966971
#starch 353.7206 0.18416048 2.276593
coef(kinetics) gives a data frame so any of these would work and differ only in whether the names appear as row names (first one) or as a column (others).
coef(kinetics)
data.frame(name = names(kinetics), coef(kinetics), row.names = NULL)
tibble::rownames_to_column(coef(kinetics), var = "name")
Here part of the data set:
w=structure(list(price = c(6250L, 9860L, 14690L, 34350L, 4480L,
23230L, 14785L, 14785L, 6270L, 11530L, 26080L, 208333L, 42920L,
10080L, 45500L, 5798L, 18400L, 5285L)), class = "data.frame", row.names = c(NA,
-18L))
I want to find values which are outside 3 sigma (-3; 3)
and when any values are outside of 3 sigma, replace it with the median.
sapply(w, function(x) replace(x, is.na(x), (x, na.rm=TRUE)))
is not working for me.
How can I find any values outside of 3 sigma and replace it with the median?
Maybe you can try
within(
w,
price <- replace(
price,
abs(mean(price) - price) > 3 * sd(price),
median(price)
)
)
such that
price
1 6250.0
2 9860.0
3 14690.0
4 34350.0
5 4480.0
6 23230.0
7 14785.0
8 14785.0
9 6270.0
10 11530.0
11 26080.0
12 14737.5
13 42920.0
14 10080.0
15 45500.0
16 5798.0
17 18400.0
18 5285.0
p <- w$price
m <- median(p)
replace(p, p > m + 3*sd(p) | p < m-3*sd(p), m)
You can use:
med <- median(w$price, na.rm = TRUE)
w$price[w$price < mean(w$price, na.rm = TRUE) - 3*sd(w$price) | w$price > mean(w$price, na.rm = TRUE) + 3*sd(w$price)] <- med
w
I want to calculate normalised ratios in all possible combinations efficiently for a large matrix in R. I have asked a similar question earlier here and with a small data and the solutions provided there worked fine. But when I am trying to apply the same solution for a large dataset (400 x 2151), my system is getting hang. My system is having 16 GB RAM with Intel i7 processer. Here is the code with data
df <- matrix(rexp(860400), nrow = 400, ncol = 2151)
Solution provided by #Ronak Shah
cols <- 1:ncol(df)
temp <- expand.grid(cols, cols)
new_data <- (df[,temp[,2]] - df[,temp[,1]])/(df[,temp[,2]] + df[,temp[,1]])
Or the following solution as provided by #akrun
f1 <- function(i, j) (df[, i] - df[, j])/(df[, i] + df[, j])
out <- outer(seq_along(df), seq_along(df), FUN = f1)
colnames(out) <- outer(names(df), names(df), paste, sep = "_")
Both the solutions taking a very long time and the system is getting hang. So, how can I efficiently do it?
Update
Upadate on expected output
library(tidyverse)
#Fake dataset
df = structure(list(var_1 = c(0.035, 0.047, 0.004, 0.011, 0.01, 0.01, 0.024),
var_2 = c(0.034, 0.047, 0.004, 0.012, 0.01, 0.011, 0.025),
var_3 = c(0.034, 0.047, 0.006, 0.013, 0.011, 0.013, 0.026),
var_4 = c(0.034, 0.046, 0.008, 0.016, 0.014, 0.015, 0.028),
var_5 = c(0.034, 0.046, 0.009, 0.017, 0.015, 0.016, 0.029)),
class = "data.frame", row.names = c(NA, -7L))
cols <- 1:ncol(df)
mat_out <- do.call(cbind, lapply(cols, function(xj)
sapply(cols, function(xi) (df[, xj] - df[, xi])/(df[, xj] + df[, xi]))))
colnames(mat_out) <- outer(names(df), names(df), paste, sep = ",")
y <- read.table(text = "s_no y
1 95.512
2 97.9
3 92.897
4 94.209
5 87.472
6 91.109
7 92.83", header = T)
mat_out %>% as.data.frame() %>%
mutate(id = row_number()) %>%
left_join(y, by = c("id" = "s_no")) %>%
pivot_longer(cols = -c(y, id)) %>%
group_by(name) %>%
mutate(correl = cor(value, y, use = "complete.obs")) %>%
distinct(name, .keep_all = TRUE) %>%
separate(name, c("Wav1", "Wav2"), sep = ",") %>%
select(-c("id", "y", "value")) %>%
pivot_wider(names_from = Wav2, values_from = correl)
#> # A tibble: 5 × 6
#> Wav1 var_1 var_2 var_3 var_4 var_5
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 var_1 NA -0.190 -0.358 -0.537 -0.551
#> 2 var_2 0.190 NA -0.322 -0.528 -0.544
#> 3 var_3 0.358 0.322 NA -0.682 -0.667
#> 4 var_4 0.537 0.528 0.682 NA -0.595
#> 5 var_5 0.551 0.544 0.667 0.595 NA
You can use rcpp to make your code faster:
Rcpp::cppFunction("
std::vector<double> my_fun(arma::mat& x, arma::vec& y){
int p = x.n_cols - 1;
std::vector<double> result;
for(int i = 0; i<p; i++){
auto m = (x.cols(i+1, p).each_col() - x.col(i));
m /= (x.cols(i+1, p).each_col() + x.col(i));
auto a = arma::conv_to<std::vector<double>>::from(arma::cor(m, y));
result.insert(result.end(), a.begin(), a.end());
}
return result;
}", 'RcppArmadillo')
my_fun(df, y) # takes approximately 14seconds.
You could use the STL functions to make it even faster. Though the code will be longer. on my computer this takes 6seconds
mat <- matrix(rexp(860400), nrow = 400, ncol = 2151)
y <- rnorm(nrow(df), 700, 10)
my_fun(mat, y) # This works
Since memory seems to be your main issue, how about using iterators? Using the package RcppAlgos*, we can make use of permuteIter to calculate your ratios N at a time.
If one must have names, we need an additional iterator. This means you must keep 2 iterators in sync, which can become tedious. Fortunately, with the summary() methods of permuteIter, we can always see what the current index is and reset them with a wide range of options (e.g. random access [[, front(), back(), or startOver()).
library(RcppAlgos)
df <- matrix(rexp(860400), nrow = 400, ncol = 2151)
ratioIter <- permuteIter(ncol(df), 2, FUN = function(x) {
(df[, x[2]] - df[, x[1]]) / (df[, x[2]] + df[, x[1]])
})
## if you really want to name your output, you must have
## an additional name iterator... not very elegant
nameIter <- permuteIter(paste0("col", 1:ncol(df1)), 2, FUN = function(x) {
paste0(rev(x), collapse = "_")
})
firstIter <- matrix(ratioIter$nextIter(), ncol = 1)
firstName <- nameIter$nextIter()
colnames(firstIter) <- firstName
head(firstIter)
col2_col1
[1,] 0.2990054
[2,] -0.9808111
[3,] -0.9041054
[4,] 0.7970873
[5,] 0.8625776
[6,] 0.2768359
## returns a list, so we call do.call(cbind
next5Iter <- do.call(cbind, ratioIter$nextNIter(5))
next5Names <- unlist(nameIter$nextNIter(5))
colnames(next5Iter) <- next5Names
head(next5Iter)
col3_col1 col4_col1 col5_col1 col6_col1 col7_col1
[1,] -0.28099710 0.1665687 0.40565958 -0.7524038 -0.7132844
[2,] -0.81434900 -0.4283759 -0.89811556 -0.8462906 -0.5399741
[3,] -0.02289368 0.4285012 0.05087853 -0.5091659 -0.2328995
[4,] -0.06825458 0.3126928 0.68968843 -0.2180618 0.6651785
[5,] 0.33508319 0.7389108 0.84733425 0.9065263 0.8977107
[6,] 0.61773589 0.3443120 0.61084584 0.5727938 0.3888807
You should note that this does not show results where i == j (these give NaN). So the total number is just under 21512 (In fact it is exactly equal to 2151^2 - 2151).
ratioIter$summary()
$description
[1] "Permutations of 2151 choose 2"
$currentIndex
[1] 6
$totalResults
[1] 4624650
$totalRemaining
[1] 4624644
There are even random access and previous iterators as well:
## Get the last ratio
lastIter <- ratioIter$back()
lastName <- nameIter$back()
mLast <- matrix(lastIter, ncol = 1)
colnames(mLast) <- lastName
head(mLast)
col2150_col2151
[1,] -0.6131926
[2,] 0.9936783
[3,] 0.1373538
[4,] 0.1014347
[5,] -0.5061608
[6,] 0.5773503
## iterate backwards with the previous methods
prev5Iter <- do.call(cbind, ratioIter$prevNIter(5))
prev5Names <- unlist(nameIter$prevNIter(5))
colnames(prev5Iter) <- prev5Names
head(prev5Iter)
col2149_col2151 col2148_col2151 col2147_col2151 col2146_col2151 col2145_col2151
[1,] -0.75500069 -0.72757136 -0.94457988 -0.82858884 -0.25398782
[2,] 0.99696694 0.99674084 0.99778638 0.99826472 0.95738947
[3,] 0.27701596 0.45696010 0.00682574 0.01529448 -0.62368764
[4,] -0.09508689 -0.90698165 -0.38221934 -0.41405984 0.01371556
[5,] -0.31580709 -0.06561386 -0.07435058 -0.08033145 -0.90692881
[6,] 0.82697720 0.86858595 0.81707206 0.75627297 0.46272349
## Get a random sample
set.seed(123)
randomIter <- do.call(cbind, ratioIter[[sample(4624650, 5)]])
## We must reset the seed in order to get the same output for the names
set.seed(123)
randomNames <- unlist(nameIter[[sample(4624650, 5)]])
colnames(randomIter) <- randomNames
head(randomIter)
col1044_col939 col20_col1552 col412_col2014 col1751_col1521 col337_col1295
[1,] -0.3902066 0.4482747 -0.108018200 -0.1662857 -0.3822436
[2,] -0.2358101 0.9266657 -0.657135882 0.0671608 -0.6821823
[3,] -0.7054217 0.8944720 0.092363665 0.2667708 0.1908249
[4,] -0.1574657 0.2775225 -0.221737223 0.3381454 -0.5705021
[5,] -0.4282909 -0.4406433 0.092783086 -0.7506674 -0.1276932
[6,] 0.9998189 -0.2497586 -0.009375891 0.7071864 -0.2425258
Lastly, it is written in C++ so it is very fast:
system.time(ratioIter$nextNIter(1e3))
# user system elapsed
# 0 0 0
* I am the author of RcppAlgos
I am new to R and need to do pairwise comparison formulas across a set of variables. The number of elements to be compared will by dynamic but here is a hardcoded example with 4 elements, each compared against the other:
#there are 4 choices A, B, C, D -
#they are compared against each other and comparisons are stored:
df1 <- data.frame("A" = c(80),"B" = c(20))
df2 <- data.frame("A" = c(90),"C" = c(10))
df3 <- data.frame("A" = c(95), "D" = c(5))
df4 <- data.frame("B" = c(80), "C" = c(20))
df5 <- data.frame("B" = c(90), "D" = c(10))
df6 <- data.frame("C" = c(80), "D" = c(20))
#show the different comparisons in a matrix
matrixA <- matrix(c("", df1$B[1], df2$C[1], df3$D[1],
df1$A[1], "", df4$C[1], df5$D[1],
df2$A[1], df4$B[1], "", df6$D[1],
df3$A[1], df5$B[1], df6$C[1], ""),
nrow=4,ncol = 4,byrow = TRUE)
dimnames(matrixA) = list(c("A","B","C","D"),c("A","B","C","D"))
#perform calculations on the comparisons
matrixB <- matrix(
c(1, df1$B[1]/df1$A[1], df2$C[1]/df2$A[1], df3$D[1]/df3$A[1],
df1$A[1]/df1$B[1], 1, df4$C[1]/df4$B[1], df5$D[1]/df5$B[1],
df2$A[1]/df2$C[1], df4$B[1]/df4$C[1], 1, df6$D[1]/df6$C[1],
df3$A[1]/df3$D[1], df5$B[1]/df5$D[1], df6$C[1]/df6$D[1], 1),
nrow = 4, ncol = 4, byrow = TRUE)
matrixB <- rbind(matrixB, colSums(matrixB)) #add the sum of the colums
dimnames(matrixB) = list(c("A","B","C","D","Sum"),c("A","B","C","D"))
#so some more calculations that I'll use later on
dfC <- data.frame("AB" = c(matrixB["A","A"] / matrixB["A","B"],
matrixB["B","A"] / matrixB["B","B"],
matrixB["C","A"] / matrixB["C","B"],
matrixB["D","A"] / matrixB["D","B"]),
"BC" = c(matrixB["A","B"] / matrixB["A","C"],
matrixB["B","B"] / matrixB["B","C"],
matrixB["C","B"] / matrixB["C","C"],
matrixB["D","B"] / matrixB["D","C"]
),
"CD" = c(matrixB["A","C"] / matrixB["A","D"],
matrixB["B","C"] / matrixB["B","D"],
matrixB["C","C"] / matrixB["C","D"],
matrixB["D","C"] / matrixB["D","D"]))
dfCMeans <- colMeans(dfC)
#create the normalization matrix
matrixN <- matrix(c(
matrixB["A","A"] / matrixB["Sum","A"], matrixB["A","B"] / matrixB["Sum","B"], matrixB["A","C"] / matrixB["Sum","C"], matrixB["A","D"] / matrixB["Sum","D"],
matrixB["B","A"] / matrixB["Sum","A"], matrixB["B","B"] / matrixB["Sum","B"], matrixB["B","C"] / matrixB["Sum","C"], matrixB["B","D"] / matrixB["Sum","D"],
matrixB["C","A"] / matrixB["Sum","A"], matrixB["C","B"] / matrixB["Sum","B"], matrixB["C","C"] / matrixB["Sum","C"], matrixB["C","D"] / matrixB["Sum","D"],
matrixB["D","A"] / matrixB["Sum","A"], matrixB["D","B"] / matrixB["Sum","B"], matrixB["D","C"] / matrixB["Sum","C"], matrixB["D","D"] / matrixB["Sum","D"]
), nrow = 4, ncol = 4, byrow = TRUE)
Since R is so concise it seems like there should be a much better way to do this, I would like to know an easier way to figure out these type of calculations using R.
OK, I might be starting to piece together something here.
We start with a matrix like so:
A <- structure(
c(NA, 20, 10, 5, 80, NA, 20, 10, 90, 80, NA, 20, 95, 90, 80, NA),
.Dim = c(4, 4),
.Dimnames = list(LETTERS[1:4], LETTERS[1:4]))
A
# A B C D
# A NA 80 90 95
# B 20 NA 80 90
# C 10 20 NA 80
# D 5 10 20 NA
This matrix is the result of a pairwise comparison on a vector of length 4. We know nothing of this vector, and the only thing we know about the function used in the comparison is that it is binary non-commutative, or more precisely: f(x, y) = 100 - f(y, x) and the result is ∈ [0, 100].
matrixB appears to be simply matrixA divided by its own transpose:
B = ATA-1
or if you prefer:
B = (100 - A) / A
Potato patato due to above mentioned properties.
B <- (100 - A) / A
B <- t(A) / A
# fill in the diagonal with 1s
diag(B) <- 1
round(B, 2)
# A B C D
# A 1 0.25 0.11 0.05
# B 4 1.00 0.25 0.11
# C 9 4.00 1.00 0.25
# D 19 9.00 4.00 1.00
The 'normalized' matrix as you call it seems to be simply each column divided by its sum.
B.norm <- t(t(B) / colSums(B))
round(B.norm, 3)
# A B C D
# A 0.030 0.018 0.021 0.037
# B 0.121 0.070 0.047 0.079
# C 0.273 0.281 0.187 0.177
# D 0.576 0.632 0.746 0.707
There are 3 definitions of percentile:
lowest number greater than x% of y numbers
smallest number greater than or equal to x% of y numbers
weighted mean of the percentiles from 1 & 2
Which quantile() argument type match these three definitions?
If by "quartile" you mean quantile(): None of them. It's not quite that simple. As the documentation says when you try: help(quantile):
One of the nine quantile algorithms discussed in Hyndman and Fan (1996),
selected by type, is employed.
The paper can be found here:
https://www.researchgate.net/profile/Rob_Hyndman/publication/222105754_Sample_Quantiles_in_Statistical_Packages/links/02e7e530c316d129d7000000.pdf
It's worth reading to get an idea of what is involved in getting a computer to do something "intuitive". :)
You can get a feel for quantile's behaviour by trying this and fiddling with the numbers for prbs:
aa <- 1: 10
prbs <- c(0.2, 0.22, 0.29)
for(typ in 1:9){
this_line <- paste0("type=", typ)
this_val <- paste0("qval=",quantile(aa, probs=prbs, type=typ))
print(paste(this_line,this_val))
}
Which gives the lines:
[1] "type=1 qval=2" "type=1 qval=3" "type=1 qval=3"
[1] "type=2 qval=2.5" "type=2 qval=3" "type=2 qval=3"
[1] "type=3 qval=2" "type=3 qval=2" "type=3 qval=3"
[1] "type=4 qval=2" "type=4 qval=2.2" "type=4 qval=2.9"
[1] "type=5 qval=2.5" "type=5 qval=2.7" "type=5 qval=3.4"
[1] "type=6 qval=2.2" "type=6 qval=2.42" "type=6 qval=3.19"
[1] "type=7 qval=2.8" "type=7 qval=2.98" "type=7 qval=3.61"
[1] "type=8 qval=2.4" "type=8 qval=2.60666666666667 "type=8 qval=3.33"
[1] "type=9 qval=2.425" "type=9 qval=2.63" "type=9 qval=3.3475"
To get the answer, I generated a random sample, used a correlation test to find a match for the three definition. It's not the most elegant code but....
Here's the code:
##### program to test types
## generate 100 random samples of 100 numbers
set.seed(3)
x <- rnorm(100000,mean = 50, sd = 10)
means <- replicate(100, sample(x, 100))
#create answer matrix
answers <- matrix(ncol=12)
colnames(answers) <- (c("def1","def2","def3","q1","q2","q3","q4","q5","q6","q7","q8","q9"))
printallper <- function(x,bar) {
# get values for per calcs
bar <- sort(bar)
per <- (x/100)*(length(bar)+1)
# get per1
perres1 <<-round(bar[per+1],digits=2)
# get per2
perres2 <<-round(bar[per], digits=2)
#get per3
whole <- floor(per)
dec <- per - whole
low <- bar[per]
high <- bar[per+1]
final <- (dec * (high-low)) + bar[per]
perres3 <<-round(final, digits=2)
# q types
q1 <- round(quantile(bar,(x/100), type = 1), digits = 2)
q2 <-round( quantile(bar,(x/100), type = 2), digits = 2)
q3 <- round(quantile(bar,(x/100), type = 3), digits = 2)
q4 <- round(quantile(bar,(x/100), type = 4), digits = 2)
q5 <- round(quantile(bar,(x/100), type = 5), digits = 2)
q6 <- round(quantile(bar,(x/100), type = 6), digits = 2)
q7 <- round(quantile(bar,(x/100), type = 7), digits = 2)
q8 <- round(quantile(bar,(x/100), type = 8), digits = 2)
q9 <- round(quantile(bar,(x/100), type = 9), digits = 2)
answers <<- rbind(answers,c(perres1,perres2,perres3,q1,q2,q3,q4,q5,q6,q7,q8,q9))
}
#run all percentiles for data in means matrix
apply(means,1,function(x) printallper(25,x))
# correlate various percentiles
cor_answers <- cor(answers[complete.cases(answers),])
#print correlations for 3 deifinitions of percentils with quantiles
cor_answers[1:3,]
The result:
def1 def2 def3 quan1 quan2 quan3 quan4 quan5 quan6 quan7 quan8 quan9
def1 1.0000 0.9763 0.9867 0.9763 0.9941 0.9763 0.9763 0.9941 0.9867 0.9985 0.9920 0.9763
def2 0.9763 1.0000 0.9984 1.0000 0.9939 1.0000 1.0000 0.9939 0.9984 0.9864 0.9958 0.9926
def3 0.9867 0.9984 1.0000 0.9984 0.9984 0.9984 0.9984 0.9984 1.0000 0.9939 0.9993 0.9984
The results show:
percentile definition 1 (def1) matches no quantile type
percentile definition 2 (def2) = matches quantile type 1,3, and 4
percentile definition 3 (def3) = matches quantile type 6