Cumulative pnorm in R - r

I'm looking to calculate a cumulative pnorm through as series.
set.seed(10)
df = data.frame(sample = rnorm(10))
# head(df)
# sample
# 1 0.01874617
# 2 -0.18425254
# 3 -1.37133055
# 4 -0.59916772
# 5 0.29454513
# 6 0.38979430
I would like the result to be
# na
# 0.2397501 # last value of pnorm(df$sample[1:2],mean(df$sample[1:2]),sd(df$sample[1:2]))
# 0.1262907 # last value of pnorm(df$sample[1:3],mean(df$sample[1:3]),sd(df$sample[1:3]))
# 0.4577793 # last value of pnorm(df$sample[1:4],mean(df$sample[1:4]),sd(df$sample[1:4]))
# .
# .
# .
if we can do this preferable in data.table, it would be nice.

You can do:
set.seed(10)
df = data.frame(sample = rnorm(10))
foo <- function(n, x) {
if (n==1) return(NA)
xn <- x[1:n]
tail(pnorm(xn, mean(xn), sd(xn)), 1)
}
sapply(seq(nrow(df)), foo, x=df$sample)
The way of calculation is similar to Calculating cumulative standard deviation by group using R
result:
#> sapply(seq(nrow(df)), foo, x=df$sample)
# [1] NA 0.23975006 0.12629071 0.45777934 0.84662051 0.83168998 0.11925118 0.50873996 0.06607348 0.63103339You can put the result in your dataframe:
df$result <- sapply(seq(nrow(df)), foo, x=df$sample)
Here is a compact version of the calculation (from #lmo)
c(NA, sapply(2:10, function(i) tail(pnorm(df$sample[1:i], mean(df$sample[1:i]), sd(df$sample[1:i])), 1)))

Related

How to run a function with multiple arguments of varying length in a loop in R

I need to run this function like 6000 times with all of its iterations. I have 6 arguments in total for the function. The first 3 of them go hand in hand and number 75. The next argument has 9 values. And the last 2 arguments have 3 values.
#require dplyr
#data is history as list
matchloop <- function(data, data2, x, a, b, c) {
#history as list
split <- data
#history for reference
fh <- FullHistory
#start counter
n<-1
#end counter
m<-a
tempdf0.3 <- fh
#set condition for loop
while(nrow(tempdf0.3) > 1 && m <= (nrow(data2))*b) {
#put history into a variable
tempdf0.0 <- split
#put fh into a variable
tempdf0.5 <- fh
#put test path into variable from row n to m
tempdf0.1 <- as.data.frame(data2[n:m,], stringsAsFactors = FALSE)
#change column name of test path
colnames(tempdf0.1) <- "directions"
#put row n to m of history into variable
tempdf0.2 <- lapply(tempdf0.0, function(df) df[n:m,])
#put output into output
tempdf0.3 <- orderedDistancespos(tempdf0.2, tempdf0.1,
"allPaths","directions")
#add to output routeID based on reference from fh-the test path ID
tempdf0.3 <- mutate(tempdf0.3, routeID = (subset(tempdf0.5, routeID
!= x)$routeID))
#reduce output based on the matched threshold
tempdf0.3 <- subset(tempdf0.3, dists >= a*c)
#create new history based on the IDs remaining in output
split <- split[as.character(tempdf0.3$routeID)]
#create new history for reference based on the IDs remaining in
output
fh <- subset(fh, routeID %in% tempdf0.3$routeID)
#increase loop counter
n <- n+a
#increase loop counter
m <- n+(a-1)
}
#show output
mylist <- list(tempdf0.3, nrow(tempdf0.3))
return(mylist)
}
I tried putting the 3 arguments with 75 elements in them to their own lists and use mapply. This works. But even at this level I still have to run the code 81 times to cover all the variables because as far as I understand mapply recycles based on the length of the longest argument.
mapply(matchloop, mylist2,mylist3,mylist4, MoreArgs = list(a=a, b=b, c=c))
data is a list of dataframes
data2 is a dataframe
x, a, b, c are all numerical.
Right now I'm trying to streamline my output so that its in just 1 line. So if possible I would like 1 single csv output with all 6000+ lines.
You can combine mapply and apply function to cycle through all possible combination of a, b and c variables. To create all possible combinations you can use expand.grid. Finally you can contatenate list of rows into a data.frame with the help of do.call and rbind functions as follows:
matchloop_stub <- matchloop <- function(data, data2, x, a, b, c) {
# stub
c(d = sum(data), d2 = sum(data2), x = sum(x), a = a, b = b, c = c, r = a + b + c)
}
set.seed(123)
mylist2 <- replicate(75, data.frame(rnorm(1)))
mylist3 <- replicate(75, data.frame(rnorm(2)))
mylist4 <- replicate(75, data.frame(rnorm(3)))
a <- 1:9
b <- 1:3
c <- 1:3
abc <- expand.grid(a, b, c)
names(abc) <- c("a", "b", "c")
xs <- apply(abc, 1, function(x) (mapply(matchloop_stub, mylist2, mylist3, mylist4, x[1], x[2], x[3], SIMPLIFY = FALSE)))
df <- do.call(rbind, do.call(rbind, xs))
write.csv(df, file = "temp.csv")
res <- read.csv("temp.csv")
nrow(res)
# [1] 6075
head(res)
# X d d2 x a b c r
# 1 1 -0.5604756 0.7407984 -1.362065 1 1 1 3
# 2 2 -0.5604756 0.7407984 -1.362065 2 1 1 4
# 3 3 -0.5604756 0.7407984 -1.362065 3 1 1 5
# 4 4 -0.5604756 0.7407984 -1.362065 4 1 1 6
# 5 5 -0.5604756 0.7407984 -1.362065 5 1 1 7
# 6 6 -0.5604756 0.7407984 -1.362065 6 1 1 8

Combine lists in forloop

I have a list of chromosomes chromosomes <- c(1:2, "X", "Y") that I am iterating over to generate random data n times for each chromosome.
I am doing this first by iterating over the chromosomes and generating the data using generateData() and then adding these to a list which I then combine into a data frame outside of the loop using bp_data <- as.data.frame(do.call(rbind, simByChrom)):
chromosomes <- c(1:2, "X", "Y")
simByChrom <- list()
for (c in chromosomes){
n <- sample(1:5,1)
cat(paste("Simulating", n, "breakpoints on chromosome", c), "\n")
bp_data <- generateData(c, n)
simByChrom[[c]] <- bp_data
}
bp_data <- as.data.frame(do.call(rbind, simByChrom))
rownames(bp_data) <- NULL
# generate dummy data
generateData <- function(c, n){
df <- data.frame(chrom = rep(c, n),
pos= sample(1:10000, n))
return(df)
}
chrom pos
1 1 7545
2 2 5798
3 2 3863
4 3 4036
5 3 9347
6 3 4749
I would like to iterate over this multiple times and record the iteration number in bp_data$iteration, to produce a data frame that looks like this:
chrom pos iteration
1 7215 1
1 4606 1
2 8282 1
2 3501 1
2 4350 1
2 6044 1
X 2467 1
Y 2816 1
Y 8848 1
Y 2304 1
Y 4235 1
1 3760 2
1 8205 2
1 4735 2
2 3061 2
X 56 2
X 1722 2
X 2430 2
X 6749 2
X 2081 2
Y 9646 2
However, I'm unsure how to do this. I've tried:
iterations <- 2
for (i in (1:iterations)){
cat("Running iteration", i, "\n")
simByChrom <- list()
for (c in chromosomes){
n <- sample(1:5,1)
cat(paste("Simulating", n, "breakpoints on chromosome", c), "\n")
bp_data <- generateData(c, n)
bp_data$iteration <- i
simByChrom[[c]] <- bp_data
# or
# simByChrom[[c]][[i]] <- bp_data
# or
# simByChrom[[c]] <- bp_data
# simByChrom[[c]]$iteration <- i
}
bp_data <- as.data.frame(do.call(rbind, simByChrom))
rownames(bp_data) <- NULL
}
But this results in only the last iteration being recorded.
Can anyone suggest how I can achieve my desired result?
The reason you are only seeing the last iteration in your result is because bp_data is being over-written each time through the for loop. You need to make sure you save each iteration result separately and then combine them together at the end.
I believe just a few minor adjustments to what you already have will do the trick:
iterations <- 2
#create empty list to store each iteration result
bp_data <- list()
#run each iteration
for (i in 1:iterations){
cat("Running iteration", i, "\n")
simByChrom <- list()
for (c in chromosomes){
n <- sample(1:5,1)
cat(paste("Simulating", n, "breakpoints on chromosome", c), "\n")
aa <- generateData(c, n)
aa$iteration <- i
simByChrom[[c]] <- aa
}
result <- as.data.frame(do.call(rbind, simByChrom))
rownames(result) <- NULL
bp_data[[i]] <- result
}
#combine each iteration into one data frame
final <- as.data.frame(do.call(rbind, bp_data))

min max scaling/normalization in r for train and test data

I am looking to create a function that takes in the training set and the testing set as its arguments, min-max scales/normalizes and returns the training set and uses those same values of minimum and range to min-max scale/normalize and return the test set.
So far this is the function I have come up with:
min_max_scaling <- function(train, test){
min_vals <- sapply(train, min)
range1 <- sapply(train, function(x) diff(range(x)))
# scale the training data
train_scaled <- data.frame(matrix(nrow = nrow(train), ncol = ncol(train)))
for(i in seq_len(ncol(train))){
column <- (train[,i] - min_vals[i])/range1[i]
train_scaled[i] <- column
}
colnames(train_scaled) <- colnames(train)
# scale the testing data using the min and range of the train data
test_scaled <- data.frame(matrix(nrow = nrow(test), ncol = ncol(test)))
for(i in seq_len(ncol(test))){
column <- (test[,i] - min_vals[i])/range1[i]
test_scaled[i] <- column
}
colnames(test_scaled) <- colnames(test)
return(list(train = train_scaled, test = test_scaled))
}
The definition of min max scaling is similar to this question asked earlier on SO - Normalisation of a two column data using min and max values
My questions are:
1. Is there a way to vectorize the two for loops in the function? e.g. using sapply()
2. Are there any packages that allow us to do what we are looking to do here?
Here is the code for the min-max normalization. See this Wikipedia page for the formulae, and also other ways of performing feature scaling.
normalize <- function(x, na.rm = TRUE) {
return((x- min(x)) /(max(x)-min(x)))
}
To get a vector, use apply instead of lapply.
as.data.frame(apply(df$name, normalize))
Update to address Holger's suggestion.
If you want to pass additional arguments to min() and max(), e.g., na.rm, then you can use:
normalize <- function(x, ...) {
return((x - min(x, ...)) /(max(x, ...) - min(x, ...)))
}
x <- c(1, NA, 2, 3)
normalize(a)
# [1] NA NA NA NA
normalize(a, na.rm = TRUE)
# 0.0 NA 0.5 1.0
Just keep in mind, that whatever you pass to min() via the ellipsis ... you also implicitly pass to max(). In this case, this shouldn't be a big problem since both min() and max() share the same function signature.
Regarding your 2nd question, you can use the caret package:
library(caret)
train = data.frame(a = 1:3, b = 10:12)
test = data.frame(a = 1:6, b = 7:12)
pp = preProcess(train, method = "range")
predict(pp, train)
# a b
# 1 0.0 0.0
# 2 0.5 0.5
# 3 1.0 1.0
predict(pp, test)
# a b
# 1 0.0 -1.5
# 2 0.5 -1.0
# 3 1.0 -0.5
# 4 1.5 0.0
# 5 2.0 0.5
# 6 2.5 1.0
This packages also defines other transformation methods, see: http://machinelearningmastery.com/pre-process-your-dataset-in-r/
set.seed(1984)
### simulating a data set
df <- data.frame(var1 = rnorm(100,5,3),
var2 = rpois(100,15),
var3 = runif(50,90,100))
df_train <- df[1:60,]
df_test <- df[61:100,]
## the function
normalize_data <- function(train_set, test_set) ## the args are the two sets
{
ranges <- sapply(train_set, function(x) max(x)-min(x)) ## range calculation
normalized_train <- train_set/ranges # the normalization
normalized_test <- test_set/ranges
return(list(ranges = ranges, # returning a list
normalized_train= normalized_train,
normalized_test =normalized_test ))
}
z <- normalize_data(df_train, df_test) ## applying the function
## the results
z$ranges
var1 var2 var3
13.051448 22.000000 9.945934
> head(z$normalized_train)
var1 var2 var3
1 0.47715854 1.1492978 7.289028
2 0.18322387 0.4545455 4.280883
3 0.69451066 1.3070668 9.703761
4 -0.04125108 1.6090169 7.277882
5 0.35731555 0.7272727 4.133561
6 0.86120315 0.6032616 9.246209
> head(z$normalized_train)
var1 var2 var3
1 0.47715854 1.1492978 7.289028
2 0.18322387 0.4545455 4.280883
3 0.69451066 1.3070668 9.703761
4 -0.04125108 1.6090169 7.277882
5 0.35731555 0.7272727 4.133561
6 0.86120315 0.6032616 9.246209

Standard deviation for each row value and constant value

N <- c(1,3,4,6)
a <- c(3,4,5,6)
b <- c(4,5,6,7)
w <- c(5,6,7,6)
dat1 <- data.frame(N,May = a, April = b,June = w)
N May April June
1 1 3 4 5
2 3 4 5 6
3 4 5 6 7
4 6 6 7 6
I need a data frame, where each value is sd of N value and row value
sd(c(1,3) sd(c(1,4) sd(c(1,5) # for 1st row
sd(c(3,4) sd(c(3,5) sd(c(3,6) # for second and so on.
Try this:
The data:
Norm <- c(1,3,4,6)
a <- c(3,4,5,6)
b <- c(4,5,6,7)
w <- c(5,6,7,6)
mydata <- data.frame(Norm=Norm,May = a, April = b,June = w)
Solution:
finaldata <- do.call('cbind',lapply(names(mydata)[2:4], function(x) apply(mydata[c("Norm",x)],1,sd)))
I hope it helps.
Piece of advice:
Please refrain from using names like data and norm for your variable names. They can easily conflict with things that are native to R. For example norm is a function in R, and so is data.
I think I got it
x=matrix(data=NA, nrow=4, ncol=3)
for(j in 1:3){
for(i in 1:4){
x[i, j] <- sd(data[i, c(i,(j+1))])
x
}
}

Fill NA values with the trailing row value times a growth rate?

What would be a good way to populate NA values with the previous value times (1 + growth)?
df <- data.frame(
year = 0:6,
price1 = c(1.1, 2.1, 3.2, 4.8, NA, NA, NA),
price2 = c(1.1, 2.1, 3.2, NA, NA, NA, NA)
)
growth <- .02
In this case, I would want the missing values in price1 to be filled with 4.8*1.02, 4.8*1.02^2, and 4.8*1.02^3. Similarly, I would want the missing values in price2 to be filled with 3.2*1.02, 3.2*1.02^2, 3.2*1.02^3, and 3.2*1.02^4.
I've tried this, but I think it needs to be set to repeat somehow (apply?):
library(dplyr)
df %>%
mutate(price1 = ifelse(is.na(price1),
lag(price1) * (1 + growth), price1
))
I'm not using dplyr for anything else (yet), so something from base R or plyr or similar would be appreciated.
Assuming only trailing NAs:
NAgrow <- function(x,growth=0.02) {
isna <- is.na(x)
lastval <- tail(x[!isna],1)
x[isna] <- lastval*(1+growth)^seq(sum(isna))
return(x)
}
If there are interior NA values as well this would get a little trickier.
Apply to all columns except the first:
df[-1] <- lapply(df[-1],NAgrow)
## year price1 price2
## 1 0 1.100000 1.100000
## 2 1 2.100000 2.100000
## 3 2 3.200000 3.200000
## 4 3 4.800000 3.264000
## 5 4 4.896000 3.329280
## 6 5 4.993920 3.395866
## 7 6 5.093798 3.463783
A compact base R solution can be obtained using Reduce:
growthfun <- function(x, y) if (is.na(y)) (1+growth)*x else y
replace(df, TRUE, lapply(df, Reduce, f = growthfun, acc = TRUE))
giving:
year price1 price2
1 0 1.100000 1.100000
2 1 2.100000 2.100000
3 2 3.200000 3.200000
4 3 4.800000 3.264000
5 4 4.896000 3.329280
6 5 4.993920 3.395866
7 6 5.093798 3.463783
Note: The data in the question has no non-trailing NA values but if there were some then we could use na.fill from zoo to first replace the trailing NAs with a special value, such as NaN, and look for it instead of NA:
library(zoo)
DF <- as.data.frame(na.fill(df, c(NA, NA, NaN)))
growthfun <- function(x, y) if (is.nan(y)) (1+growth)*x else y
replace(DF, TRUE, lapply(DF, Reduce, f = growthfun, acc = TRUE))
The following solution based on rle works with NA in any position and does not rely on looping to fill in the missing values:
NAgrow.rle <- function(x) {
if (is.na(x[1])) stop("Can't have NA at beginning")
r <- rle(is.na(x))
na.loc <- which(r$values)
b <- rep(cumsum(r$lengths)[na.loc-1], r$lengths[na.loc])
x[is.na(x)] <- ave(x[b], b, FUN=function(y) y[1]*(1+growth)^seq_along(y))
x
}
df[,-1] <- lapply(df[,-1], NAgrow.rle)
# year price1 price2
# 1 0 1.100000 1.100000
# 2 1 2.100000 2.100000
# 3 2 3.200000 3.200000
# 4 3 4.800000 3.264000
# 5 4 4.896000 3.329280
# 6 5 4.993920 3.395866
# 7 6 5.093798 3.463783
I'll drop in two additional solutions using for loops, one in base R and one in Rcpp:
NAgrow.for <- function(x) {
for (i in which(is.na(x))) {
x[i] <- x[i-1] * (1+growth)
}
x
}
library(Rcpp)
cppFunction(
"NumericVector NAgrowRcpp(NumericVector x, double growth) {
const int n = x.size();
NumericVector y(x);
for (int i=1; i < n; ++i) {
if (R_IsNA(x[i])) {
y[i] = (1.0 + growth) * y[i-1];
}
}
return y;
}")
The solutions based on rle (crimson and josilber.rle) take about twice as long as the simple solution based on a for loop (josilber.for), and as expected the Rcpp solution is the fastest, running in about 0.002 seconds.
set.seed(144)
big.df <- data.frame(ID=1:100000,
price1=sample(c(1:10, NA), 100000, replace=TRUE),
price2=sample(c(1:10, NA), 100000, replace=TRUE))
crimson <- function(df) apply(df[,-1], 2, function(x){
if(sum(is.na(x)) == 0){return(x)}
## updated with optimized portion from #josilber
r <- rle(is.na(x))
na.loc <- which(r$values)
b <- rep(cumsum(r$lengths)[na.loc-1], r$lengths[na.loc])
lastValIs <- 1:length(x)
lastValIs[is.na(x)] <- b
x[is.na(x)] <-
sapply(which(is.na(x)), function(i){
return(x[lastValIs[i]]*(1 + growth)^(i - lastValIs[i]))
})
return(x)
})
ggrothendieck <- function(df) {
growthfun <- function(x, y) if (is.na(y)) (1+growth)*x else y
lapply(df[,-1], Reduce, f = growthfun, acc = TRUE)
}
josilber.rle <- function(df) lapply(df[,-1], NAgrow.rle)
josilber.for <- function(df) lapply(df[,-1], NAgrow.for)
josilber.rcpp <- function(df) lapply(df[,-1], NAgrowRcpp, growth=growth)
library(microbenchmark)
microbenchmark(crimson(big.df), ggrothendieck(big.df), josilber.rle(big.df), josilber.for(big.df), josilber.rcpp(big.df))
# Unit: milliseconds
# expr min lq mean median uq max neval
# crimson(big.df) 98.447546 131.063713 161.494366 152.477661 183.175840 379.643222 100
# ggrothendieck(big.df) 437.015693 667.760401 822.530745 817.864707 925.974019 1607.352929 100
# josilber.rle(big.df) 59.678527 115.220519 132.874030 127.476340 151.665657 262.003756 100
# josilber.for(big.df) 21.076516 57.479169 73.860913 72.959536 84.846912 178.412591 100
# josilber.rcpp(big.df) 1.248793 1.894723 2.373469 2.190545 2.697246 5.646878 100
It looks like dplyr can't handle access newly assigned lag values. Here is a solution that should work even if the NA's are in the middle of a column.
df <- apply(
df, 2, function(x){
if(sum(is.na(x)) == 0){return(x)}
## updated with optimized portion from #josilber
r <- rle(is.na(x))
na.loc <- which(r$values)
b <- rep(cumsum(r$lengths)[na.loc-1], r$lengths[na.loc])
lastValIs <- 1:length(x)
lastValI[is.na(x)] <- b
x[is.na(x)] <-
sapply(which(is.na(x)), function(i){
return(x[lastValIs[i]]*(1 + growth)^(i - lastValIs[i]))
})
return(x)
})
You can try such function
test <- function(x,n) {
if (!is.na(df[x,n])) return (df[x,n])
else return (test(x-1,n)*(1+growth))
}
a=1:nrow(df)
lapply(a, FUN=function(i) test(i,2))
unlist(lapply(a, FUN=function(i) test(i,2)))
[1] 1.100000 2.100000 3.200000 4.800000 4.896000 4.993920 5.093798

Resources