Correlation in R - r

I am trying to produce same graph as in example but using different data. Here is my code:
library(SciViews)
args <- commandArgs(TRUE)
pdfname <- args[1]
datafile <- args[2]
pdf(pdfname)
eqdata = read.csv(datafile , header = T,sep=",")
(longley.cor <- correlation(eqdata$feqs))
# Synthetic view of the correlation matrix
summary(longley.cor)
p <- plot(longley.cor)
print(p)
dev.off()
and the data
ques,feqs
"abc",20
"def",10
"ghi",40
"jkl",10
"mno",20
"pqr",10
I use this command
Rscript ./rscript/correlation.R "/home/co.pdf" "/home/data_correlation.csv"
Code output
I want to generate like this

You can try the plotcorr function in the ellipse package. The help pages gives among others this example:
Which seems to be what you are looking for?
Edit:
You can add text afterwards, the circles are placed on a 1 - number of vars grid. E.g.:
data(mtcars)
Corrmat <- cor(mtcars)
cols <- ifelse(Corrmat>0, rgb(0,0,abs(Corrmat)), rgb(abs(Corrmat),0,0))
library(ellipse)
plotcorr(Corrmat,col=cols)
n <- nrow(Corrmat)
for (i in 1:n)
{
for (j in 1:n)
{
text(j,i,round(Corrmat[n-i+1,j],2),col="white",cex=0.6)
}
}

Related

How can I transform the following script into a function for a package?

I have the following data as an example:
IID<-c(1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4)
KB<-c(345,1234,2000,1567,
376,1657,9656,234,
1865,1565,123,111,
1999,2032,1565,234)
data<-cbind(IID,KB)
I wrote a script to process it and give me some outcomes:
results_kb <- function(class) {
this_iids_roh <- dat[class,]
my_list<-c("Sum_long"=sum(this_iids_roh$KB[this_iids_roh$KB>=1500]),
"N_long"=length(this_iids_roh$KB[this_iids_roh$KB>=1500]),
"Sum_short"=sum(this_iids_roh$KB[this_iids_roh$KB<1500]),
"N_short"=length(this_iids_roh$KB[this_iids_roh$KB<1500]))
return(my_list)
}
dat <- data.table::as.data.table(data)
dat$IID<-as.factor(dat$IID)
data.table::setkey(dat,"IID")
results <- c()
nLevels <- length(levels(dat$IID))
start <- proc.time()
pb <- txtProgressBar(min = 0, max = nLevels, style = 3)
for (i in 1:nLevels){
this_iid <- levels(dat$IID)[i]
results <- rbind(results,results_kb(this_iid))
setTxtProgressBar(pb,i)
}
close(pb)
proc.time()-start
results<-data.frame(levels(dat$IID),results)
results$IID<-results$levels.dat.IID.
results[results==0] <- NA
I created first a function that gives me the outcome table and then I processed the dataset I want to analyze. I added a time indicator because this script is to process very large samples. As you can test it works perfectly.
Now I want to write my own package, so I have to reduce all this into functions. The first part of the script is already in a function. For the second part I have tried the following:
rohsum<-function(data){
dat <- data.table::as.data.table(data)
dat$IID<-as.factor(dat$IID)
data.table::setkey(dat,"IID")
results <- c()
nLevels <- length(levels(dat$IID))
start <- proc.time()
pb <- txtProgressBar(min = 0, max = nLevels, style = 3)
for (i in 1:nLevels){
this_iid <- levels(dat$IID)[i]
results <- rbind(results,results_kb(this_iid))
setTxtProgressBar(pb,i)
}
close(pb)
proc.time()-start
results<-data.frame(levels(dat$IID),results)
results$IID<-results$levels.dat.IID.
results[results==0] <- NA
return(results)
}
However this seems not to work, since when I tried to run rohsum(data) I get the following error message:
Error in results_kb(this_iid) : object 'dat' not found
Even more, If I tried to run several times I get the following:
How can I solve this issue to be able to build my own package?

Using foreach instead of for loop

I am trying to learn foreach to parallelise my task
My for-loop looks like this:
# create an empty matrix to store results
mat <- matrix(-9999, nrow = unique(dat$mun), ncol = 2)
for(mun in unique(dat$mun)) {
dat <- read.csv(paste0("data",mun,".csv")
tot.dat <- sum(dat$x)
mat[mat[,1]== mun,2] <- tot.dat
}
unique(dat$mun) has a length of 5563.
I want to use foreach to pararellise my task.
library(foreach)
library(doParallel)
# number of iterations
iters <- 5563
foreach(icount(iters)) %dopar% {
mun <- unique(dat$mun)[mun] # this is where I cannot figure out how to assing mun so that it read the data for mun
dat <- read.csv(paste0("data",mun,".csv")
tot.dat <- sum(dat$x)
mat[mat[,1]== mun,2] <- tot.dat
}
This could be one solution.
Do note that I'm using windows here, and i specified registerDoParallel() for it to work.
library(foreach)
library(doParallel)
# number of iterations
iters <- 5563
registerDoParallel()
mun <- unique(dat$mun)
tableList <- foreach(i=1:iters) %dopar% {
dat <- read.csv(paste0("data",mun[i],".csv")
tot.dat <- sum(dat$x)
}
unlist(tableList)
Essentially, whatever result inside {...} will be stored inside a list.
In this case, the result (tot.dat which is a number) is compiled in tableList, and by performing unlist() we can convert it to a vector for further use.
The result inside {...} can be anything, a single number, a vector, a dataframe, or anything.
Another approach for your problem would be to combine all existing data together, labelling it with its appropriate source file, so the middle component will look something like
library(plyr)
tableAll <- foreach(i=1:iters) %dopar% {
dat <- read.csv(paste0("data",mun[i],".csv")
dat$source = mun[i]
}
rbind.fill(tableAll)
Then we can use it for further analysis.

R issues returning output from function using loop

I am having issues returning output from a function I created in R when I use it in a loop. I am trying to combine the output form multiple MCMC models into one R object.
The function:
get_scrUN_output <- function(filename){
out <- filename
nam<-c("sigma","lam0","psi", "N")
nam<-match(nam,dimnames(out[[1]]$sims)[[2]])
out.lst<-mcmc.list(
as.mcmc(out[[1]]$sims[200001:300000,nam]),
as.mcmc(out[[2]]$sims[200001:300000,nam]),
as.mcmc(out[[3]]$sims[200001:300000,nam]))
s <- summary(out.lst)
gd <- gelman.diag(out.lst,multivariate = FALSE)
output_table <- rbind(as.data.frame(t(s$statistics)),
as.data.frame(t(s$quantiles)),
as.data.frame(t(gd$psrf)))
return(output_table) }
The code I use to create a list of RData mcmc outputs to run through the function:
scrUN.ET <- list.files(getwd(),"out.*ET.RData")
scrUN.lst <- as.vector(substring(scrUN.ET,1))
scrUN.lst <- str_sub(scrUN.lst, 1, str_length(scrUN.lst)-3)
>scrUN.lst
[1] "BBout11FL" "BBout11TL" "BBout12TL" "BBout13FL" "BBout13TL"
When I use the function on an individual output file, it works:
get_scrUN_output(BBout11FL)
sigma lam0 psi N
Mean 130.43594323 14.5319368 0.3361405211 335.8042733
SD 7.28386725 9.7311139 0.2743725813 274.6828277
Naive SE 0.01329846 0.0177665 0.0005009335 0.5014999
Time-series SE 1.28032869 1.3886577 0.0360607870 36.5692414
2.5% 118.37718370 0.6129902 0.0300165600 30.0000000
25% 124.29743884 5.7535456 0.0958156210 95.0000000
50% 130.40628214 15.1264454 0.2426328827 242.0000000
75% 135.99836262 19.9685209 0.5403864215 541.0000000
97.5% 145.11615201 34.9438198 0.9298185748 930.0000000
Point est. 1.59559993 4.4590599 1.0677998255 1.0678381
Upper C.I. 2.56854388 9.5792520 1.2186078069 1.2186933
But when I try to run all output files through the function using a loop I get a NULL output.
scrUN.output <- rbind(
for (i in seq_along(scrUN.lst)){
get_scrUN_output(get(scrUN.lst[i]))
}
)
>scrUN.output
NULL
Thanks!
The reason is you're rbind-ing nothing.
Here's a simplified example demonstrating what your code above is doing –– the for loop isn't assigning anything to a variable which is why you're getting NULL at the end.
xx <- rbind(
for(i in c(1,2)){
i
}
)
print(xx) # NULL
Try this instead:
scrUN.output <- list() # initialize a list
for (i in seq_along(scrUN.lst)){
# update the list contents
scrUN.output[[i]] <- get_scrUN_output(get(scrUN.lst[i]))
}
# finally, rbind eveything
scrUN.output <- do.call(rbind, scrUN.output)
Or better yet, use lapply:
scrUN.output <- lapply(scrUN.lst, get_scrUN_output)
scrUN.output <- do.call(rbind, scrUN.output)
I think this is what you're asking for. This is an edit of the final code section. Your were using rbind on nothing since nothing is being returned by the for-loop.
scrUN.output <- lapply(scrUN.lst, function(i) get_scrUN_output(get(i)))
scrUN.output <- do.call(rbind, scrUN.output)
scrUN.output

How to show the progress bar in raster calc function?

how I can use the progress bar like this example inside of a function written for use for use in calc raster function?
I have a huge dataset to process and I desire to use the progress bar to control the duration of process. I tryied to use like this, the function (of process) works prefectly, but, do not show the progress bar.
# PROGRESS BAR IN CALC RASTER EXAMPLE
# create data
r <- raster(nrow=10, ncol=10)
dataset <- list()
for (i in 1:20) {
dataset[i] <- setValues(r, rnorm(ncell(r), i, 3) )
}
dataset <- stack(dataset)
## function to apply
pixel <-getValuesBlock(s1, row=1, nrows=1, col=1, ncols=1, lyrs=1:nlayers(s1))
CropAnalysis <- function (pixel, ...){
gc()
pb <- txtProgressBar(...)
# test : if is No data the return is
if (identical(x = is.na(pixel), y = rep(TRUE,length(pixel)))) {NA}else{
averageOfhigher <- mean(pixel[pixel > 10], na.rm=T)
averageOflower <- mean(pixel[pixel < 10], na.rm=T)
return(c(averageOfhigher, averageOflower))
}
setTxtProgressBar(pb)}
# applying calc finction
data_process<-calc(x=dataset, fun=CropAnalysis, forcefun=TRUE, forceapply=TRUE)
You can use the progress argument that is build in to most functions in the raster package. It is only shown when writing in chunks (because the dataset is large).
# example data
library(raster)
r <- raster(nrow=10, ncol=10)
d <- stack(lapply(1:20, function(i) setValues(r, rnorm(ncell(r), i, 3) )) )
f <- function(pixel, ...){
if (all(is.na(pixel))) {
c(NA, NA) # note the two NAs to match the other case
} else {
averageOfhigher <- mean(pixel[pixel > 10], na.rm=TRUE)
averageOflower <- mean(pixel[pixel < 10], na.rm=TRUE)
c(averageOfhigher, averageOflower)
}
}
Do not use the line below in a script. It is only needed in this
toy example to trigger writing in chunks such that the progress bar appears
rasterOptions(todisk=TRUE)
But do use the progress argument (either "text" or "window")
r <- calc(d, fun=f, progress='text')
One simple solution
you can use the rasterOptions function that is built in the raster package.
as an example
rasterOptions(progress = 'text',timer=TRUE) will show you the progress, as the example you showed, and the time for each used function from the raster package.
check this link for more advanced options:
https://rdrr.io/cran/raster/man/rasterOptions.html

How do I use $ for output components in R?

First, my code works perfectly. I simply need to be able to call the year and seasonal components out of BestSolarData using $ with:
BestSolarData$year
BestSolarData$seasonal
I have these written at the end of my code. The year I know comes from BestYear and seasonal come from BestData in the ForLoopSine function.
Any help to be able to access the components using $?
SineFit <- function (ToBeFitted)
{
msvector <- as.vector(ToBeFitted)
y <- length(ToBeFitted)
x <- 1:y
MS.nls <- nls(msvector ~ a*sin(((2*pi)/12)*x+b)+c, start=list(a=300, b=0, c=600))
summary(MS.nls)
MScoef <- coef(MS.nls)
a <- MScoef[1]
b <- MScoef[2]
c <- MScoef[3]
x <- 1:12
FittedCurve <- a*sin(((2*pi)/12)*x+b)+c
#dev.new()
#layout(1:2)
#plot(ToBeFitted)
#plot(FittedCurve)
return (FittedCurve)
}
ForLoopSine <- function(PastData, ComparisonData)
{
w<-start(PastData)[1]
t<-end(PastData)[1]
BestDiff <- 9999
for(i in w:t)
{
DataWindow <- window(PastData, start=c(i,1), end=c(t,12))
Datapredict <- SineFit(DataWindow)
CurrDiff <- norm1diff(Datapredict, ComparisonData)
if (CurrDiff < BestDiff)
{
BestDiff <- CurrDiff
BestYear <- i
BestData <- Datapredict
}
}
print(BestDiff)
print(BestYear)
return(BestData)
}
RandomFunction <- function(PastData, SeasonalData)
{
w <- start(PastData)[1]
t <- end(PastData)[1]
Seasonal.ts <- ts(SeasonalData, st = c(w,1), end = c(t,12), fr = 12)
Random <- PastData-Seasonal.ts
layout(1:3)
plot(SeasonalData)
plot(Seasonal.ts)
plot(Random)
return(Random)
}
BestSolarData <- ForLoopSine(MonthlySolarPre2015, MonthlySolar2015)
RandomComp <- RandomFunction (MonthlySolarPre2015, BestSolarData)
acf(RandomComp)
BestSolarData$year
BestSolarData$seasonal
As far as I understand your problem, you would like to retrieve the year component of BestSolarData with BestSolarData$year. But BestSolarData is returned by ForLoopSine, which is itself named DataPredict and is returned the SineFit function. It seems to be a vector and not a data.frame, so $ cannot work here.
Your example is not reproducible and this may help you find a solution. See this post for more details.

Resources