I am creating a panel data frame. It is a panel of schools. To this panel I want to merge the first closest weather station, then the second, third, etc until the 10th closest one. I wrote a loop that does this for different variables: maximum temperature, minimum temperature, precipitation, etc. The issue that I am having is that it seems that I am unnecessarily allocating memory somewhere inside this loop since I run out of memory.
I know I have enough memory to create the panel, since I did it once already without the loop. I am working on Windows (64-bit) with 8 GB of RAM. I have a sample of 7,800 schools and 800 weather stations for the 2010-2015 period.
This is a reproducible example with only 5 schools, 10 weather stations and 2 months of data and matching only the 3 closest stations. The real example is 7800 schools, 800 weather stations, 5 years of data and matching the 10 closest stations.
# Reproducible example: build a school-by-day panel and merge on the 1st..3rd
# closest weather station for each of three weather variables.
library(data.table)
# One wide distance table per variable: each row = one school plus the IDs of,
# and distances to, its 1st/2nd/3rd closest stations.
# NOTE(review): seq(1:5) is just 1:5; station IDs are drawn from 0..9.
Dist_Temp_Max<-data.frame(ID_School=seq(1:5),ID_Station_1=floor(runif(5, min=0, max=10)),Dist_1=floor(runif(5, min=0, max=10)),
ID_Station_2=floor(runif(5, min=0, max=10)),Dist_2=floor(runif(5, min=11, max=100)),
ID_Station_3=floor(runif(5, min=0, max=10)),Dist_3=floor(runif(5, min=101, max=200)))
Dist_Temp_Min<-data.frame(ID_School=seq(1:5),ID_Station_1=floor(runif(5, min=0, max=10)),Dist_1=floor(runif(5, min=0, max=10)),
ID_Station_2=floor(runif(5, min=0, max=10)),Dist_2=floor(runif(5, min=11, max=100)),
ID_Station_3=floor(runif(5, min=0, max=10)),Dist_3=floor(runif(5, min=101, max=200)))
Dist_Prec<-data.frame(ID_School=seq(1:5),ID_Station_1=floor(runif(5, min=0, max=10)),Dist_1=floor(runif(5, min=0, max=10)),
ID_Station_2=floor(runif(5, min=0, max=10)),Dist_2=floor(runif(5, min=11, max=100)),
ID_Station_3=floor(runif(5, min=0, max=10)),Dist_3=floor(runif(5, min=101, max=200)))
# Daily date vector for the sample period.
years<-seq.Date(as.Date("2014-01-01"),as.Date("2015-02-28"),by="1 day")
# Station-by-day weather readings.
# NOTE(review): the grid is built from IDs 1..5 but labelled ID_Station, while
# the Dist_* tables draw station IDs from 0..9 -- later all.x merges will
# therefore produce NAs. Presumably sample-data sloppiness; confirm.
Weather_Data<-data.frame(ID_School=seq(1:5))
Weather_Data<-expand.grid(Weather_Data$ID_School,years)
names(Weather_Data)<-c("ID_Station","Date")
Weather_Data$Temp_Max_T<-runif(nrow(Weather_Data), min=10, max=40)
# NOTE(review): $Temp_Max relies on R's partial matching of Temp_Max_T.
Weather_Data$Temp_Min_T<-Weather_Data$Temp_Max-10
Weather_Data$Prec_T<-floor(runif(nrow(Weather_Data),min=0, max=10))
# Composite merge key: "stationID-date".
Weather_Data$Cod_Merge<-paste(Weather_Data$ID_Station,Weather_Data$Date,sep="-")
#Add Values per Station
var_list<-c("Temp_Max","Temp_Min","Prec")
for (i in var_list) {
# Look up the distance table for this variable by name.
dist<-paste0("Dist_",i)
dist<-get(dist)
dist<-as.data.frame(subset(dist,!is.na(dist$ID_Station_1)))
# Closest station (rank 1): expand school rows to school x day, key on
# "station-date", then attach the readings.
matr<-dist[c("ID_School","ID_Station_1","Dist_1")]
matr<-setDT(matr)[, list(Date=years,ID_Station_1=ID_Station_1,Dist_1=Dist_1) , ID_School]
matr$Cod_Merge<-paste(matr$ID_Station_1,matr$Date,sep="-")
matr<-as.data.frame(matr[,c("Cod_Merge","ID_School","Date","ID_Station_1","Dist_1")])
matr<-merge(matr,Weather_Data[c("Cod_Merge",paste0(i,"_T"))],by="Cod_Merge",all.x=T)
# Re-key on "school-date" so later station ranks can be merged on.
matr$Cod_Merge<-paste(matr$ID_School,matr$Date,sep="-")
names(matr)[6]<-paste0(i,"_T_1")
Sys.sleep(0.1)
print(i)
# Station ranks 2..3: same expand-and-merge, joined back onto matr.
for(n in 2:3) {
matr2<-dist[c("ID_School",paste0("ID_Station_",n),paste0("Dist_",n))]
# NOTE(review): this line re-subsets `dist` and OVERWRITES the 3-column
# selection made on the previous line -- matr2 now carries ALL station
# columns through the expand.grid/merge below. This is the unnecessary
# allocation (memory blowup) the question describes.
matr2<-subset(dist,!is.na(dist[paste0("ID_Station_",n)]))
matr3<-expand.grid(matr2$ID_School,years)
names(matr3)<-c("ID_School","Date")
matr3<-matr3[order(matr3$ID_School,matr3$Date), ]
matr2<-merge(matr3,matr2,by="ID_School")
rm(matr3)
Sys.sleep(0.1)
print(i)
matr2$Cod_Merge<-paste(matr2[,paste0("ID_Station_",n)],matr2$Date,sep="-")
# NOTE(review): the accepted fix adds a comma here (matr2[, c(...)]); for a
# data.frame both forms select the same columns.
matr2<-matr2[c("Cod_Merge","ID_School","Date",paste0("ID_Station_",n),paste0("Dist_",n))]
matr2<-merge(matr2,Weather_Data[,c("Cod_Merge",paste0(i,"_T"))],by="Cod_Merge",all.x=T)
matr2$Cod_Merge<-paste(matr2$ID_School,matr2$Date,sep="-")
names(matr2)[6]<-paste0(i,"_T_",n)
matr<-merge(matr,matr2[,c("Cod_Merge",
paste0("ID_Station_",n),
paste0("Dist_",n),
paste0(i,"_T_",n))],
by="Cod_Merge",all.x=T)
Sys.sleep(0.1)
print(i)
}
# One wide panel per weather variable: Mat_Dist_Temp_Max, etc.
assign(paste0("Mat_Dist_",i),matr)
}
Any help would be greatly appreciated.
Solution
For anyone who is interested, I was missing a couple of commas inside the 2nd loop:
# Reproducible example: build a school-by-day panel and merge on the 1st..3rd
# closest weather station for each of three weather variables
# (Temp_Max, Temp_Min, Prec). Produces Mat_Dist_<var> wide panels.
library(data.table)

set.seed(1) # make the simulated example reproducible

# One wide distance table per variable: each row = one school plus the IDs
# of, and distances to, its 1st/2nd/3rd closest stations.
make_dist_table <- function(n_schools = 5) {
  data.frame(ID_School = 1:n_schools,
             ID_Station_1 = floor(runif(n_schools, min = 0, max = 10)),
             Dist_1 = floor(runif(n_schools, min = 0, max = 10)),
             ID_Station_2 = floor(runif(n_schools, min = 0, max = 10)),
             Dist_2 = floor(runif(n_schools, min = 11, max = 100)),
             ID_Station_3 = floor(runif(n_schools, min = 0, max = 10)),
             Dist_3 = floor(runif(n_schools, min = 101, max = 200)))
}
Dist_Temp_Max <- make_dist_table()
Dist_Temp_Min <- make_dist_table()
Dist_Prec <- make_dist_table()

# Daily date vector for the sample period.
years <- seq.Date(as.Date("2014-01-01"), as.Date("2015-02-28"), by = "1 day")

# Station-by-day weather readings, keyed by "stationID-date".
# (Stations 1..5 only, while the Dist_* tables draw IDs from 0..9, so the
# all.x merges below can legitimately produce NAs -- as in the original.)
Weather_Data <- expand.grid(ID_Station = 1:5, Date = years)
Weather_Data$Temp_Max_T <- runif(nrow(Weather_Data), min = 10, max = 40)
# FIX: spell out Temp_Max_T instead of relying on $ partial matching.
Weather_Data$Temp_Min_T <- Weather_Data$Temp_Max_T - 10
Weather_Data$Prec_T <- floor(runif(nrow(Weather_Data), min = 0, max = 10))
Weather_Data$Cod_Merge <- paste(Weather_Data$ID_Station, Weather_Data$Date, sep = "-")

#Add Values per Station
var_list <- c("Temp_Max", "Temp_Min", "Prec")
for (i in var_list) {
  # Distance table for this variable, schools with a 1st station only.
  dist <- get(paste0("Dist_", i))
  dist <- dist[!is.na(dist$ID_Station_1), ]

  # Closest station (rank 1): expand to school x day, key on "station-date",
  # attach the readings, then re-key on "school-date" for later merges.
  matr <- dist[c("ID_School", "ID_Station_1", "Dist_1")]
  matr <- setDT(matr)[, list(Date = years, ID_Station_1 = ID_Station_1, Dist_1 = Dist_1), ID_School]
  matr$Cod_Merge <- paste(matr$ID_Station_1, matr$Date, sep = "-")
  matr <- as.data.frame(matr[, c("Cod_Merge", "ID_School", "Date", "ID_Station_1", "Dist_1")])
  matr <- merge(matr, Weather_Data[c("Cod_Merge", paste0(i, "_T"))], by = "Cod_Merge", all.x = TRUE)
  matr$Cod_Merge <- paste(matr$ID_School, matr$Date, sep = "-")
  # FIX: rename by name, not by position 6, so column order can't break it.
  names(matr)[names(matr) == paste0(i, "_T")] <- paste0(i, "_T_1")
  print(i)

  # Station ranks 2..3: same expand-and-merge, joined back onto matr.
  for (n in 2:3) {
    id_col <- paste0("ID_Station_", n)
    dist_col <- paste0("Dist_", n)
    # FIX: the original re-subset `dist` here, which silently restored ALL
    # of its columns (every rank's ID/distance) and carried them through the
    # expand.grid/merge below -- the unnecessary allocation causing the
    # out-of-memory error. Filter and select in one step instead.
    matr2 <- dist[!is.na(dist[[id_col]]), c("ID_School", id_col, dist_col)]
    matr3 <- expand.grid(ID_School = matr2$ID_School, Date = years)
    matr3 <- matr3[order(matr3$ID_School, matr3$Date), ]
    matr2 <- merge(matr3, matr2, by = "ID_School")
    rm(matr3)
    matr2$Cod_Merge <- paste(matr2[, id_col], matr2$Date, sep = "-")
    matr2 <- matr2[, c("Cod_Merge", "ID_School", "Date", id_col, dist_col)]
    matr2 <- merge(matr2, Weather_Data[, c("Cod_Merge", paste0(i, "_T"))], by = "Cod_Merge", all.x = TRUE)
    matr2$Cod_Merge <- paste(matr2$ID_School, matr2$Date, sep = "-")
    names(matr2)[names(matr2) == paste0(i, "_T")] <- paste0(i, "_T_", n)
    matr <- merge(matr, matr2[, c("Cod_Merge", id_col, dist_col, paste0(i, "_T_", n))],
                  by = "Cod_Merge", all.x = TRUE)
    print(i)
  }
  # One wide panel per weather variable: Mat_Dist_Temp_Max, etc.
  assign(paste0("Mat_Dist_", i), matr)
}
It seems that all your code would need to do is find the 10 closest stations to each school, and then you simply subset the station data to the school (don't know anything about your dates).
Your final data frame should be way better and easier to use -- probably instead of 3 separate wide data frames it should look like this:
set.seed(1) # FAKE DATA
final <- data.frame(ID_School = rep(LETTERS[1],10), ID_Station = sample(1:100,10),
Closeness_Rank = 1:10, Distance = 10*(1:10) + sample(-5:5,10),
Temp.Max = sample(70:100,10), Temp.Min = sample(30:69,10),
Precipitation = sample(20:30,10)/100)
final
# ID_School ID_Station Closeness_Rank Distance Temp.Max Temp.Min Precipitation
#1 A 27 1 7 98 49 0.29
#2 A 37 2 16 76 53 0.26
#3 A 57 3 31 88 48 0.27
#4 A 89 4 38 73 36 0.24
#5 A 20 5 50 77 59 0.23
#6 A 86 6 65 80 68 0.28
#7 A 97 7 72 70 57 0.20
#8 A 62 8 79 79 33 0.21
#9 A 58 9 94 90 64 0.22
#10 A 6 10 103 96 42 0.30
Without knowing how you measure your distances for the station and school data or other information, I can't help you get to this format but if you provide more information I would be happy to help.
EDIT:
This method seems to be very slow as I'm not really using data.tables correctly but hopefully it should give you some ideas. I've generated the fake data in a way that might be useful for you for explaining your question in the future. My method is to only build the FINAL output, a day-school data.table of weather data derived from averaging the closest 10 stations that have data weighted by inverse distance.
The process is super slow, at ~7,800 schools' weather values calculated in 5 minutes for a single day... so about six and a half days to complete for 5 years -- but no memory issues! This is the kind of code you would post when asking if someone can improve the speed.
# Starting from the beginning
set.seed(100)
library(data.table)
n_station <- 800
n_school <- 7800
# Random station / school coordinates on a 20000 x 20000 grid.
station_info <- data.frame(ID_Station = 1:n_station,
                           xcoord = sample(-10000:10000, n_station),
                           ycoord = sample(-10000:10000, n_station))
school_info <- data.frame(ID_School = 1:n_school,
                          xcoord = sample(-10000:10000, n_school),
                          ycoord = sample(-10000:10000, n_school))
# save list of ~20 closest stations by school,
# and always use 10 of the closest where measurements are available
x <- 20
L <- vector('list', nrow(school_info)) # always initialize for speed
for (i in seq_len(nrow(school_info))) {
  # Euclidean distance from school i to every station.
  distances <- sqrt((school_info[i, "xcoord"] - station_info[, "xcoord"])^2 +
                    (school_info[i, "ycoord"] - station_info[, "ycoord"])^2)
  # BUG FIX: `which(order(distances) <= x)` does NOT pick the x nearest
  # stations -- order() returns a permutation (index of the k-th smallest),
  # not a rank, so the selected rows did not match the sorted distances.
  # order(distances)[1:x] gives the row indices of the x nearest stations,
  # and distances[nearest] keeps the IDs aligned with their distances.
  nearest <- order(distances)[1:x]
  L[[i]] <- cbind.data.frame(ID_School = rep(school_info[i, "ID_School"], x),
                             ID_Station = station_info[nearest, "ID_Station"],
                             Distance_Rank = 1:x,
                             Distance = distances[nearest])
}
L[[1]]
# (example output -- exact IDs/distances differ after the order() fix)
# ID_School ID_Station Distance_Rank Distance
# 1: 1 2 1 127.2242
# 2: 1 32 2 365.7896
# 3: 1 92 3 573.0428
# 4: 1 141 4 763.5837
# 5: 1 151 5 1003.4127
For 5 years of daily Fake Weather Data:
days <- seq.Date(as.Date("2010-01-01"),as.Date("2015-12-31"),by="1 day")
d <- length(days)
S <- vector('list', nrow(station_info))
for(i in 1:nrow(station_info)){
S[[i]] <- data.frame(ID_Station = rep(station_info[i,"ID_Station"],d),
Temp.Max = sample(70:100,d,T),
Temp.Min = sample(30:69,d,T),
Precipitation = sample(20:30,d,T)/100,
date = days)
# maybe remove some dates at random
if(sample(c(T,F),1)) S[[i]] <- S[[i]][-sample(1:d,1),]
}
station_data <- as.data.table(do.call(rbind,S))
station_data
# ID_Station Temp.Max Temp.Min Precipitation date
# 1: 1 88 55 0.23 2010-01-01
# 2: 1 73 57 0.24 2010-01-02
# 3: 1 93 33 0.29 2010-01-03
# 4: 1 81 52 0.27 2010-01-04
# 5: 1 82 48 0.24 2010-01-05
# ---
#291610: 800 86 31 0.28 2010-12-27
#291611: 800 98 57 0.22 2010-12-28
#291612: 800 71 50 0.26 2010-12-29
#291613: 800 83 35 0.26 2010-12-30
#291614: 800 71 34 0.23 2010-12-31
The algorithm:
# For every day and school, average the 10 nearest reporting stations'
# readings, weighted by inverse distance.

# FIX: `school_data` was never defined in the original post (the code could
# not run as written); it is the long school-station lookup stacked from the
# per-school nearest-station frames in L.
school_data <- rbindlist(L)

size <- length(days) * n_school
#OUT <- data.table(ID_School = integer(size),
# date = as.Date(x = integer(size), origin = "1970-01-01"),
# wtd_Temp.Max= numeric(size),
# wtd_Temp.Min= numeric(size),
# wtd_Precip= numeric(size))
OUT <- vector('list', size) # faster than growing a data.table row by row
unique_school <- unique(school_data$ID_School) # will be length n_school
#length(L) is the same as length(unique(school)) = n_school
count <- 0
for (i in seq_along(days)) {
  t1 <- Sys.time()
  # Restrict to this day's station readings before joining.
  temp_weather_data <- station_data[date == days[i], ]
  m <- merge(school_data, temp_weather_data, "ID_Station")
  setkey(m, ID_School) # the key is ID_School
  for (j in seq_along(unique_school)) {
    count <- count + 1
    # assuming within the closest 20 stations, at least 10 have data every day
    # FIX: key on the school's ID, not the loop index j -- they only
    # coincided because IDs happened to be 1:n_school.
    r <- m[.(unique_school[j]), ][1:10]
    invd <- 1 / r$Distance # inverse-distance weights
    sum.invd <- sum(invd)
    OUT[[count]] <- data.table(ID_School = unique_school[j],
                               date = days[i],
                               wtd_Temp.Max = sum(invd * r$Temp.Max) / sum.invd,
                               wtd_Temp.Min = sum(invd * r$Temp.Min) / sum.invd,
                               wtd_Precip = sum(invd * r$Precipitation) / sum.invd)
    if (j %% 100 == 0) cat(as.character(days[i]), ".....", unique_school[j], "...\n")
  }
  # FIX: difftime() with explicit units -- Sys.time()-t1 picks units
  # arbitrarily (secs vs mins), making the progress log misleading.
  cat(difftime(Sys.time(), t1, units = "secs"), "secs\n")
}
Which gives the final output:
# rbindlist() is the idiomatic data.table way to stack a long list of
# (one-row) data.tables; it is much faster than do.call(rbind, OUT) and
# returns the same table.
rbindlist(OUT)
# ID_School date wtd_Temp.Max wtd_Temp.Min wtd_Precip
# 1: 1 2010-01-01 88.64974 44.07872 0.2757571
# 2: 2 2010-01-01 83.34549 46.80225 0.2511073
# 3: 3 2010-01-01 85.32834 48.62004 0.2347837
# 4: 4 2010-01-01 82.95667 48.01814 0.2576482
# 5: 5 2010-01-01 87.88982 44.45357 0.2527794
# ---
So I am having a little trouble with listing inputs within functions, particularly ezANOVA(). Here is what I have for code so far:
ui.R:
library(shiny)

# Sidebar: CSV upload plus parsing options. The server renders the
# variable-role pickers into the 'var' placeholder once a file is loaded.
sidebar <- sidebarPanel(
  fileInput(
    "file1", "CSV File",
    accept = c("text/csv", "text/comma-separated-values,text/plain", ".csv")
  ),
  checkboxInput("header", "Header", TRUE),
  radioButtons("sep", "Separator", c(Comma = ",", Semicolon = ";", Tab = "\t"), ","),
  uiOutput("var")
)

# Main panel: the ANOVA summary table produced by the server.
main <- mainPanel(tableOutput("aovSummary"))

shinyUI(pageWithSidebar(headerPanel("Analysis of Variance"), sidebar, main))
server.R:
library(shiny)
library(ez)
# Server: read an uploaded CSV, let the user assign ANOVA roles to its
# columns, and run ezANOVA when the submit button has been pressed.
shinyServer(function(input, output) {
# Reactive: the uploaded CSV as a data.frame (NULL until a file is chosen).
csvfile <- reactive({
csvfile <- input$file1
if (is.null(csvfile)){return(NULL)}
dt <- read.csv(csvfile$datapath, header=input$header, sep=input$sep)
dt
})
# Dynamic UI: one radio group per ANOVA role, populated with the CSV's
# column names, plus a submit button.
output$var <- renderUI({
if(is.null(input$file1$datapath)){return()}
else{
return(list(radioButtons("estimate", "Please Pick The Dependent Variable", choices = names(csvfile())),
radioButtons("between1", "Please Pick The Between Subjects Factor", choices = names(csvfile())),
radioButtons("within1", "Please Pick The Within Subjects Factor", choices = names(csvfile())),
radioButtons("sid", "Please Pick The Subject Id Variable", choices = names(csvfile())),
actionButton("submit", "Submit")))
}
})
output$aovSummary = renderTable({
if(is.null(input$file1$datapath)){return()}
if(input$submit > 0){
# BUG (the subject of this question): .() quotes its argument unevaluated,
# so ezANOVA receives the literal expression `input$estimate` -- effectively
# the string "input$estimate" -- rather than the column name the user chose,
# producing the "not a variable in the data frame" error. The call must be
# constructed from the *values* of the inputs (see the answer below).
aov.out <- ezANOVA(data = csvfile(), dv = .(input$estimate), wid = .(input$sid), between = .(input$between1),
within = .(input$within1), detailed = TRUE, type = "III")
return(aov.out)
}
})
})
Here is the data I have been testing it with:
Animal Visit Dose Estimate
2556 0 3 1.813206946
2557 0 3 1.933397744
2558 0 3 1.689893603
2559 0 3 1.780301984
2560 0 3 1.654374476
2566 0 10 3.401283412
2567 0 10 3.015958525
2568 0 10 2.808705611
2569 0 10 3.185718418
2570 0 10 2.767128836
2576 0 30 3.941412617
2577 0 30 3.793328436
2578 0 30 4.240736154
2579 0 30 3.859611218
2580 0 30 4.049743097
2586 0 100 5.600261483
2587 0 100 5.588115651
2588 0 100 5.089081008
2589 0 100 5.108262681
2590 0 100 5.343876403
2556 27 3 1.453587471
2557 27 3 1.994413484
2558 27 3 1.638132168
2559 27 3 2.138289747
2560 27 3 1.799769874
2566 27 10 3.302851871
2567 27 10 3.014199997
2568 27 10 3.190990162
2569 27 10 3.577924375
2570 27 10 3.537461068
2576 27 30 4.470837132
2577 27 30 4.081833308
2578 27 30 4.497192825
2579 27 30 4.205494309
2580 27 30 4.234496088
2586 27 100 6.054284369
2587 27 100 5.436697078
2588 27 100 5.398721492
2589 27 100 4.990794986
2590 27 100 5.573305744
2551 0 3 1.838550166
2552 0 3 1.847992942
2553 0 3 1.349892703
2554 0 3 1.725937126
2555 0 3 1.534652719
2561 0 10 2.931535704
2562 0 10 2.947599556
2563 0 10 3.092658629
2564 0 10 2.837625632
2565 0 10 2.970227467
2571 0 30 4.00746885
2572 0 30 3.921844968
2573 0 30 3.575724773
2574 0 30 4.17137839
2575 0 30 4.25251528
2581 0 100 4.785295667
2582 0 100 5.610955803
2583 0 100 5.497109771
2584 0 100 5.262724458
2585 0 100 5.430003698
2551 27 3 1.9326519
2552 27 3 2.313193186
2553 27 3 1.815261865
2554 27 3 1.345218914
2555 27 3 1.339432001
2561 27 10 3.305894401
2562 27 10 3.192621055
2563 27 10 3.76947789
2564 27 10 3.127887366
2565 27 10 3.231750087
2571 27 30 4.306556353
2572 27 30 4.232038905
2573 27 30 4.042378186
2574 27 30 4.784843929
2575 27 30 4.723665015
2581 27 100 5.601181262
2582 27 100 5.828647795
2583 27 100 5.652171222
2584 27 100 5.326512658
2585 27 100 6.009774247
The error I receive in the browser is:
"input$estimate" is not a variable in the data frame provided.
So, the function ezANOVA() is not using the actual variable name but rather the string "input$estimate", that is not what I want it to do.
How would I go about fixing this problem or is it helpless?
Thanks in advance for all your help!
You need to dynamically construct the call to ezANOVA(), i.e. use the value of the strings in your input variables to define the function call. Due to its LISP heritage, this is relatively easy in R via eval. (Relatively easy because strings are still painful in R and you need to manipulate strings to make this work). Here's a minimal working version of your code.
server.R
library(shiny)
library(ez)
# Server: read an uploaded CSV, render the variable-role pickers, and build
# the ezANOVA call dynamically from the chosen column names.
shinyServer(function(input, output) {
# Reactive: the uploaded CSV as a data.frame (NULL until a file is chosen).
csvfile <- reactive({
csvfile <- input$file1
if (is.null(csvfile)){return(NULL)}
dt <- read.csv(csvfile$datapath, header=input$header, sep=input$sep)
dt
})
# Dynamic UI: one radio group per ANOVA role, populated with the CSV's
# column names, plus a submit button.
output$var <- renderUI({
if(!is.null(input$file1$datapath)){
d <- csvfile()
anova.opts <- list(
radioButtons("estimate", "Please Pick The Dependent Variable", choices = names(d)),
radioButtons("between1", "Please Pick The Between Subjects Factor", choices = names(d)),
radioButtons("within1", "Please Pick The Within Subjects Factor", choices = names(d)),
radioButtons("sid", "Please Pick The Subject Id Variable", choices = names(d)),
actionButton("submit", "Submit")
)
anova.opts
}
})
output$aovSummary = renderTable({
# NOTE(review): input$submit is NULL only until the button is rendered; after
# that this table re-runs whenever any radio button changes, not just on
# submit -- the answer's closing remarks flag this as future work.
if(!is.null(input$submit)){
# Build the ezANOVA call as text so the *values* of the inputs become bare
# column names inside .(...). NOTE(review): eval(parse(text=...)) is fragile
# (breaks on non-syntactic column names) and evaluates arbitrary text; here
# the inputs are constrained to the CSV's column names by the radio buttons.
# The paste() below spans a multi-line string literal -- keep it intact.
aov.out <- eval(parse(text=paste(
"ezANOVA(data = csvfile()
, dv = .(", input$estimate, ")
, wid = .(", input$sid, ")
, between = .(", input$between1, ")
, within = .(", input$within1, ")
, detailed = TRUE, type = \"III\")")))
# Return only the ANOVA table; ezANOVA's list may also carry assumption
# tests (e.g. Mauchly) that renderTable cannot display directly.
aov.out$ANOVA
}
})
})
ui.R
library(shiny)

# Sidebar: CSV upload and parsing options (including a white-space separator,
# the default). The server fills the 'var' placeholder after a file loads.
input_panel <- sidebarPanel(
  fileInput(
    "file1", "CSV File",
    accept = c("text/csv", "text/comma-separated-values,text/plain", ".csv")
  ),
  checkboxInput("header", "Header", TRUE),
  radioButtons(
    "sep", "Separator",
    c(Comma = ",", Semicolon = ";", Tab = "\t", `White Space` = ""),
    ""
  ),
  uiOutput("var")
)

shinyUI(pageWithSidebar(
  headerPanel("Analysis of Variance"),
  input_panel,
  mainPanel(tableOutput("aovSummary"))
))
I've changed/fixed a number of smaller issues, but the two most significant changes not related to eval() were:
Including an option for letting R do its usual thing with white space as a field separator.
Changed the render function to include the actual ANOVA table. ezANOVA returns a list, the first entry of which is always ANOVA and contains the ANOVA table. However, there are sometimes further entries for assumption tests and post-hoc corrections, e.g. Mauchly's Test for Sphericity and Huynh-Feldt correction. You really need to add logic to deal with these when they're present.
Code style is also an issue -- it's better to get rid of empty if blocks followed by a full else, and instead just test for the condition where you actually have code to run. Letting R "fall off" the end of the function simulates a nonexistent return value.
I'm assuming UI improvements were waiting for a working example, but you need to consider:
meaningful defaults, perhaps on variable type, for the different arguments and/or not reacting to the radio buttons, instead only reacting to an action button. Otherwise you get confusing errors from ezANOVA while you're setting the values.
what happens if you have pure between or pure within designs?
You might also want to take a look at conditionalPanel() for hiding further options until an initial option (data file) is set in a meaningful way.