I would appreciate any help to specify my brms model below in order to be able to pass multiple columns of weights to the model as illustrated in the stan code below.
I need to do this in brms or stanarm rather than stan directly because I want to use functions of https://github.com/mjskay/tidybayes that are currently not supported by a stanfit object.
#sample data:
dt = read.table(header = TRUE, text = "
n r r/n group treat c2 c1 weights
62 3 0.048387097 1 0 0.1438 1.941115288 1.941115288
96 1 0.010416667 1 0 0.237 1.186583128 1.186583128
17 0 0 0 0 0.2774 1.159882668 3.159882668
41 2 0.048780488 1 0 0.2774 1.159882668 3.159882668
212 170 0.801886792 0 0 0.2093 1.133397521 1.133397521
143 21 0.146853147 1 1 0.1206 1.128993008 1.128993008
143 0 0 1 1 0.1707 1.128993008 2.128993008
143 33 0.230769231 0 1 0.0699 1.128993008 1.128993008
73 62 1.260273973 0 1 0.1351 1.121927228 1.121927228
73 17 0.232876712 0 1 0.1206 1.121927228 1.121927228")
N <- nrow(dt)
n <- dt$n
r <- dt$r
p <- dt$r/n
group <- dt$group
treat <- dt$treat
c1 <- dt$c1
c2 <- dt$c2
w_1 <- dt$weights
w_2 <- dt$weights - 0.01
w_3 <- dt$weights + 0.01/2
w_4 <- dt$weights - 0.01/3
w_5 <- dt$weights + 0.01/4
w_6 <- dt$weights - 0.01/5
w_7 <- dt$weights + 0.01/6
w_8 <- dt$weights + 0.01/7
w_9 <- dt$weights + 0.01/8
w_10 <- dt$weights + 0.01/9
list_bind <- list (N = N,
n = n, r = r, p = p, group = group, treat = treat, c1 = c1, c2 = c2,
weights = cbind(w_1, w_2, w_3, w_4, w_5, w_6, w_7, w_8, w_9, w_10)
)
dt_bind <- as.data.frame(list_bind)
#my attempt:
m <-brm(r | trials(n) + weights(weights.w_1:weights.w_10) ~ treat*c2+(1|group),
data=dt_bind, family=binomial(link=logit))
#stan code:
//this is what I want the brms model specification to be able to do
data {
...
real<lower=0> weights[N, 10]; // data block of model weights
}
model {
...
// likelihood
for (n in 1:N)
for (w in 1:10) {
target += weights[n, w] * binomial_logit_lpmf(Y[n] | trials[n], mu[n]);
}
}
Thanks in advance for any help.
Related
#Generating 99 pairs of datasets
for (i in 1:99) {
set.seed(i)
X1 <- rnorm(100, mean=3, sd=sqrt(10))
X2 <- rnorm(100, mean=-2, sd=sqrt(3))
X1sq <- X1^2
X2sq <- X2^2
X1X2 <- X1*X2
a <- exp(0.2*X1sq + 0.3*X2sq + 0.7*X1X2)
b <- 1 + exp(0.2*X1sq + 0.3*X2sq + 0.7*X1X2)
px3 <- a/b
Y <- rbinom(100, 1, px3)
newtest <- data.frame(X1, X2, Y)
}
Hi guys, essentially I would like to generate 99 pairs of data sets, via the loop function. And each new test should be a data frame consisting of new sets of values of X1, X2 and Y respectively.I am not sure if my loop written above is correct so I would like to seek help if possible!
You can add data frames as list elements to an empty list like this:
n = 99
dfs <- list()
for (i in 1:n) {
set.seed(i)
X1 <- rnorm(100, mean=3, sd=sqrt(10))
X2 <- rnorm(100, mean=-2, sd=sqrt(3))
X1sq <- X1^2
X2sq <- X2^2
X1X2 <- X1*X2
a <- exp(0.2*X1sq + 0.3*X2sq + 0.7*X1X2)
b <- 1 + exp(0.2*X1sq + 0.3*X2sq + 0.7*X1X2)
px3 <- a/b
Y <- rbinom(100, 1, px3)
newtest <- data.frame(X1, X2, Y)
dfs[[i]] <- newtest
}
Output:
> length(dfs)
[1] 99
> dfs[[1]]
X1 X2 Y
1 1.01897911 -3.07450660 1
2 3.58073118 -1.92705317 0
3 0.35751031 -3.57776258 0
4 8.04472084 -1.72628614 1
5 4.04199507 -3.13377386 1
6 0.40545116 1.06103134 0
7 4.54138600 -0.75862624 1
8 5.33478772 -0.42353199 1
9 4.82078051 -1.33457144 1
10 2.03427713 0.91361444 0
11 7.78067182 -3.10112784 1
12 4.23279256 -2.79959213 0
13 1.03546479 0.48078561 0
14 -4.00349598 -3.12703914 1
15 6.55734391 -2.35919398 1
16 2.85790745 -2.68036329 0
17 2.94880189 -2.55424391 0
18 5.98467216 -2.48343842 0
19 5.59692944 -1.14404070 1
20 4.87808088 -2.30714541 1
21 5.90606161 -2.87634403 0
22 5.47333215 0.32621148 1
23 3.23579518 -2.37166244 0
24 -3.29088243 -2.31100103 1
25 4.96006112 -2.17353545 0
26 2.82250534 -0.76562575 1
27 2.50733135 -2.12741729 0
28 -1.65092741 -2.06518430 1
29 1.48795676 -3.18067058 0
30 4.32164726 -2.56165259 0
31 7.29652199 -1.89579906 0
32 2.67495667 -3.01999517 0
33 4.22592528 -1.07942159 1
34 2.82985352 -4.62993570 0
35 -1.35464467 -1.46902621 1
36 1.68767196 -4.66120916 1
37 1.75314569 -2.52130594 1
38 2.81243457 -2.91500764 0
39 6.47858566 -3.12946129 0
40 5.41337362 -2.09854811 1
41 2.47973071 -5.31576779 1
42 2.19880002 0.03790208 1
43 5.20399171 -4.88381685 0
44 4.76032360 -2.80285821 0
45 0.82196325 -3.93283032 1
46 0.76270387 -3.30045666 1
47 4.15290939 1.61507850 1
48 5.43031450 -1.96986990 1
49 2.64473008 -4.22793787 0
50 5.78630728 -4.84161214 0
51 4.25892133 -1.22025307 0
52 1.06460261 -2.03214657 0
53 4.07871518 -2.55091058 0
54 -0.57135969 -3.60970246 1
55 7.53161884 -4.57635683 0
56 9.26257436 -3.86228769 0
57 1.83874373 -0.26789930 1
58 -0.30184360 -3.07606548 0
59 4.80161165 -4.39789764 0
60 2.57291984 1.23770633 1
61 10.59458219 -1.26370455 1
62 2.87591222 -2.41334890 0
63 5.18114738 -0.16665358 1
64 3.08855060 -0.46467093 1
65 0.64956374 -3.07256042 1
66 3.59701367 1.82108156 1
67 -2.70778035 -2.44171977 1
68 7.63449140 -4.46729711 0
69 3.48462961 -2.25010745 0
70 9.87040135 -1.64053305 1
71 4.50369316 1.99753585 1
72 0.75495226 -1.81674492 0
73 4.93128630 -1.20845485 1
74 0.04612393 -2.13363280 1
75 -0.96433690 -2.57850643 1
76 3.92163392 -2.06014725 0
77 1.59818801 -0.63576818 1
78 3.00349543 1.59442979 1
79 3.23508791 -0.22050410 1
80 1.13577108 0.09215872 0
81 1.20171157 -4.13271473 1
82 2.57252769 -0.29584288 1
83 6.72543819 -1.61907907 1
84 -1.81794126 -4.54135160 1
85 4.87822276 -1.09756214 1
86 4.05288152 -2.27497104 0
87 6.36181687 0.53673964 1
88 2.03808597 -3.32689295 0
89 4.17010222 -2.74514862 0
90 3.84464054 -3.60406870 0
91 1.28440103 -2.30675306 0
92 6.81961338 -1.30369517 1
93 6.66951527 -3.26742501 0
94 5.21426998 -0.56175148 1
95 8.01800798 -4.09246077 0
96 4.76608915 -3.81516225 0
97 -1.03693902 0.49615837 0
98 1.18717559 -3.75949942 1
99 -0.87256511 -1.28643887 0
100 1.50297574 -2.66004308 0
Here is a solution with replicate. From the documentation, my emphasis:
replicate is a wrapper for the common use of sapply for repeated evaluation of an expression (which will usually involve random number generation).
set.seed(1)
n <- 99L
dfs <- replicate(n, {
X1 <- rnorm(100, mean=3, sd=sqrt(10))
X2 <- rnorm(100, mean=-2, sd=sqrt(3))
X1sq <- X1^2
X2sq <- X2^2
X1X2 <- X1*X2
a <- exp(0.2*X1sq + 0.3*X2sq + 0.7*X1X2)
b <- 1 + exp(0.2*X1sq + 0.3*X2sq + 0.7*X1X2)
px3 <- a/b
Y <- rbinom(100, 1, px3)
data.frame(X1, X2, Y)
}, simplify = FALSE)
Created on 2022-05-21 by the reprex package (v2.0.1)
I have a data set for different locations, where I measured a response variable at different days. I need to fit a linear model and calculate residuals for each location levels. This is a mock data set for my situation.
#dataframe
loc <- c("Loc1", "Loc2", "Loc3", "Loc4")
day <- as.numeric(c(1, 14, 20, 31, 37, 59))
empty <- expand.grid(loc, day)
empty <- empty %>% arrange(Var1,Var2)
response <- as.numeric(c(4398,NA, 6000.00,9234,11680,12395
,2000,4273,8000,NA,NA,12762
,2300,4000.00,5161,8682,12000.00,13388
,NA,6225,6547,9441,7999,8688))
resp.data <- cbind(empty, response)
names(resp.data) <- c("loc", "day", "response")
Here is what I did.
# run loop to calculate residuals from a linear fit
residuals <- as.data.frame(matrix(nrow = 6, ncol = 4))
for (i in seq_along(unique(resp.data$loc))) {
data_loc <- resp.data %>% filter(loc == unique(resp.data$loc)[i])
model_loc <- lm(data = data_loc,
response ~ day)
temp <- c(resid(model_loc))
if (length(temp)<6){
temp <- c(rep('na',6-length(temp)), temp)
}
residuals[i] <- temp
}
The problem I have is that the observed data have some NAs, and therefore I won't be able to calculate a residual for that particular observation. I came with a solution but if not working because the NA's of the residuals don't match the NA's of the observed data. This is my outcome.
# getting the final dataset with the residuals
residuals <- residuals %>% rename_at(vars(names(residuals)), ~ unique(resp.data$loc)) %>%
gather(key = "loc", value = "res")
resp.data$res <- residuals$res
loc day response res
1 Loc1 1 4398 na
2 Loc1 14 NA 35.7766491917869
3 Loc1 20 6000 -1271.46657929227
4 Loc1 31 9234 278.234709480122
5 Loc1 37 11680 1805.52632153779
6 Loc1 59 12395 -848.071100917431
7 Loc2 1 2000 na
8 Loc2 14 4273 na
9 Loc2 20 8000 -672.182985553773
10 Loc2 31 NA -760.310593900481
11 Loc2 37 NA 1876.93820224719
12 Loc2 59 12762 -444.444622792938
13 Loc3 1 2300 274.745821042281
14 Loc3 14 4000 -806.877089478858
15 Loc3 20 5161 -929.703048180924
16 Loc3 31 8682 237.616027531956
17 Loc3 37 12000 2271.79006882989
18 Loc3 59 13388 -1047.57177974435
19 Loc4 1 NA na
20 Loc4 14 6225 -561.709846254499
21 Loc4 20 6547 -567.168138698069
22 Loc4 31 9441 1726.49165848872
23 Loc4 37 7999 -42.9666339548574
24 Loc4 59 8688 -554.647039581289
Could someone give my some advice here?
Thanks a lot in advance.
1) For each subset perform the regression using na.action = na.exclude, compute their residuals, append them to the subset and put everything back together.
library(dplyr)
resp.data %>%
group_by(loc) %>%
do(mutate(., resid = resid(lm(response ~ day, ., na.action = na.exclude)))) %>%
ungroup
2) or without dplyr:
do.call("rbind", by(resp.data, resp.data$loc, function(x) {
cbind(x, resid = resid(lm(response ~ day, x, na.action = na.exclude)))
}))
3) Another approach is to compoute the residuals and then append them. It works here but it could be a bit more fragile since it assumes that the residual vector computed is in the same order as the input data frame.
reg.list <- by(resp.data, resp.data$loc, lm, formula = response ~ day,
na.action = na.exclude)
transform(resp.data, resid = c(sapply(reg.list, resid)))
I have a problem. Initially I solved it for scenario 1. Scenario 1 is I have a dataframe df. I need to plot mean of the variables that are numeric
df
A B C D E F
1 asd 29 sf 36 sf 44
2 fsd 24 gfd 56 gfd 34
3 gs 46 asd 39 asd 37
4 asd 50 gfg 26 gfg 23
5 sf 43 fg 56 fg 37
6 dfg 29 er 35 er 51
7 sdfg 32 tr 27 tr 28
8 fgdsgd 24 qw 31 qw 36
I have a code to plot the mean of variables that are numeric. The cod e is as shown below
p2 <- list()
cs <- names(Filter(is.numeric, df))
for(i in cs)
{
p2[i] <- mean(df[,i])
do.call(rbind,p2) %>% as.data.frame()
}
p2 <- as.data.frame(p2)
p2 <- unlist(p2)
p2 <- stack(p2)
ggplot(data=p2,aes(x=ind,y=values))+geom_bar(stat =
"identity")+ylab("Mean")
But I need to have another loop. The above scenario is only mean. Now I also need median , Sd and many more. So i have called these function as a vector
gh <- list()
mea <- c("mean","median","sd")
p2 <- list()
cs <- names(Filter(is.numeric, df))
for(i in cs)
{
for(j in mea)
{
p2[i] <- gh[[j]](df[i])
do.call(rbind,p2) %>% as.data.frame()
}
}
p2 <- as.data.frame(p2)
p2 <- unlist(p2)
p2 <- stack(p2)
ggplot(data=p2,aes(x=ind,y=values))+geom_bar(stat =
"identity")+ylab("Mean")
Could you help me in building Scenario 2 that is the above one
So ideally it should plot mean, median and sd for all the variables that are numeric
Here is how I would do it without loops using mtcars data set
apply(mtcars,2,function(x){
if (is.numeric(x)){
return(c("mean"=mean(x),"median"=median(x),"sd"=sd(x)))
}
})
I have some data for multiple location and year
big.data <- data.frame(loc.id = rep(1:3, each = 10*3),
year = rep(rep(1981:1983, each = 10),times = 3),
day = rep(1:10, times = 3*3),
CN = rep(c(50,55,58), each = 10*3),
top.FC = rep(c(72,76,80),each = 10*3),
DC = rep(c(0.02,0.5,0.8), each = 10*3),
WAT0 = rep(c(20,22,26), each = 10*3),
Precp = sample(1:100,90, replace = T),
ETo = sample(1:10,90, replace = T))
I have a function: water.model which uses a second function internally called water.update
water.model <- function(dat){
top.FC <- unique(dat$top.FC)
dat$WAT <- -9.9
dat$RO <- -9.9
dat$DR <- -9.9
dat$WAT[1] <- top.FC/2 # WAT.i is a constant
dat$RO[1] <- NA
dat$DR[1] <- NA
for(d in 1:(nrow(dat)-1)){
dat[d + 1,10:12] <- water.update(WAT0 = dat$WAT[d],
RAIN.i = dat$Precp[d + 1],
ETo.i = dat$ETo[d + 1],
CN = unique(dat$CN),
DC = unique(dat$DC),
top.FC = unique(dat$top.FC))
}
return(dat)
}
water.update <- function(WAT0, RAIN.i, ETo.i, CN, DC, top.FC){
S = 25400/CN - 254; IA = 0.2*S
if (RAIN.i > IA) { RO = (RAIN.i - 0.2 * S)^2/(RAIN.i + 0.8 * S)
} else {
RO = 0
}
if (WAT0 + RAIN.i - RO > top.FC) {
DR = DC * (WAT0 + RAIN.i - RO - top.FC)
} else {
DR = 0
}
dWAT = RAIN.i - RO - DR - ETo.i
WAT1 = WAT0 + dWAT
WAT1 <- ifelse(WAT1 < 0, 0, WAT1)
return(list(WAT1,RO,DR))
}
If I run the above function for a single location X year
big.data.sub <- big.data[big.data$loc.id == 1 & big.data$year == 1981,]
water.model(big.data.sub)
loc.id year day CN top.FC DC WAT0 Precp ETo WAT RO DR
1 1 1981 1 50 72 0.02 20 52 5 36.0000 NA NA
2 1 1981 2 50 72 0.02 20 12 9 39.0000 0.0000000 0.000000
3 1 1981 3 50 72 0.02 20 3 2 40.0000 0.0000000 0.000000
4 1 1981 4 50 72 0.02 20 81 9 107.8750 3.2091485 0.915817
5 1 1981 5 50 72 0.02 20 37 10 133.4175 0.0000000 1.457501
6 1 1981 6 50 72 0.02 20 61 7 184.5833 0.3937926 2.440475
7 1 1981 7 50 72 0.02 20 14 10 186.0516 0.0000000 2.531665
8 1 1981 8 50 72 0.02 20 9 6 186.5906 0.0000000 2.461032
9 1 1981 9 50 72 0.02 20 77 9 248.3579 2.4498216 3.782815
10 1 1981 10 50 72 0.02 20 18 6 256.4708 0.0000000 3.887159
How do I run this for all location and year?
big.data %>% group_by(loc.id, year) %>% # apply my function here.
My final data should look like the above with three new columns called WAT, RO and DR which are generated when the function is run.
We can split the data and apply the water.model by looping over the list with map
library(tidyverse)
split(big.data, big.data[c('loc.id', 'year')], drop = TRUE) %>%
map_df(water.model)
Or apply the function within do after group_by
big.data %>%
group_by(loc.id, year) %>%
do(data.frame(water.model(.)))
I'm trying to generate prediction of the column "dubina" using this algorithm made in R's "neuralnet" package. But I keep getting non-reliable neural-net output. I have tried changing the number of hidden layers, normalizing and denormalizing data. Is there a mistake in the algorithm, maybe because of the activation function being logistic, not sigmoid?
My dataset:
http://www26.zippyshare.com/v/Nc2Sbuny/file.html
Here is the algorithm:
#Data input
podatci<-read.csv("rs5.csv", sep=";", header=T)
#Division into train and test data
smp_size <- floor(0.75 * nrow(podatci))
smp_size
train_ind <- sample(seq_len(nrow(podatci)), size = smp_size)
train <- podatci[train_ind, ]
test <- podatci[-train_ind, ]
train
#Normalization
train_normal<-(train-min(train))/(max(train)-min(train))
test_normal<-(test-min(test))/(max(test)-min(test))
#Training of the network
install.packages('neuralnet')
library("neuralnet")
n <- names(train_normal)
f <- as.formula(paste("dubina ~", paste(n[!n %in% "dubina"], collapse = " + ")))
net_tr <- neuralnet(f,data=train_normal,hidden=c(3,2),linear.output=T)
plot(net_tr)
#TESTing of the network
pr.nn<-compute(net_tr, test_normal[,1:2])
ls(pr.nn)
print(pr.nn$net.result)
#Printing
cleanoutput <- cbind(test_normal, as.data.frame(pr.nn$net.result))
colnames(cleanoutput) <- c("x","y","dubina","Neural Net Output")
print(cleanoutput)
#Denormalization of output data
denorm<-(min(test) + cleanoutput* (max(test)-min(test)))
denorm
Test sample (part of it, there are 743 data which are split 75/25)
x y dubina
6 6451993 5057986 0
7 6451993 5055986 0
15 6447993 5063986 0
17 6447993 5059986 0
25 6445993 5059986 0
27 6445993 5055986 0
28 6443993 5073986 1980
29 6443993 5071986 1910
30 6443993 5069986 1700
Neural-net output:
x y dubina Neural Net Output
6 6451993 5057986 0 3180.43153834
7 6451993 5055986 0 3342.40866485
15 6447993 5063986 0 2564.23694019
17 6447993 5059986 0 2888.11216362
25 6445993 5059986 0 2822.94367093
27 6445993 5055986 0 3146.83988044
28 6443993 5073986 1980 1624.49331419
29 6443993 5071986 1910 1786.36349347
30 6443993 5069986 1700 1948.24298143