simulating data with bayesian network in R using own specification - r

Say I have a simple DAG representing a confounding variable X = Smoking, a treatment T and outcome Y = Death such that:
T ~ X
Y ~ T + X
Is it possible to produce a synthetic dataset of say 1m observations that follows some specified conditional probabilities:
# Pr(smoking):
smoking <- data.frame(
smoking = c(0, 1),
proba = c(0.7, 0.3)
)
# Pr(treatment | smoking):
treatment <- expand.grid(
smoking = c(0, 1),
treatment = c(0, 1)
) %>% arrange(smoking, treatment)
treatment$proba <- c(0.8, 0.2, 0.45, 0.55)
# Pr(death | treatment, smoking):
death <- expand.grid(
treatment = c(0, 1),
smoking = c(0,1),
dead = c(0,1)
) %>%
arrange(treatment, smoking, dead)
death$proba <- c(0.9, 0.1, 0.2, 0.8, 0.89, 0.11, 0.5, 0.5)
I can do this manually here because it's a very basic DAG but I was wondering if it can be done in another more scalable way, using something like bnlearn .
Current solution:
db <- data.frame(
smoking = rbinom(n = 1000000, size = 1, prob = 0.3)
)
db$treatment[db$smoking == 0] <- rbinom(n = sum(db$smoking == 0), size = 1, prob = 0.2)
db$treatment[db$smoking == 1] <- rbinom(n = sum(db$smoking == 1), size = 1, prob = 0.55)
db$dead[db$treatment == 0 & db$smoking == 0] <- rbinom(
n = sum(db$treatment == 0 & db$smoking == 0),
size = 1, prob = 0.1
)
db$dead[db$treatment == 0 & db$smoking == 1] <- rbinom(
n = sum(db$treatment == 0 & db$smoking == 1),
size = 1, prob = 0.8
)
db$dead[db$treatment == 1 & db$smoking == 0] <- rbinom(
n = sum(db$treatment == 1 & db$smoking == 0),
size = 1, prob = 0.11
)
db$dead[db$treatment == 1 & db$smoking == 1] <- rbinom(
n = sum(db$treatment == 1 & db$smoking == 1),
size = 1, prob = 0.5
)

It will be easier to let existing packages do this for you; like bnlearn. You can use custom.fit to specify the DAG and the CPTs and then use rbn to draw samples from it.
An example
library(bnlearn)
# Specify DAG
net <- model2network("[treatment|smoking][smoking][death|treatment:smoking]")
graphviz.plot(net)
# Define CPTs
smoking <- matrix(c(0.7, 0.3), ncol = 2, dimnames = list(NULL, c("no", "yes")))
treatment <- matrix(c(0.8, 0.2, 0.45, 0.55), ncol = 2, dimnames = list(c("no", "yes"), c("no", "yes")))
death <- array(c(0.9, 0.1, 0.2, 0.8, 0.89, 0.11, 0.5, 0.5), c(2,2,2), dimnames=list(c("no", "yes"), c("no", "yes"), c("no", "yes")))
# Build BN
fit <- custom.fit(net, dist = list(smoking = smoking, treatment = treatment, death = death))
# Draw samples
set.seed(69395642)
samples <- rbn(fit, n=1e6)

Related

Customizing ggpairs to make the correlation matrix more readable

The following code creates a correlation matrix visulization that is not very readable:
1) The text is too large and the numbers inside the cells are not readable
2) The ticks in the x and y axes do not offer information because they are too congested
Could you advise me how to deal with these problems:
The code is the following:
library(GGally)
library(ggplot2)
library(data.table)
library(ROSE)
library(dplyr)
#===================================================================================================================
# LOAD THE DATA
#===================================================================================================================
data(hacide)
train <- hacide.train
#=============================================================================================================
# FEATURE EXTRACTION
#=================================================================================================================
setDT(train)
train <- train %>% mutate(
x11 = ifelse(x1 < -1.4, 1, 0),
x12 = ifelse(((x1 >= -1.4) & (x1 < -0.74)), 1, 0),
x13 = ifelse(((x1 >= -0.74) & (x1 < 1)), 1, 0),
x14 = ifelse(x2 >= 1, 1, 0),
x21 = ifelse(x2 < -1.4, 1, 0),
x22 = ifelse(((x2 >= -1.4) & (x2 < -1)), 1, 0),
x23 = ifelse(((x2 >= -1) & (x2 < 0.5)), 1, 0),
x24 = ifelse(x2 >= 0.5, 1, 0),
x3 = x1 ^ 2 - x2
)
#=========================================================================================================
# EXAMINE CORRELATIONS
#=========================================================================================================
ggpairs(train ,
lower = list(continuous = wrap("points", color = "red", alpha = 0.5),
combo = wrap("box", color = "orange", alpha = 0.3),
discrete = wrap("facetbar", color = "yellow", alpha = 0.3) ),
diag = list(continuous = wrap("densityDiag", color = "blue", alpha = 0.5) ))
The plot is the following:

mapply / expand.grid () for argument's combination with a condition

My question builds on another one previously posted by someone: mapply for all arguments' combinations [R]
I want to apply a function to multiple arguments using mapply, and this works with my code below. But I want to add a condition such that NOT ALL tmin- and tmax- values will be combined, instead only the first tmin with the first tmax, the second tmin with the second tmax (if tmin == 0.01 & tmax == 0.99 or if tmin == 0.05 & tmax == 0.95, but e.g. tmin == 0.01 should not be combined with tmax == 0.95).
But the first elements of tmin & tmax should be combined with ALL variables, all second elements of tmin & tmax should be combined with ALL variables, etc (as below in the expand.grid() function).
In the end I should have a data frame as the one called "alltogether", but I should have 15 rows with the described condition and not 75 as it is the case now.
I could just filter rows with dplyr::filter afterwards, but is there a nice way to include this condition in the function?
Here an example data frame:
dataframe <- data.frame(personID = 1:10,
Var1 = c(4, 6, 3, 3, 7, 1, 20, NA, 12, 2),
Var2 = c(5, 4, 5, 6, 9, 14, 14, 1, 0, NA),
Var3 = c(NA, 15, 12, 0, NA, NA, 2, 7, 6, 7),
Var4 = c(0, 0, 0, 0, 1, 0, 1, 4, 2, 1),
Var5 = c(12, 15, 11, 10, 10, 15, NA, 10, 13, 11))
and here the code I have so far:
des <- function(var, tmin, tmax){
v <- var[var >= quantile(var, probs = tmin, na.rm = TRUE) &
var <= quantile(var, probs = tmax, na.rm = TRUE)]
d <- psych::describe(v)
df <- cbind(variable = deparse(substitute(var)), tmin = tmin, tmax = tmax, d)
print(df)
}
args = expand.grid(var = dataframe[, c("Var2", "Var4", "Var5")], tmin = c(0.01, 0.05, 0.1, 0.2, 0.25), tmax = c(0.99, 0.95, 0.9, 0.8, 0.75))
alltogether <- do.call("rbind", mapply(FUN = des, var = args$var, tmin = args$tmin, tmax = args$tmax, SIMPLIFY = FALSE))
Thank you for helping!
Edit:
The expected output is the one after filtering the "alltogether"-dataframe with the following code (15 obs. of 16 variables):
alltogether <- alltogether%>%
dplyr::filter((tmin == 0.01 & tmax == 0.99) |
(tmin == 0.05 & tmax == 0.95) |
(tmin == 0.1 & tmax == 0.9) |
(tmin == 0.2 & tmax == 0.8) |
(tmin == 0.25 & tmax == 0.75))
OK, here's a solution to both problems. Unfortunately, I couldn't get one using mapply so I had to rely on a good old for loop (but it's still faster, given that it doesn't have to do all the extra calculations). Also, I changed the function to give you the names of the variables as you wanted. The biggest difference is that I'm not using expand.grid but merge. Finally, it incorporates your comment from above.
des <- function(var, tmin, tmax, cor.var, cor.method = c("spearman", "pearson", "kendall")){
var[var < quantile(var, probs = tmin, na.rm = TRUE) |
var > quantile(var, probs = tmax, na.rm = TRUE)] <- NA
d <- psych::describe(var)
correlation<- cor(cor.var, var, use="pairwise.complete", match.arg(cor.method))
df <- cbind(variable = names(var), tmin = tmin, tmax = tmax, d, correlation)
names(df)[length(names(df))]<- paste0("correlation_with_", names(cor.var))
print(df)
}
minmax = data.frame(tmin = c(0.01, 0.05, 0.1, 0.2, 0.25), tmax = c(0.99, 0.95, 0.9, 0.8, 0.75))
args<- merge(c("Var2", "Var4", "Var5"), minmax)
args[,1]<- as.character(args[,1])
alltogether<- NULL
for (i in 1:nrow(args)){
alltogether<- rbind(alltogether, des(var = dataframe[args[i,1]],
tmin = args[i, 2], tmax=args[i, 3], cor.var = dataframe["Var1"]))
}

How to use deepnet for classification in R

When i use code from example:
library(deepnet)
Var1 <- c(rnorm(50, 1, 0.5), rnorm(50, -0.6, 0.2))
Var2 <- c(rnorm(50, -0.8, 0.2), rnorm(50, 2, 1))
x <- matrix(c(Var1, Var2), nrow = 100, ncol = 2)
y <- c(rep(1, 50), rep(0, 50))
nn <- dbn.dnn.train(x, y, hidden = c(5))
it works. But when i use this code:
Var1 <- c(rnorm(50, 1, 0.5), rnorm(50, -0.6, 0.2))
Var2 <- c(rnorm(50, -0.8, 0.2), rnorm(50, 2, 1))
x <- matrix(c(Var1, Var2), nrow = 100, ncol = 2)
**y <- c(rep("1", 50), rep("0", 50))**
nn <- dbn.dnn.train(x, y, hidden = c(5))
i receive error:
Error in batch_y - nn$post[[i]] : non-numeric argument to binary operator
How can i use deepnet package for classification problem?
y1 <- c(rep("1", 50), rep("0", 50))
lead you to character vector which is not acceptable by the package. so that you get error
class(y)
#[1] "character"
The right y should be numeric as follows
y <- c(rep(1, 50), rep(0, 50))
class(y)
#[1] "numeric"
if you see inside your y , you can find that you have 1 or 0 which is a binary values for classification
> table(y)
#y
# 0 1
#50 50
If you want to train as it is mentioned in the manual, you can do the following to train and predict a test set
Var1 <- c(rnorm(50, 1, 0.5), rnorm(50, -0.6, 0.2))
Var2 <- c(rnorm(50, -0.8, 0.2), rnorm(50, 2, 1))
x <- matrix(c(Var1, Var2), nrow = 100, ncol = 2)
y <- c(rep(1, 50), rep(0, 50))
If you now look at your x and y by str just simply write str(x) or str(y) you can see that they are numeric (to make sure, you can check them by class(x) and class(y).
After having your X and y , then you can build your model
dnn <- dbn.dnn.train(x, y, hidden = c(5, 5))
If you have a test set to predict, then you can predict it using for example as is mentioned in the manual
test_Var1 <- c(rnorm(50, 1, 0.5), rnorm(50, -0.6, 0.2))
test_Var2 <- c(rnorm(50, -0.8, 0.2), rnorm(50, 2, 1))
test_x <- matrix(c(test_Var1, test_Var2), nrow = 100, ncol = 2)
nn.test(dnn, test_x, y)
#[1] 0.25
Again your test_x must be numeric. If your problem is that you have the values as character, then you can convert it to numeric by mydata<- as.numeric()

SVM from e1071 R package replaces labels if there is a feature with only 1 unique value

Why SVM from e1071 package replaces original labels by "1" and "2", if there is at least one such column having only one unique value?
For example, the code below works correctly:
trainData <- data.frame("cA" = c(1, 1, 1, 0.99),
"cB" = c(0.5, 0.6, 0.5, 0.3),
"is_match" = factor(c("N", "N", "P", "P")))
testData <- data.frame("cA" = c(1, 1, 0, 0),
"cB" = c(0.2, 0.3, 0.2, 0.1))
model <- svm(is_match ~ ., data = trainData, type = "C-classification")
pred <- predict(model, testData, type = "class")
print(pred)
it returns
1 2 3 4
P P P P
However, if I change 0.99 to 1 in the first column - so that all values become the same - svm changes labels "N" and "P" to "1" and "2":
trainData <- data.frame("cA" = c(1, 1, 1, 1),
"cB" = c(0.5, 0.6, 0.5, 0.3),
"is_match" = factor(c("N", "N", "P", "P")))
testData <- data.frame("cA" = c(1, 1, 0, 0),
"cB" = c(0.2, 0.3, 0.2, 0.1))
model <- svm(is_match ~ ., data = trainData, type = "C-classification")
pred <- predict(model, testData, type = "class")
print(pred)
Such code returns:
1 2 3 4
2 2 2 2
Additional notes:
It happens with all possible values in column (zeros, NAs) as long as they are all the same for each instance
if labels are digits, svm doesn't replace them
other ML methods like rpart or ada works correctly

Output plot of result to genetic algorithm on knapsack doesn't show

You are going to spend a month in the wilderness. You’re taking a backpack with you, however, the maximum weight it can carry is 20 kilograms. You have a number of survival items available, each with its own number of “survival points”. You’re objective is to maximize the number of survival points.
item survivalpoints weight
pocketknife 10.00 1.00
beans 20.00 5.00
potatoes 15.00 10.00
unions 2.00 1.00
sleeping bag 30.00 7.00
rope 10.00 5.00
compass 30.00 1.00
This is my objective
while am executing it is not showing the output
library(genalg)
library(ggplot2)
library(animation)
dataset <- data.frame(item = c("pocketknife", "beans", "potatoes", "unions",
"sleeping bag", "rope", "compass"), survivalpoints = c(10, 20, 15, 2, 30,
10, 30), weight = c(1, 5, 10, 1, 7, 5, 1))
weightlimit <- 20
chromosome = c(1, 0, 0, 1, 1, 0, 0)
dataset[chromosome == 1, ]
cat(chromosome %*% dataset$survivalpoints)
evalFunc <- function(x) {
current_solution_survivalpoints <- x %*% dataset$survivalpoints
current_solution_weight <- x %*% dataset$weight
if (current_solution_weight > weightlimit)
return(0) else return(-current_solution_survivalpoints)
}
iter = 100
GAmodel <- rbga.bin(size = 7, popSize = 200, iters = iter, mutationChance = 0.01,
elitism = T, evalFunc = evalFunc)
cat(summary.rbga(GAmodel))
solution = c(1, 1, 1, 1, 1, 0, 1)
dataset[solution == 1, ]
cat(paste(solution %*% dataset$survivalpoints, "/", sum(dataset$survivalpoints)))
animate_plot <- function(x) {
for (i in seq(1, iter)) {
temp <- data.frame(Generation = c(seq(1, i), seq(1, i)), Variable = c(rep("mean",
i), rep("best", i)), Survivalpoints = c(-GAmodel$mean[1:i], -GAmodel$best[1:i]))
pl <- ggplot(temp, aes(x = Generation, y = Survivalpoints, group = Variable,
colour = Variable)) + geom_line() + scale_x_continuous(limits = c(0,
iter)) + scale_y_continuous(limits = c(0, 110)) + geom_hline(y = max(temp$Survivalpoints),
lty = 2) + annotate("text", x = 1, y = max(temp$Survivalpoints) +
2, hjust = 0, size = 3, color = "black", label = paste("Best solution:",
max(temp$Survivalpoints))) + scale_colour_brewer(palette = "Set1") +
opts(title = "Evolution Knapsack optimization model")
print(pl)
saveMovie(animate_plot(), interval = 0.1, outdir = getwd())
}}

Resources