R:for loop execution time? - r

Is there a way to approximate how long a for loop will take to run? I have a loop with about 500,000 iterations which does some basic calculations, and it's been running for a while now. I suspect it might be a never-ending loop.
Here is the code:
# Read an order-book dump and compute, for every row after the first, the
# signed column position of the price touched by that row's event.
#
# Arguments:
#   file  - path to a whitespace-separated file without a header row holding
#           the 67 columns named below (timestamp, sequence number, ten book
#           levels of six columns each, then event code, price, quantity,
#           old price and old quantity).
#   level - accepted for backward compatibility; the computation does not
#           use it.
#           NOTE(review): the hard-coded cut-off 32 equals 2 + 6 * 5, so it
#           presumably encodes the default level of 5 - confirm before
#           parameterising.
#
# Returns a numeric vector with exactly one element per retained row (rows
# whose DateTime is 0 are dropped first). Element 1 is always NA because the
# loop starts at row 2. For the other rows:
#   * NA in column 6 (AP1) or column 63 (C)  -> 0
#   * "N": a = first column of the row equal to Price;
#          0 if a > 32, a if a < 32
#   * "C": as "N" but with the sign flipped
#   * "M": additionally b = first column of the previous row equal to
#          OldPrice; 0 if both exceed 32, a if only b exceeds 32,
#          b - a if both are below 32
#   * everything else (other event codes, an index of exactly 32, a > 32
#     with b < 32, or no match because Price/OldPrice is NA) stays NA
#     instead of aborting the whole run.
mod <- function(file, level = 5) {
  book_fields <- c("BP", "BQ", "BO", "AP", "AQ", "AO")
  df <- read.csv(
    file = file, header = FALSE, sep = "",
    col.names = c(
      "DateTime", "Seq",
      # BP1, BQ1, BO1, AP1, AQ1, AO1, BP2, ..., AO10
      paste0(book_fields, rep(1:10, each = 6)),
      "C", "Price", "Qty", "OldPrice", "OldQty"
    )
  )
  df <- df[which(df$DateTime != 0), ]
  df$DateTime <- as.POSIXct(df$DateTime / (10^9), origin = "1970-01-01") # timestamp conversion

  n <- nrow(df)
  # Preallocate so the result always has one slot per row and is never
  # reallocated while the loop runs.
  change <- rep(NA_real_, n)

  # Row-invariant lookups are extracted once instead of once per iteration.
  event <- as.character(df[[63]])
  skip <- is.na(df[[6]]) | is.na(df[[63]])

  # Index of the first column of `row` equal to `value`; NA when nothing
  # matches (including when `value` itself is NA).
  first_match <- function(row, value) which(row == value)[1]

  # seq_len(n)[-1] is empty for 0 or 1 rows, unlike 2:n which counts down.
  for (i in seq_len(n)[-1]) {
    if (skip[i]) {
      change[i] <- 0
      next
    }
    if (!(event[i] %in% c("N", "C", "M"))) {
      next
    }

    a <- first_match(df[i, ], df[i, 64])
    if (is.na(a)) {
      next
    }

    if (event[i] == "M") {
      b <- first_match(df[i - 1, ], df[i, 66])
      if (is.na(b)) {
        next
      }
      if (a > 32 && b > 32) {
        change[i] <- 0
      } else if (a < 32 && b > 32) {
        change[i] <- a
      } else if (a < 32 && b < 32) {
        change[i] <- b - a
      }
    } else if (a > 32) {
      change[i] <- 0
    } else if (a < 32) {
      change[i] <- if (event[i] == "C") -a else a
    }
  }
  change
}
What I am trying to do is first see what column 63("C") says, if it is "N" or "C" then look at column 64("Price") and locate its position in that row, apart from column 64 itself, and then assign the column number to
change[i]. Make it negative if col63 was "C" and positive if col63 was "N"
If column 63("C") says "M" then look at column 66("OldPrice") first and locate it in the previous row i-1. Then locate the price in column 64("Price") in the same row and take the difference between them(the column numbers) and assign it to change[i]
So the output should be a vector of negative or positive integers.
> dput(df[1:20,])
structure(list(DateTime = c(1.448855100369e+18, 1.448855100369e+18,
1.448855100375e+18, 1.448855100376e+18, 1.448855100378e+18, 1.448855100379e+18,
1.44885510038e+18, 1.44885510038e+18, 1.44885510038e+18, 1.448855100383e+18,
1.448855100384e+18, 1.448855100385e+18, 1.448855100385e+18, 1.448855100385e+18,
1.448855100386e+18, 1.448855100386e+18, 1.448855100386e+18, 1.448855100387e+18,
1.448855100389e+18, 1.448855100389e+18), Seq = c(92L, 108L, 406L,
479L, 643L, 722L, 811L, 822L, 828L, 1046L, 1103L, 1171L, 1186L,
1196L, 1238L, 1249L, 1254L, 1273L, 1333L, 1343L), BP1 = c(80830L,
80830L, 81100L, 81100L, 81100L, 81100L, 81100L, 81100L, 81100L,
81100L, 81100L, 81100L, 81100L, 81100L, 81100L, 81100L, 81200L,
81200L, 81200L, 81200L), BQ1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), BO1 = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), AP1 = c(0L, 83435L, 83435L, 82165L, 82165L, 82165L,
82165L, 82165L, 82345L, 82345L, 82165L, 82345L, 82345L, 82165L,
82340L, 82340L, 82340L, 82340L, 82165L, 82340L), AQ1 = c(0L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), AO1 = c(0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), BP2 = c(0L, 0L, 80830L,
80830L, 80830L, 80830L, 80830L, 80835L, 80835L, 80835L, 80835L,
80835L, 80835L, 80835L, 80835L, 80835L, 81100L, 81100L, 81100L,
81100L), BQ2 = c(0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), BO2 = c(0L, 0L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), AP2 = c(0L, 0L, 0L, 83435L, 83200L, 82650L, 82650L, 82650L,
82650L, 82650L, 82650L, 82650L, 82650L, 82650L, 82650L, 82650L,
82650L, 82650L, 82650L, 82650L), AQ2 = c(0L, 0L, 0L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
AO2 = c(0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), BP3 = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 80830L, 80830L, 80830L, 80830L, 80830L, 80830L, 80830L,
80830L, 80830L, 80835L, 80835L, 80835L, 80835L), BQ3 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), BO3 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), AP3 = c(0L,
0L, 0L, 0L, 83435L, 83200L, 83200L, 83200L, 83200L, 83200L,
83200L, 83200L, 82900L, 82900L, 82900L, 82900L, 82900L, 82900L,
82900L, 82900L), AQ3 = c(0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), AO3 = c(0L,
0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), BP4 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 80830L, 80830L, 80830L, 80830L
), BQ4 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L), BO4 = c(0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L,
1L), AP4 = c(0L, 0L, 0L, 0L, 0L, 83435L, 83430L, 83430L,
83430L, 83430L, 83430L, 83430L, 83200L, 83200L, 83200L, 83200L,
83200L, 83200L, 83200L, 83200L), AQ4 = c(0L, 0L, 0L, 0L,
0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
2L), AO4 = c(0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), BP5 = c(0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 80035L,
80035L, 80035L), BQ5 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L), BO5 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 1L, 1L, 1L), AP5 = c(0L, 0L, 0L, 0L, 0L, 0L, 83435L,
83435L, 83435L, 83435L, 83435L, 83435L, 83430L, 83430L, 83430L,
83430L, 83430L, 83430L, 83430L, 83430L), AQ5 = c(0L, 0L,
0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), AO5 = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), BP6 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L), BQ6 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), BO6 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L), AP6 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 83500L, 83500L, 83500L, 83435L, 83435L, 83435L, 83435L,
83435L, 83435L, 83435L, 83435L), AQ6 = c(0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), AO6 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), BP7 = c(0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L), BQ7 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), BO7 = c(0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), AP7 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 83500L, 83500L, 83500L, 83500L, 83500L, 83500L,
83500L, 83500L), AQ7 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), AO7 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), BP8 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), BQ8 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L), BO8 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), AP8 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L), AQ8 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), AO8 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L), BP9 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), BQ9 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L), BO9 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), AP9 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L), AQ9 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), AO9 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L), BP10 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), BQ10 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L), BO10 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), AP10 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L), AQ10 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), AO10 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L), C = structure(c(4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 3L, 4L, 3L, 3L, 4L, 3L, 3L, 4L, 4L, 4L, 3L, 3L), .Label = c("",
"C", "M", "N"), class = "factor"), Price = c(80830L, 83435L,
81100L, 82165L, 83200L, 82650L, 83430L, 80835L, 82345L, 83500L,
82165L, 82345L, 82900L, 82165L, 82340L, 83200L, 81200L, 80035L,
82165L, 82340L), Qty = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), OldPrice = c(NA,
NA, NA, NA, NA, NA, NA, NA, 82165L, NA, 82345L, 82165L, NA,
82345L, 82165L, NA, NA, NA, 82340L, 82165L), OldQty = c(NA,
NA, NA, NA, NA, NA, NA, NA, 1L, NA, 1L, 1L, NA, 1L, 1L, NA,
NA, NA, 1L, 1L)), .Names = c("DateTime", "Seq", "BP1", "BQ1",
"BO1", "AP1", "AQ1", "AO1", "BP2", "BQ2", "BO2", "AP2", "AQ2",
"AO2", "BP3", "BQ3", "BO3", "AP3", "AQ3", "AO3", "BP4", "BQ4",
"BO4", "AP4", "AQ4", "AO4", "BP5", "BQ5", "BO5", "AP5", "AQ5",
"AO5", "BP6", "BQ6", "BO6", "AP6", "AQ6", "AO6", "BP7", "BQ7",
"BO7", "AP7", "AQ7", "AO7", "BP8", "BQ8", "BO8", "AP8", "AQ8",
"AO8", "BP9", "BQ9", "BO9", "AP9", "AQ9", "AO9", "BP10", "BQ10",
"BO10", "AP10", "AQ10", "AO10", "C", "Price", "Qty", "OldPrice",
"OldQty"), row.names = c(NA, 20L), class = "data.frame")

Here is how I would do this. The only loop needed is to apply which, which should be fast:
# Columns 3:62 are searched; adding 2L turns a position within that range
# back into a column number of the full data frame.
book_cols <- 3:62
event <- DF[, 63]
is_cancel <- event == "C"
is_modify <- event == "M"

# column whose value equals the price in column 64
DF$change <- apply(DF[, book_cols] == DF[, 64], 1, which) + 2L
# flip the sign for "C" rows
DF$change[is_cancel] <- DF$change[is_cancel] * (-1)
# for "M" rows, locate the old price (column 66) in the preceding row
prev_rows <- which(is_modify) - 1
pos2 <- apply(DF[prev_rows, book_cols] == DF[is_modify, 66], 1, which) + 2L
# store the difference of the two positions
DF$change[is_modify] <- pos2 - DF$change[is_modify]
DF$change
#[1] 3 6 3 6 12 12 24 9 0 36 0 0 18 0 0 24 3 27 0 0
This assumes that there is always a matching column. If that's not the case wrap which in a function that returns NA if which returns integer(0).

Related

system is computationally singular : mlogit

When I try to include all variables in the formula, I get this error. If I omit A, the model runs fine.
dput(df1)
structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), Choice = c(1L, 0L, 0L,
0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, 0L,
1L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L,
1L), A = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, -1L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, -1L, 0L, 0L, 1L, 0L, 0L, -1L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), B = c(0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, -1L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 0L, 0L, -1L, 0L, 1L, 0L, 0L, -1L, 0L, 0L),
C = c(1L, 0L, 0L, -1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, -1L, 0L, 0L, 0L, 1L,
0L, 0L, -1L, 0L, 0L, 0L, 0L, 0L, 0L), D = c(0L, 1L, 0L, 0L,
-1L, 0L, 0L, 1L, 0L, 0L, -1L, 0L, 0L, 1L, 0L, 0L, -1L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), E = c(0L, 0L, 1L, 0L, 0L, -1L, 0L, 0L, 1L, 0L,
0L, -1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, -1L, 0L), F = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L,
0L, -1L, 0L, 0L, 1L, 0L, 0L, -1L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 1L, 0L, 0L, -1L), Alternative = c(1L, 2L, 3L, 1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L,
2L, 3L)), row.names = c(NA, -36L), class = "data.frame")
> model = mlogit( Choice ~ A + B + C + D + E + F | 0, data = df1,
+
+ alt.var = 'Alternative',
+
+ shape = "long")
Error in solve.default(H, g[!fixed]) :
system is computationally singular: reciprocal condition number = 4.85723e-17
I have seen these two questions and the documentation; however, I am still not able to figure it out completely. Any help will be highly appreciated.
R mlogit model, computationally singular
Error in mlogit: Error in solve.default(H, g[!fixed]) : system is computationally singular: reciprocal condition number = 3.4767e-18

Non-conformable arrays in neural network - neuralnet package

I am trying to create a neural network using this code:
# Load the tab-separated country table (decimal comma, quoting disabled).
countries <- read.table(
  "countries.txt",
  header = TRUE, sep = "\t", quote = "", dec = ","
)
# install.packages("neuralnet")
library(neuralnet)

# Rows 1-85 form the training set, rows 86-118 the test set.
trainset <- countries[1:85, ]
testset <- countries[86:118, ]

# Fit a network with one hidden layer of 4 units on the training rows.
retea <- neuralnet(
  Tari.europene ~ Enrolement_P + Enrolement_S,
  trainset,
  hidden = 4,
  lifesign = "minimal",
  linear.output = FALSE,
  threshold = 0.1
)
and I get this error:
hidden: 4 thresh: 0.1 rep: 1/1 steps: Error in x - y : non-conformable arrays
Also, the results for this line are:
dput(trainset[c("Tari.europene", "Enrolement_P", "Enrolement_S")])
structure(list(Tari.europene = c(1L, 1L, 0L, 0L, 0L, 1L, 0L,
1L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L,
1L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 1L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 0L), Enrolement_P = c(195720L,
4065L, 10172L, 4780105L, 142517L, 327246L, 103806L, 368783L,
773568L, 51978L, 2133330L, 101667L, 1346171L, 161023L, 16630217L,
258840L, 67023L, 2128685L, 4142775L, 2249164L, 1469099L, 4542644L,
120381L, 13534625L, 475731L, 3176874L, 160819L, 763364L, 53129L,
510613L, 467484L, 64319L, 7695L, 1267930L, 2093835L, 11128030L,
777043L, 77215L, 351766L, 4188552L, 285329L, 2862690L, 4117152L,
628753L, 13061L, 2417429L, 1150042L, 324171L, 393020L, 29892L,
29838440L, 7441078L, 536471L, 861699L, 2862666L, 266201L, 6714539L,
979792L, 1122282L, 8158000L, 2736224L, 114623L, 480923L, 366048L,
683977L, 1929L, 108115L, 35435L, 3178364L, 24072L, 592249L, 105447L,
14627368L, 138420L, 239289L, 5705343L, 5177276L, 1578L, 4401780L,
1222867L, 360206L, 30456129L, 108254L, 425917L, 19431565L), Enrolement_S = c(333291L,
4319L, 8077L, 4450741L, 244543L, 697388L, 90092L, 648541L, 1210112L,
37095L, 896763L, 74227L, 1131625L, 297460L, 24224945L, 518914L,
59823L, 2103459L, 2000076L, 2661089L, 1556372L, 4827962L, 70234L,
4388456L, 460235L, 1418361L, 370356L, 830375L, 58634L, 781392L,
553791L, 58553L, 5663L, 931068L, 1942230L, 8208329L, 625060L,
77873L, 536925L, 5947212L, 281739L, 7201072L, 2265692L, 667718L,
9736L, 1165624L, 619832L, 415971L, 857807L, 37530L, 22586955L,
5794537L, 348116L, 767729L, 4596916L, 223920L, 7227485L, 749134L,
1661586L, 7123778L, 3579411L, 121580L, 370359L, 130836L, 222857L,
3387L, 277349L, 46872L, 2846473L, 30230L, 178968L, 133001L, 12993322L,
245773L, 345223L, 1025975L, 3191268L, 1028L, 3163946L, 1573998L,
400562L, 26894959L, 170834L, 439250L, 11286628L)), row.names = c(NA,
85L), class = "data.frame")
I don't have Nan values and the columns have the same shape.
This is my data frame. Tari.europene indicates whether the country is in Europe (1 = European country; 0 = non-European country). The P and S suffixes on Repeaters, Enrolement and Teachers refer to the primary and secondary cycles of the education system.

Why does metaMDS() produce a horizontal distribution of our data?

We have a species presence table (so binary: 1=present, 0=absent). When using metaMDS of the vegan package, it produces a horizontal distribution of our data when plotted, instead of clusters.
We tried using different distance methods (Euclidean, Bray, Jaccard), but they all seem to produce the same plot.
myfungi.all looks like this:
structure(list(Sample = 1:12, Habitat = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Dune", "Forest"
), class = "factor"), OTU88 = c(0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L,
1L, 1L, 1L, 1L), OTU28 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), OTU165 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L), OTU178 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L,
0L), OTU97 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L
), OTU39 = c(0L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L),
OTU104 = c(1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L
), OTU95 = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L,
0L), OTU90 = c(1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L), OTU119 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L), OTU451 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L,
0L), OTU98 = c(1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L), OTU45 = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L,
1L), OTU2 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L,
1L), OTU24 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), OTU169 = c(0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L), OTU29 = c(1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L), OTU85 = c(0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L,
0L), OTU140 = c(1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L,
0L), OTU42 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L,
0L), OTU70 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L,
0L), OTU25 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L), OTU34 = c(1L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L,
1L), OTU181 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L), OTU201 = c(1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L), OTU17 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L), OTU1146 = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L,
1L, 1L), OTU14 = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 1L,
1L, 1L), OTU72 = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L,
0L, 0L), OTU13 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L,
1L, 1L), OTU20 = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L,
1L, 1L), OTU63 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L), OTU170 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L), OTU262 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L), OTU48 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L), OTU6 = c(0L, 0L, 0L, 1L, 0L, 0L, 1L, 1L, 0L, 0L,
0L, 0L), OTU3 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L,
1L, 1L), OTU31 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L), OTU73 = c(1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 1L,
0L, 0L), OTU32 = c(0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L,
0L, 0L), OTU37 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L), OTU196 = c(0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L,
0L, 0L), OTU5 = c(1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L,
0L, 0L), OTU11 = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L,
0L, 1L), OTU16 = c(0L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L,
0L, 0L), OTU41 = c(0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L,
0L, 0L), OTU71 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L), OTU109 = c(0L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L,
0L, 0L), OTU233 = c(0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L)), class = "data.frame", row.names = c(NA, -12L))
Our script looks like this:
# Read the presence/absence table and keep columns 3 to 51 only, which
# drops the first two columns (Sample and Habitat in the dump shown earlier).
myfungi.all <- read.csv("soil_fungi.csv", header = TRUE)
myfungi <- myfungi.all[, 3:51]

# Two-dimensional NMDS on binary Bray-Curtis dissimilarities.
myfungi.nmds.bc <- metaMDS(myfungi, distance = "bray", k = 2, binary = TRUE)

# Plot with text labels; the title reports the stress of the solution.
plot(
  myfungi.nmds.bc,
  type = "t",
  main = paste(
    "NMDS/Bray-Curtis - Stress =",
    round(myfungi.nmds.bc$stress, 10)
  )
)
Does anyone have suggestions as what seems to be the problem?
At the moment our plot looks like this:
The solution you reported gives a perfect fit (stress nearly 0), and also gives a warning because of this dubious stress. The solution effectively puts your sampling units into two points so that you have absolutely dichotomous data. As Ben Bolker demonstrated, Principal Coordinates Analysis, PCoA (which you also can perform with stats::cmdscale, vegan::wcmdscale or vegan::dbrda) still has points in two major clusters, but spreads points within these clusters. PCoA is a linear method, but NMDS is non-linear and therefore often needs more data. It seems that in this case the treatment of weak ties (read the documentation ?monoMDS or Kruskal's papers cited in that documentation) is the stage that puts the most demand on the data, and setting weakties = FALSE will prevent collapsing non-identical observations into two points:
# Refit the NMDS with weakties = FALSE; every other metaMDS argument is left
# at its default.
m3 <- metaMDS(myfungi, weakties = FALSE)
m3 # stress 0.04124
stressplot(m3) # compare this to your result stressplot(myfungi.nmds.bc)
plot(m3)
The default monoMDS with weakties = TRUE (like Kruskal recommended) will consider the dichotomy of two groups as the only important non-linear difference, but with weakties = FALSE the solutions cannot proceed to zero stress. You still have a dichotomy, but with scatter.
Best guess is that you simply don't have enough data to distinguish two separate environmental axes: when I run your code I get
Warning message: In metaMDS(myfungi[, -(1:2)], distance = "bray", k = 2, binary = TRUE) : stress is (nearly) zero: you may have insufficient data
Out of your 53 species, only 35 are informative (the others appear either at none or at all of the sites):
# Matrix() used by vv() below comes from the Matrix package, so attach it.
library(Matrix)

# Keep only the columns whose variance across the rows is non-zero.
m2 <- myfungi[, apply(myfungi, 2, var) > 0]
ncol(m2) ## 35

# Helper: draw a data frame as an image by way of a Matrix object.
vv <- function(x) (image(Matrix(as.matrix(x))))
How many distinct distribution patterns are there?
nrow(unique(t(m2))) ## 27
You could try PCoA instead:
library(ape)
# PCoA on Bray-Curtis dissimilarities of the filtered table m2.
biplot(pcoa(vegdist(m2, "bray")))
As Jari Oksanen points out, you could also do this with cmdscale() in base R:
# Same ordination with base R's cmdscale(), coloured by habitat.
# NOTE(review): `myfungi` must still contain the Habitat column here; with
# the question's `myfungi <- myfungi.all[, 3:51]` use `myfungi.all$Habitat`.
plot(cmdscale(vegdist(m2, "bray")),
     col = as.numeric(myfungi$Habitat))

Conditional grouped barplot R

I am trying to make a barplot in R for two categorical variables, Dep_meds_at_time_of_rx_2 and phq9_cat. phq9_cat has two levels, 0 and 1, where 0 corresponds to PHQ-L and 1 corresponds to PHQ-H.
Here is my code:
# cross-tabulate medication status (rows) against PHQ-9 category (columns)
counts <- table(data2$Dep_meds_at_time_of_rx_2, data2$phq9_cat)

# turn each column into percentages of its own column total
pcnts <- scale(counts, center = FALSE, scale = colSums(counts)) * 100

# grouped bars, legend, and the rounded percentage printed above y = 1
bar_cols <- c("azure3", "azure4")
bp <- barplot(
  pcnts,
  beside = TRUE,
  col = bar_cols,
  ylab = "Frequency (%)",
  border = NA
)
legend(
  "topright",
  legend = c("PHQ-L", "PHQ-H"),
  bty = "n",
  fill = bar_cols,
  border = NA
)
text(x = bp, y = 1, labels = round(pcnts, digits = 2), cex = 1, pos = 3, col = "black")
And the resulting plot:
Which is great! But I need to only plot the data2$Dep_meds_at_time_of_rx_2==1 category. So I would like a barplot with only the 3.03 bar and the 19.44 bar.
I've exhausted any clever tricks that I know of already such as making the data2$Dep_meds_at_time_of_rx_2==0 bars white and using space = c(-1, 0) to make the data2$Dep_meds_at_time_of_rx_2==1 bars next to one another but then the bars are super wide, like so:
I just need the data2$Dep_meds_at_time_of_rx_2==1 columns, but at a normal width.
Any ideas?
Here is my data:
> dput(data2)
structure(list(Dep_meds_at_time_of_rx_2 = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L,
0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L,
0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L,
0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L,
1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 0L,
0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L), phq9_cat = c(1L,
1L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 1L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L)), .Names = c("Dep_meds_at_time_of_rx_2", "phq9_cat"), row.names = c(NA,
-243L), class = "data.frame")
Here is a minor revision of what you have. I think it gets close to accomplishing what you want:
# Plot only the second row of the percentage table.
treated <- pcnts[2, ]
bar_cols <- c("azure3", "azure4")
bp <- barplot(
  treated,
  beside = TRUE,
  col = bar_cols,
  ylab = "Frequency (%)",
  border = NA
)
legend(
  "topleft",
  legend = c("PHQ-L", "PHQ-H"),
  bty = "n",
  fill = bar_cols,
  border = NA
)
text(x = bp, y = 1, labels = round(treated, digits = 2), cex = 1, pos = 3, col = "black")
Note that if you want to drop the "0" and "1" labels on the x axis, you can do so by replacing pcnts[2, ] with unname(pcnts[2, ]) in the first line:
bp <- barplot(unname(pcnts[2, ]), beside=TRUE, col=c("azure3", "azure4"),
ylab="Frequency (%)", border=NA)
I thought I'd throw in a ggplot2 answer. This solution ensures that both of the labels on the x-axis are 1 - reflecting the status of Anti-depressant use:
library(ggplot2)

# One row per PHQ group, taken from the second row of the percentage table.
df1 <- data.frame(Frequency = pcnts[2, ],
                  PHQ = c('PHQ-L', 'PHQ-H'))

ggplot(df1, aes(x = 1, y = Frequency)) +
  geom_bar(stat = 'identity', aes(fill = PHQ),
           position = position_dodge(width = 1)) +
  scale_fill_manual(values = c('PHQ-L' = 'azure3',
                               'PHQ-H' = 'azure4'),
                    name = '') +
  scale_x_continuous(breaks = c(.75, 1.25),
                     labels = c(1, 1)) +
  xlab('Anti-Depressant use at time of treatment') +
  ylab('Frequency (%)') +
  # annotate() draws each label exactly once; geom_text() with constant
  # x/y/label would draw it once per row of df1, stacking identical text.
  annotate('text', x = .75, y = 2.5, label = '19.44%') +
  annotate('text', x = 1.25, y = 2.5, label = '3.03%') +
  theme_bw()
You just need to select the second row of your percentages table, e.g.
# get percentages of vars
pcnts <- scale(counts, FALSE, colSums(counts))*100
# Filter for the results you want
pcnts <- pcnts[2, ]
# Plot as before
If you want to achieve narrow bars then a combination of width and space arguments will do the trick.
barplot(pcnts, beside=TRUE, col=c("azure3", "azure4"), ylab="Frequency (%)", border=NA, width = c(0,.51, 0,0.51), space = c(1,2))
or you can change the colour to white
barplot(pcnts, beside=TRUE, col=c("white", "azure4"), ylab="Frequency (%)", border=NA, space = c(1,2))

barplot(): Frequency percentages per group

I'm trying to make a grouped barplot with frequency (%) on the y-axis and depression_meds (N/Y) on the x-axis, grouped by another variable score (LOW/HIGH).
My code so far:
# Percentages relative to the grand total of observations (every cell is
# divided by the same sum).
meds <- table(data2$depression_meds, data2$score) / sum(table(data2$score)) * 100
bp <- barplot(meds, beside = TRUE, axes = FALSE, xlab = "Anti-depression meds use",
              names = c("No", "Yes"), col = c("azure3", "azure"),
              ylab = "Frequency (%)", ylim = c(0, 100))
axis(2, at = seq(0, 100, 10))
legend("topright", legend = c("LOW", "HIGH"), bty = "n", fill = c("azure3", "azure"))
# Label the bars with the values actually plotted (`meds`).
text(bp, 0, round(meds, 1), cex = 1, pos = 3)
Which is great and makes the following barplot:
But the percentages are using the total n of 243 (3rd column of the table below), not the n per score group (1st and 2nd columns in table below), which makes sense because that is what I do when I divide by the sum. But that's not what I want. I keep trying to get the frequencies per score group so that the four bars match the 1st and 2nd columns below, but I have run out of ideas. Does anyone have any suggestions?
Depression meds (0=N, 1=Y) LOW (N=99) HIGH (N=144) TOTAL (N=243)
0 96 (97.0%) 116 (80.6%) 212 (87.2%)
1 3 (3.0%) 28 (19.4%) 31 (12.8%)
Here is my data:
> dput(data2)
structure(list(depression_meds = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L,
1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L,
0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L,
0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L), score = c(1L,
1L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 1L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L)), .Names = c("depression_meds", "score"), row.names = c(NA,
-243L), class = "data.frame")
Thanks for your help, my brain isn't working any longer.
Using prop.table in this case is very suitable, which provides a margin parameter to specify by row or by column probability calculation:
meds <- prop.table(table(data2), margin = 2) * 100
meds
# score
# depression_meds 0 1
# 0 96.969697 86.111111
# 1 3.030303 13.888889
Use this for your summary table:
meds <- table(data2)
# score
#depression_meds 0 1
# 0 96 124
# 1 3 20
meds <- scale(meds, FALSE, colSums(meds)) * 100
# score
#depression_meds 0 1
# 0 96.969697 86.111111
# 1 3.030303 13.888889
No need to change the rest of your code:
bp <- barplot(meds, beside=TRUE, axes=FALSE, xlab="Anti-depression meds use", names=c("No", "Yes"), col=c("azure3", "azure"), ylab="Frequency (%)", ylim=c(0,100))
axis(2, at=seq(0,100,10))
legend("topright", legend=c("LOW", "HIGH"), bty="n", fill=c("azure3", "azure"))
text(bp, 0, round(meds, 1), cex=1, pos=3)

Resources