R two regressions from one table - r

I am trying to plot two different regression lines (with the formula: salary = beta0 + beta1D3 + beta2spending + beta3*(spending*D3) + w) into one scatter plot by deviding the data I have into two subsets as seen in the following code:
salary = data$salary
spending = data$spending
D1 = data$North
D2 = data$South
D3 = data$West
subsetWest = subset(data, D3 == 1)
subsetRest = subset(data, D3 == 0)
abab = lm(salary ~ 1 + spending + 1*spending, data=subsetWest) #red line
caca = lm(salary ~ 0 + spending + 0*spending, data=subsetRest) #blue line
plot(spending,salary)
points(subsetWest$spending, subsetWest$salary, pch=25, col = "red")
points(subsetRest$spending, subsetRest$salary, pch=10, col = "blue")
abline(abab, col = "red")
abline(caca, col = "blue")
This is a sample of my data table:
And this is the plot I get when running the code:
[enter image description here][2] [2]: https://i.stack.imgur.com/It8ai.png
My problem is that the intercept for my second regression is wrong, in fact I do not even get an intercept when looking at the summary, unlike with the first regression.
Does anybody see where my problem is or does anybody know an alternative way of plotting the two regression lines?
Help would be much appreciated. Thank you very much!
This is the whole table:
structure(list(salary = c(39166L, 40526L, 40650L, 53600L, 58940L,
53220L, 61356L, 54340L, 51706L, 49000L, 48548L, 54340L, 60336L,
53050L, 54720L, 43380L, 43948L, 41632L, 36190L, 41878L, 45288L,
49248L, 54372L, 67980L, 46764L, 41254L, 45590L, 43140L, 44160L,
44500L, 41880L, 43600L, 45868L, 36886L, 39076L, 40920L, 42838L,
50320L, 44964L, 41938L, 54448L, 51784L, 45288L, 49280L, 44682L,
51220L, 52030L, 51576L, 58264L, 51690L), spending = c(6692L,
6228L, 7108L, 9284L, 9338L, 9776L, 11420L, 11072L, 8336L, 7094L,
6318L, 7242L, 7564L, 8494L, 7964L, 7136L, 6310L, 6118L, 5934L,
6570L, 7828L, 9034L, 8698L, 10040L, 7188L, 5642L, 6732L, 5840L,
5960L, 7462L, 5706L, 5066L, 5458L, 4610L, 5284L, 6248L, 5504L,
6858L, 7894L, 5018L, 10880L, 8084L, 6804L, 5658L, 4594L, 5864L,
7410L, 8246L, 7216L, 7532L), North = c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), South = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L), West = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L)), class = "data.frame", row.names = c(NA,
-50L))

My problem is that the intercept for my second regression is wrong, in fact I do not even get an intercept when looking at the summary, unlike with the first regression.
That is because your second model specifies no intercept, since you use ... ~ 0 + ...
Also, your first model doesn't make sense because it includes spending twice. The second entry for spending will be ignored by lm

Related

system is computationally singular : mlogit

When I am trying to add all variables in formula, I am getting this error. If I omit A then, the model runs fine.
dput(df1)
structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), Choice = c(1L, 0L, 0L,
0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, 0L,
1L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L,
1L), A = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, -1L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, -1L, 0L, 0L, 1L, 0L, 0L, -1L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), B = c(0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, -1L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 0L, 0L, -1L, 0L, 1L, 0L, 0L, -1L, 0L, 0L),
C = c(1L, 0L, 0L, -1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, -1L, 0L, 0L, 0L, 1L,
0L, 0L, -1L, 0L, 0L, 0L, 0L, 0L, 0L), D = c(0L, 1L, 0L, 0L,
-1L, 0L, 0L, 1L, 0L, 0L, -1L, 0L, 0L, 1L, 0L, 0L, -1L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), E = c(0L, 0L, 1L, 0L, 0L, -1L, 0L, 0L, 1L, 0L,
0L, -1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, -1L, 0L), F = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L,
0L, -1L, 0L, 0L, 1L, 0L, 0L, -1L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 1L, 0L, 0L, -1L), Alternative = c(1L, 2L, 3L, 1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L,
2L, 3L)), row.names = c(NA, -36L), class = "data.frame")
> model = mlogit( Choice ~ A + B + C + D + E + F | 0, data = df1,
+
+ alt.var = 'Alternative',
+
+ shape = "long")
Error in solve.default(H, g[!fixed]) :
system is computationally singular: reciprocal condition number = 4.85723e-17
I have seen these two questions and the documentation however still not able to figure it out completely. Any help will be highly appreciated.
R mlogit model, computationally singular
Error in mlogit: Error in solve.default(H, g[!fixed]) : system is computationally singular: reciprocal condition number = 3.4767e-18

Non-conformable arrays in neural network - neuralnet package

I am trying to create a neural network using this code:
countries=read.table('countries.txt',header =TRUE,sep='\t',quote="",dec=",")
#install.packages("neuralnet")
library(neuralnet)
trainset<-countries[1:85,]
testset<-countries[86:118,]
retea<-neuralnet(Tari.europene~Enrolement_P+Enrolement_S, trainset,
hidden=4, lifesign="minimal", linear.output=FALSE, threshold=0.1)
and I get this error:
hidden: 4 thresh: 0.1 rep: 1/1 steps: Error in x - y : non-conformable arrays
Also , the results for this line are:
dput(trainset[c("Tari.europene", "Enrolement_P", "Enrolement_S")])
structure(list(Tari.europene = c(1L, 1L, 0L, 0L, 0L, 1L, 0L,
1L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L,
1L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 1L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 0L), Enrolement_P = c(195720L,
4065L, 10172L, 4780105L, 142517L, 327246L, 103806L, 368783L,
773568L, 51978L, 2133330L, 101667L, 1346171L, 161023L, 16630217L,
258840L, 67023L, 2128685L, 4142775L, 2249164L, 1469099L, 4542644L,
120381L, 13534625L, 475731L, 3176874L, 160819L, 763364L, 53129L,
510613L, 467484L, 64319L, 7695L, 1267930L, 2093835L, 11128030L,
777043L, 77215L, 351766L, 4188552L, 285329L, 2862690L, 4117152L,
628753L, 13061L, 2417429L, 1150042L, 324171L, 393020L, 29892L,
29838440L, 7441078L, 536471L, 861699L, 2862666L, 266201L, 6714539L,
979792L, 1122282L, 8158000L, 2736224L, 114623L, 480923L, 366048L,
683977L, 1929L, 108115L, 35435L, 3178364L, 24072L, 592249L, 105447L,
14627368L, 138420L, 239289L, 5705343L, 5177276L, 1578L, 4401780L,
1222867L, 360206L, 30456129L, 108254L, 425917L, 19431565L), Enrolement_S = c(333291L,
4319L, 8077L, 4450741L, 244543L, 697388L, 90092L, 648541L, 1210112L,
37095L, 896763L, 74227L, 1131625L, 297460L, 24224945L, 518914L,
59823L, 2103459L, 2000076L, 2661089L, 1556372L, 4827962L, 70234L,
4388456L, 460235L, 1418361L, 370356L, 830375L, 58634L, 781392L,
553791L, 58553L, 5663L, 931068L, 1942230L, 8208329L, 625060L,
77873L, 536925L, 5947212L, 281739L, 7201072L, 2265692L, 667718L,
9736L, 1165624L, 619832L, 415971L, 857807L, 37530L, 22586955L,
5794537L, 348116L, 767729L, 4596916L, 223920L, 7227485L, 749134L,
1661586L, 7123778L, 3579411L, 121580L, 370359L, 130836L, 222857L,
3387L, 277349L, 46872L, 2846473L, 30230L, 178968L, 133001L, 12993322L,
245773L, 345223L, 1025975L, 3191268L, 1028L, 3163946L, 1573998L,
400562L, 26894959L, 170834L, 439250L, 11286628L)), row.names = c(NA,
85L), class = "data.frame")
I don't have Nan values and the columns have the same shape.
This is my data frame. Tari.europene means if the country is from Europe or not (1= European country; 0=non-European country.) The P and S from Repeaters, Enrolement and Teachers means primary and secondary cycles from system education.

Why does metaMDS() produce a horizontal distribution of our data?

We have a species presence table (so binary: 1=present, 0=absent). When using metaMDS of the vegan package, it produces a horizontal distribution of our data when plotted, instead of clusters.
We tried using different distance methods (Euclidean, Bray, Jaccard), but they all seem to produce the same plot.
myfungi.all looks like this:
structure(list(Sample = 1:12, Habitat = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Dune", "Forest"
), class = "factor"), OTU88 = c(0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L,
1L, 1L, 1L, 1L), OTU28 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), OTU165 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L), OTU178 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L,
0L), OTU97 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L
), OTU39 = c(0L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L),
OTU104 = c(1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L
), OTU95 = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L,
0L), OTU90 = c(1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L), OTU119 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L), OTU451 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L,
0L), OTU98 = c(1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L), OTU45 = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L,
1L), OTU2 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L,
1L), OTU24 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), OTU169 = c(0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L), OTU29 = c(1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L), OTU85 = c(0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L,
0L), OTU140 = c(1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L,
0L), OTU42 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L,
0L), OTU70 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L,
0L), OTU25 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L), OTU34 = c(1L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L,
1L), OTU181 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L), OTU201 = c(1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L), OTU17 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L), OTU1146 = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L,
1L, 1L), OTU14 = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 1L,
1L, 1L), OTU72 = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L,
0L, 0L), OTU13 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L,
1L, 1L), OTU20 = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L,
1L, 1L), OTU63 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L), OTU170 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L), OTU262 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L), OTU48 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L), OTU6 = c(0L, 0L, 0L, 1L, 0L, 0L, 1L, 1L, 0L, 0L,
0L, 0L), OTU3 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L,
1L, 1L), OTU31 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L), OTU73 = c(1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 1L,
0L, 0L), OTU32 = c(0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L,
0L, 0L), OTU37 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L), OTU196 = c(0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L,
0L, 0L), OTU5 = c(1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L,
0L, 0L), OTU11 = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L,
0L, 1L), OTU16 = c(0L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L,
0L, 0L), OTU41 = c(0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L,
0L, 0L), OTU71 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L), OTU109 = c(0L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L,
0L, 0L), OTU233 = c(0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L)), class = "data.frame", row.names = c(NA, -12L))
Our script looks like this:
myfungi.all = read.csv("soil_fungi.csv",header=T)
myfungi = myfungi.all[,c(3:51)]
myfungi.nmds.bc <- metaMDS(myfungi, distance = "bray", k = 2, binary = TRUE)
plot(myfungi.nmds.bc, type="t", main=paste("NMDS/Bray-Curtis -?? Stress =", round(myfungi.nmds.bc$stress,10)))
Does anyone have suggestions as what seems to be the problem?
At the moment our plot looks like this:
The solution you reported gives a perfect fit (stress nearly 0), and also gives a warning because of this dubious stress. The solution effectively puts your sampling units into two points so that you have absolutely dichotomous data. As Ben Bolker demonstrated, Principal Coordinates Analysis, PCoA (which you also can perform with stats::cmdscale, vegan::wcmdscale or vegan::dbrda) still has points in two major cluster, but spreads points within these clusters. PCoA is a linear method, but NMDS is non-linear and therefore often needs more data. It seems that in this case the weak ties (read the documentation ?monoMDS or Kruskal's papers cited in that documentation) is the stage that puts most demand on the data, and setting weakties = FALSE will prevent collapsing non-identical observations into two points:
m3 <- metaMDS(myfungi, weakties = FALSE)
m3 # stress 0.04124
stressplot(m3) # compare this to your result stressplot(myfungi.nmds.bc)
plot(m3)
The default monoMDS with weakties = TRUE (like Kruskal recommended) will consider the dichotomy of two groups as the only important non-linear difference, but with weakties = FALSE the solutions cannot proceed to zero stress. You still have a dichotomy, but with scatter.
Best guess is that you simply don't have enough data to distinguish two separate environmental axes: when I run your code I get
Warning message: In metaMDS(myfungi[, -(1:2)], distance = "bray", k = 2, binary = TRUE) : stress is (nearly) zero: you may have insufficient data
Out of your 53 species, only 35 are informative (the others appear either at none or at all of the sites):
m2 <- myfungi[,apply(myfungi,2,var)>0]
ncol(m2) ## 35
vv <- function(x) (image(Matrix(as.matrix(x))))
How many distinct distribution patterns are there?
nrow(unique(t(m2))) ## 27
You could try PCoA instead:
library(ape)
biplot(pcoa(vegdist(m2,"bray"))
As Jari Oksanen points out, you could also do this with cmdscale() in base R:
plot(cmdscale(vegdist(mm,"bray")),
col=as.numeric(myfungi$Habitat))

Conditional grouped barplot R

I am trying to make a barplot in R for two categorical variables, Dep_meds_at_time_of_rx_2 and phq9_cat. phq9_cat has two levels, 0 and 1, where 0 corresponds to PHQ-L and 1 corresponds to PHQ-H.
Here is my code:
# get counts of vars
counts <- table(data2$Dep_meds_at_time_of_rx_2, data2$phq9_cat)
# get percentages of vars
pcnts <- scale(counts, FALSE, colSums(counts))*100
# plot barplot
bp <- barplot(pcnts, beside=TRUE, col=c("azure3", "azure4"), ylab="Frequency (%)", border=NA)
legend("topright", legend=c("PHQ-L", "PHQ-H"), bty="n", fill=c("azure3", "azure4"), border=NA)
text(bp, 1, round(pcnts, 2), cex=1, pos=3, col=c("black"))
And the resulting plot:
Which is great! But I need to only plot the data2$Dep_meds_at_time_of_rx_2==1 category. So I would like a barplot with only the 3.03 bar and the 19.44 bar.
I've exhausted any clever tricks that I know of already such as making the data2$Dep_meds_at_time_of_rx_2==0 bars white and using space = c(-1, 0) to make the data2$Dep_meds_at_time_of_rx_2==1 bars next to one another but then the bars are super wide, like so:
I just need the data2$Dep_meds_at_time_of_rx_2==1 columns, but at a normal width.
Any ideas?
Here is my data:
> dput(data2)
structure(list(Dep_meds_at_time_of_rx_2 = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L,
0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L,
0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L,
0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L,
1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 0L,
0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L), phq9_cat = c(1L,
1L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 1L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L)), .Names = c("Dep_meds_at_time_of_rx_2", "phq9_cat"), row.names = c(NA,
-243L), class = "data.frame")
Here is a minor revision of what you have. I think it gets close to accomplishing what you want:
bp <- barplot(pcnts[2,], beside=TRUE, col=c("azure3", "azure4"), ylab="Frequency (%)",
border=NA)
legend("topleft", legend=c("PHQ-L", "PHQ-H"), bty="n", fill=c("azure3", "azure4"), border=NA)
text(bp[], 1, round(pcnts[2,], 2), cex=1, pos=3, col=c("black"))
Note that if it is desired to drop the "0" "1" labels on the x axis, you can accomplish this by replacing pcnts[2, ] with unname(pcnts) in the first line:
bp <- barplot(unname(pcnts[2, ]), beside=TRUE, col=c("azure3", "azure4"),
ylab="Frequency (%)", border=NA)
I thought I'd throw in a ggplot2 answer. This solution ensures that both of the labels on the x-axis are 1 - reflecting the status of Anti-depressant use:
library(ggplot2)
df1 <- data.frame(Frequency = pcnts[2,],
PHQ = c('PHQ-L','PHQ-H'))
ggplot(df1, aes(x = 1, y = Frequency))+
geom_bar(stat = 'identity', aes(fill = PHQ),
position = position_dodge(width = 1))+
scale_fill_manual(values = c('PHQ-L' = 'azure3',
'PHQ-H' = 'azure4'),
name = '')+
scale_x_continuous(breaks = c(.75, 1.25),
labels = c(1,1))+
xlab('Anti-Depressant use at time of treatment')+
ylab('Frequency (%)')+
geom_text(x = .75, y = 2.5, label = '19.44%')+
geom_text(x = 1.25, y = 2.5, label = '3.03%')+
theme_bw()
You just need to select the second row of your percentages table, e.g.
# get percentages of vars
pcnts <- scale(counts, FALSE, colSums(counts))*100
# Filter for the results you want
pcnts <- pcnts[2, ]
# Plot as before
If you want to achieve narrow bars then a combination of width and space arguments will do the trick.
barplot(pcnts, beside=TRUE, col=c("azure3", "azure4"), ylab="Frequency (%)", border=NA, width = c(0,.51, 0,0.51), space = c(1,2))
or you can change the colour to white
barplot(pcnts, beside=TRUE, col=c("white", "azure4"), ylab="Frequency (%)", border=NA, space = c(1,2))

barplot(): Frequency percentages per group

I'm trying to make a grouped barplot with frequency (%) on the y-axis and depression_meds (N/Y) on the x-axis, grouped by another variable score (LOW/HIGH).
My code so far:
meds <- table(data2$depression_meds,data2$score)/sum(table(data2$score)) * 100
bp <- barplot(meds, beside=TRUE, axes=FALSE, xlab="Anti-depression meds use", names=c("No", "Yes"), col=c("azure3", "azure"), ylab="Frequency (%)", ylim=c(0,100))
axis(2, at=seq(0,100,10))
legend("topright", legend=c("LOW", "HIGH"), bty="n", fill=c("azure3", "azure"))
text(bp, 0, round(medtimerx, 1), cex=1, pos=3)
Which is great and makes the following barplot:
But the percentages are using the total n of 243 (3rd column of the table below), not the n per score group (1st and 2nd columns in table below), which makes sense because that is what I do when I divide by the sum. But that's not what I want. I keep trying to get the frequencies per score group so that the four bars match the 1st and 2nd columns below, but I have run out of ideas. Does anyone have any suggestions?
Depression meds (0=N, 1=Y) LOW (N=99) HIGH (N=144) TOTAL (N=243)
0 96 (97.0%) 116 (80.6%) 212 (87.2%)
1 3 (3.0%) 28 (19.4%) 31 (12.8%)
Here is my data:
> dput(data2)
structure(list(depression_meds = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L,
1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L,
0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L,
0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L), score = c(1L,
1L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 1L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L)), .Names = c("depression_meds", "score"), row.names = c(NA,
-243L), class = "data.frame")
Thanks for your help, my brain isn't working any longer.
Using prop.table in this case is very suitable, which provides a margin parameter to specify by row or by column probability calculation:
meds <- prop.table(table(data2), margin = 2) * 100
meds
# score
# depression_meds 0 1
# 0 96.969697 86.111111
# 1 3.030303 13.888889
Use this for your summary table:
meds <- table(data2)
# score
#depression_meds 0 1
# 0 96 124
# 1 3 20
meds <- scale(meds, FALSE, colSums(meds)) * 100
# score
#depression_meds 0 1
# 0 96.969697 86.111111
# 1 3.030303 13.888889
No need to change your the rest of your code:
bp <- barplot(meds, beside=TRUE, axes=FALSE, xlab="Anti-depression meds use", names=c("No", "Yes"), col=c("azure3", "azure"), ylab="Frequency (%)", ylim=c(0,100))
axis(2, at=seq(0,100,10))
legend("topright", legend=c("LOW", "HIGH"), bty="n", fill=c("azure3", "azure"))
text(bp, 0, round(meds, 1), cex=1, pos=3)

Resources