Negative adjusted R-squared of fixed effect plm function - r

I am currently writing my thesis and when I use the plm (from package plm) function (pooled, within or random) I get a very low R-squared and negative adjusted R-squared. However, when I run the lm() function with dummy variables, the adjusted R-squared is around 0.4. What am I doing wrong?
This is the first three rows of my data:
structure(list(table.id = c("alameda, ca, 2003", "alameda, ca, 2004",
"alameda, ca, 2005"), location = c("Alameda, CA", "Alameda, CA",
"Alameda, CA"), year = c(2003, 2004, 2005), search.fund = c(0,
0, 0), search.fund.binary = c(0, 0, 0), time.avg = c(0, 0, 0),
distance.avg = c(0, 0, 0), avg.income.capita = c(40266, 41973,
43594), real.gdp = c(86355025, 88443534, 90705419), unemployment = c(6.8,
5.9, 5.1), education.rate = c(34.9, 34.9, 34.9), urban.id = c(1,
1, 1), no.establishments = c(46548.75, 46623.5, 46254.25),
no.building.permit = c(14828, 15239, 14883), population.size = c(1454163,
1445721, 1441545), no.establishments.capita = c(0.0320106824338124,
0.0322493067472908, 0.0320865807172166), no.building.permit.capita = c(0.0101969311555857,
0.0105407613225512, 0.0103243395107333)), row.names = c(NA,
3L), class = "data.frame")
This is my lm model:
sf.lm.fe.nb <- lm(search.fund ~ education.rate + unemployment +
urban.id + no.establishments.capita + no.building.permit.capita +
factor(location) + factor(year), data = df)
summary(sf.lm.fe.nb)
and this is my plm model:
sf.plm.fe.nb <- plm(search.fund ~ education.rate + unemployment +
urban.id + no.establishments.capita + no.building.permit.capita,
data = df, model = "within", effect = "twoways",
index = c("location", "year"))
summary(sf.plm.fe.nb)
Both use the same data sets

Related

Fitting age adjusted model in R

I would like to fit a logistic regression model in R but adjust for age only. How can this be done? I know there is a way to fit univariable logistic model using gtsummary tbl_uvregression function. Can we tweak this?
library(tidyverse)
library(gtsummary)
set.seed(100)
trialdata <- tibble(
outcome = factor(rbinom(1000, 1, 0.10),
labels = c("No", "Yes")),
age = floor(rnorm(1000, 16, 3)),
score = floor(runif(1000, 30, 99)),
school = factor(rbinom(1000, 1, 0.35),
labels = c("No", "Yes")),
education = factor(rbinom(1000, 2, 0.20),
labels = c("Primary", "Secondary", "Tertiary"))
)
tbl_uv_ex1 <-
trialdata %>%
tbl_uvregression(
method = glm,
y = outcome,
method.args = list(family = binomial),
exponentiate = TRUE
)
tbl_uv_ex1

How to add environmental variable to DCA plot using ggplot2

I would like to plot an environmental variable on a ggplot2 version of a DCA plot.
I have some code where I extract species and data scores from vegan and then plot them up in ggplot2. I am having trouble trying to work out how I can get my environmental variable SWLI to plot as an arrow - something like this RDA's plots with ggvegan: How can I change text position for arrows text? (or see PCA example here https://www.rpubs.com/an-bui/vegan-cheat-sheet)
Can anybody help?
#DCA Plot
library(plyr)
library(vegan)
library(ggplot2)
library(cluster)
library(ggfortify)
library(factoextra)
#read in csv and remove variables you don't want to go through analysis
regforamcountsall<-read_csv("regionalforamcountsallnocalcs.csv")
swli<-read_csv("DCAenv.csv")
rownames(regforamcountsall)<-regforamcountsall$Sample
regforamcountsall$Sample = NULL
regforamcountsall$Site=NULL
regforamcountsall$SWLI=NULL
#check csv
regforamcountsall
#run ordination
ord<-decorana(regforamcountsall)
#get species scores
summary(ord)
#get DCA values of environmental variable
ord.fit <- envfit(ord ~ SWLI, data=swli, perm=999)
ord.fit
plot(ord, dis="site")
plot(ord.fit)
#use this summary code to get species scores for DCA1 and DCA2
#put species scores values in from ord plot summary stats
species.scores<-read.csv("speciescores.csv")
species.scores$species <- row.names(species.scores)
#Using the scores function from vegan to extract the sample scores and convert to a data.frame
data.scores <- as.data.frame(scores(ord))
# create a column of groupings/clusters, from the rownames of data.scores
data.scores$endgroup <- as.factor(pam(regforamcountsall, 3)$clustering)
#getting the convex hull of each unique point set
find_hull <- function(df) df[chull(data.scores$DCA1, data.scores$DCA2), ]
hulls <- NULL
for(i in 1:length(unique(data.scores$endgroup))){
endgroup_coords <- data.scores[data.scores$endgroup == i,]
hull_coords <- data.frame(
endgroup_coords[chull(endgroup_coords[endgroup_coords$endgroup == i,]$DCA1,
endgroup_coords[endgroup_coords$endgroup == i,]$DCA2),])
hulls <- rbind(hulls,hull_coords)
}
data.scores$numbers <- 1:length(data.scores$endgroup)
regforamcountsall<-read_csv("regionalforamcountsallnocalcs.csv")
rownames(regforamcountsall)<-regforamcountsall$Sample
data.scores$Site<-regforamcountsall$Site
data.scores$SWLI<-regforamcountsall$SWLI
data.scores
#DCA with species
data.scores$Site <- as.character(data.scores$Site)
library(scico)
dca <- ggplot() +
# add the point markers
geom_point(data=data.scores,aes(x=DCA1,y=DCA2,colour=SWLI,pch=Site),size=4) + geom_point(data=species.scores,aes(x=DCA1,y=DCA2),size=3,pch=3,alpha=0.8,colour="grey22") +
# add the hulls and labels - numbers position labels
geom_polygon(data = hulls,aes(x=DCA1,y=DCA2,fill=endgroup), alpha = 0.25) +
#geom_text(data=data.scores,aes(x=DCA1-0.03,y=DCA2,colour=endgroup, label = numbers))+
geom_text(data=species.scores,aes(x=DCA1+0.1,y=DCA2+0.1, label = species))+
#look this up
geom_segment(data=ord.fit,aes(x = 0, y = 0, xend=DCA1,yend=DCA2), arrow = arrow(length = unit(0.3, "cm")))+
theme_classic()+
scale_color_scico(palette = "lapaz")+
coord_fixed()
dca
#regforamcountsall data
structure(list(Sample = c("T3LB7.008", "T3LB7.18", "T3LB7.303",
"WAP 0 ST-2", "T3LB7.5", "LG120"), T.salsa = c(86.63793102, 68.5897436,
70.39274924, 5.199999999, 79.15057916, 44.40000001), H.wilberti = c(0,
0, 0, 0, 0.386100386, 9.399999998), Textularia = c(0, 0, 0, 0,
0, 0.4), T.irregularis = c(2.155172414, 10.25641026, 7.854984897,
0, 2.702702703, 0), P.ipohalina = c(0, 0, 0, 0, 0, 0), J.macrescens = c(4.741379311,
5.769230769, 4.833836859, 5.800000001, 8.108108107, 5.400000001
), T.inflata = c(6.465517244, 15.38461538, 16.918429, 83.2, 5.791505794,
40.4), S.lobata = c(0, 0, 0, 2.300000001, 0, 0), M.fusca = c(0,
0, 0, 3.499999999, 3.861003862, 0), A.agglutinans = c(0, 0, 0,
0, 0, 0), A.exiguus = c(0, 0, 0, 0, 0, 0), A.subcatenulatus = c(0,
0, 0, 0, 0, 0), P.hyperhalina = c(0, 0, 0, 0, 0, 0), SWLI = c(200,
197.799175, 194.497937, 192.034776, 191.746905, 190.397351),
Site = c("LSP", "LSP", "LSP", "WAP", "LSP", "LG")), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))
#data.scores
structure(list(DCA1 = c(-1.88587476921648, -1.58550534382589,
-1.59816311314591, -0.0851161831632892, -1.69080448670088, -1.14488987340879
), DCA2 = c(0.320139736602921, 0.226662031865046, 0.230912045301637,
-0.0531232712001122, 0.272143119753744, 0.0696939776869396),
DCA3 = c(-0.755595015095353, -0.721144380683279, -0.675071834919103,
0.402339366526422, -0.731006052784081, 0.00474996849420783
), DCA4 = c(-1.10780013276303, -0.924265835490466, -0.957711953532202,
-0.434438970032073, -0.957873836258657, -0.508347000558056
), endgroup = structure(c(1L, 1L, 1L, 2L, 1L, 1L), .Label = c("1",
"2", "3"), class = "factor"), numbers = 1:6, Site = c("LSP",
"LSP", "LSP", "WAP", "LSP", "LG"), SWLI = c(200, 197.799175,
194.497937, 192.034776, 191.746905, 190.397351)), row.names = c(NA,
6L), class = "data.frame")
#species.scores
structure(list(species = c("1", "2", "3", "4", "5", "6"), DCA1 = c(-2.13,
-1.6996, -2.0172, -0.9689, 1.0372, -0.3224), DCA2 = c(0.342,
-0.8114, 0.3467, -0.3454, 2.0007, 0.9147)), row.names = c(NA,
6L), class = "data.frame")

Displaying SPEI with ggplot: geom_area ends/starts at different positions when sign changes

Here's some data
structure(list(Period = structure(c(2017.83333333333, 2017.91666666667,
2018, 2018.08333333333, 2018.16666666667, 2018.25, 2018.33333333333,
2018.41666666667, 2018.5, 2018.58333333333, 2018.66666666667,
2018.75, 2018.83333333333, 2018.91666666667, 2019, 2019.08333333333,
2019.16666666667, 2019.25, 2019.33333333333, 2019.41666666667,
2019.5), class = "yearmon"), neg = c(0, 0, 0, 0, 0, 0, 0, 0,
-0.782066446199374, -1.33087717414387, -1.55401649141939, -1.9056578851487,
-2.19869230289699, -1.99579537718088, -2.03857957860623, -2.14184701726747,
-2.27461866979037, -2.39022691659445, -2.3732334198156, -1.83686080707261,
-1.86553025598681), pos = c(0.550567625206492, 0.699954781241267,
0.775518140437689, 0.647367030217637, 0.84562688020279, 0.923814518387379,
0.686796306801202, 0.131849327496122, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0)), row.names = 960:980, class = "data.frame")
I want to plot the SPEI values with ggplot as I learned it here:
How to format the x-axis of the hard coded plotting function of SPEI package in R?
library(ggplot2)
ggplot(test) +
geom_area(aes(x = Period, y = pos), fill = "blue", col = "black") +
geom_area(aes(x = Period, y = neg), fill = "red", col = "black") +
scale_y_continuous(limits = c(-2.25, 2.25),
breaks = -2:2) +
ylab("SPEI") + xlab("") +
theme_bw()
The result looks like this:
As you can see, when the sign changes from positive to negative, geom_area doesn't end/start at the same position. Anyone any idea how to fix this? I thought about using Date instead of yearmon, but got stuck with the same problem.
This is a gates and posts problem: Each geom_area at the inflection is starting and ending on a post, hence the overlap. They should be starting in the middle of the gate between the posts.
This solution may be a bit heavy handed but I think it should apply where there are multiple changes from positive to negative and vice versa.
library(ggplot2)
library(tidyr)
library(tibble)
library(dplyr)
library(lubridate)
library(imputeTS)
Determine when the data changes from positive to negative or vice versa
inflections <-
test %>%
mutate(inflect = case_when(lag(neg) == 0 & pos == 0 ~ TRUE,
lag(pos) == 0 & neg == 0 ~ TRUE,
TRUE ~ FALSE),
rowid = row_number() - 0.5) %>%
filter(inflect) %>%
select(-inflect) %>%
mutate(Period = NA_Date_,
pos = 0,
neg = 0)
Insert a new row to mark the inflection point to allow inclusion of an intermediary time where both pos and neg can be zero.
test1 <-
test %>%
rowid_to_column() %>%
bind_rows(inflections) %>%
arrange(rowid)
Impute a time when the data changes from pos to neg with a function from imputeTS.
test1$Period <- na_interpolation(as.ts(test1$Period))
plot
ggplot(test1) +
geom_area(aes(x = Period, y = pos), fill = "blue", col = "black") +
geom_area(aes(x = Period, y = neg), fill = "red", col = "black") +
scale_y_continuous(limits = c(-2.25, 2.25),
breaks = -2:2) +
ylab("SPEI") + xlab("") +
theme_bw()
data
```
test <- structure(list(Period = structure(c(2017.83333333333, 2017.91666666667,
2018, 2018.08333333333, 2018.16666666667, 2018.25, 2018.33333333333,
2018.41666666667, 2018.5, 2018.58333333333, 2018.66666666667,
2018.75, 2018.83333333333, 2018.91666666667, 2019, 2019.08333333333,
2019.16666666667, 2019.25, 2019.33333333333, 2019.41666666667,
2019.5), class = "yearmon"), neg = c(0, 0, 0, 0, 0, 0, 0, 0,
-0.782066446199374, -1.33087717414387, -1.55401649141939, -1.9056578851487,
-2.19869230289699, -1.99579537718088, -2.03857957860623, -2.14184701726747,
-2.27461866979037, -2.39022691659445, -2.3732334198156, -1.83686080707261,
-1.86553025598681), pos = c(0.550567625206492, 0.699954781241267,
0.775518140437689, 0.647367030217637, 0.84562688020279, 0.923814518387379,
0.686796306801202, 0.131849327496122, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0)), row.names = 960:980, class = "data.frame")
```
<sup>Created on 2020-05-22 by the [reprex package](https://reprex.tidyverse.org) (v0.3.0)</sup>

Stargazer Confidence Interval Incorrect?

So I am really fond of the stargazer package for displaying the statistics for regression models. I've been using R and Stata together to complete some problems in a textbook. One issue that I have found is that the confidence interval printed by the stargazer package does not correspond to the confidence interval by stata. I determined that the CI in stata is the correct one after doing it by hand.
Because the issue might may possibly lie in how I am handling the data, I offer it here as an optional choice. My primary concern is to determine why the CI's do not respond. From a previous post, here is one possible way of finding the data I am using;
install.packages("devtools") # if not already installed
library(devtools)
install_git("https://github.com/ccolonescu/PoEdata")
library(PoEdata) # loads the package in memory
library(multcomp) # for hypo testing
data(fair4) # loads the data set of interest
In Stata, the name of the dataset I am using is called fair4.dta. For the data itself, you can use it manually,
structure(list(year = structure(c(1880, 1884, 1888, 1892, 1896,
1900, 1904, 1908, 1912, 1916, 1920, 1924, 1928, 1932, 1936, 1940,
1944, 1948, 1952, 1956, 1960, 1964, 1968, 1972, 1976, 1980, 1984,
1988, 1992, 1996, 2000, 2004, 2008), label = "year", format.stata = "%9.0g"),
vote = structure(c(50.2200012207031, 49.8460006713867, 50.4140014648438,
48.2680015563965, 47.7599983215332, 53.1710014343262, 60.0060005187988,
54.4830017089844, 54.7080001831055, 51.681999206543, 36.1189994812012,
58.2439994812012, 58.8199996948242, 40.8409996032715, 62.4580001831055,
54.9990005493164, 53.773998260498, 52.3699989318848, 44.5950012207031,
57.7639999389648, 49.9129981994629, 61.3440017700195, 49.5960006713867,
61.7890014648438, 48.9480018615723, 44.6969985961914, 59.1699981689453,
53.9020004272461, 46.5449981689453, 54.7360000610352, 50.2649993896484,
51.2330017089844, 46.5999984741211), label = "Incumbent share of the two-party presidential vote", format.stata = "%9.0g"),
party = structure(c(-1, -1, 1, -1, 1, -1, -1, -1, -1, 1,
1, -1, -1, -1, 1, 1, 1, 1, 1, -1, -1, 1, 1, -1, -1, 1, -1,
-1, -1, 1, 1, -1, -1), label = "= 1 if Democratic incumbent at election time; -1 if a Republican incumbent", format.stata = "%9.0g"),
person = structure(c(0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1,
0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0,
1, 0), label = "= 1 if incumbent is running for election and 0 otherwise", format.stata = "%9.0g"),
duration = structure(c(1.75, 2, 0, 0, 0, 0, 1, 1.25, 1.5,
0, 1, 0, 1, 1.25, 0, 1, 1.25, 1.5, 1.75, 0, 1, 0, 1, 0, 1,
0, 0, 1, 1.25, 0, 1, 0, 1), label = "number of terms incumbent administration in power", format.stata = "%9.0g"),
war = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0), label = "= 1 for elections of 1920, 1944, and 1948 and 0 otherwise.", format.stata = "%9.0g"),
growth = structure(c(3.87899994850159, 1.58899998664856,
-5.55299997329712, 2.76300001144409, -10.0240001678467, -1.42499995231628,
-2.4210000038147, -6.2810001373291, 4.16400003433228, 2.22900009155273,
-11.4630002975464, -3.87199997901917, 4.6230001449585, -14.4989995956421,
11.7650003433228, 3.90199995040894, 4.27899980545044, 3.5789999961853,
0.690999984741211, -1.45099997520447, 0.377000004053116,
5.10900020599365, 5.04300022125244, 5.91400003433228, 3.75099992752075,
-3.59699988365173, 5.44000005722046, 2.17799997329712, 2.66199994087219,
3.12100005149841, 1.21899998188019, 2.69000005722046, 0.219999998807907
), label = "growth rate GDP in first three quarters of the election year", format.stata = "%9.0g"),
inflation = structure(c(1.97399997711182, 1.05499994754791,
0.603999972343445, 2.2739999294281, 3.41000008583069, 2.54800009727478,
1.44200003147125, 1.87899994850159, 2.17199993133545, 4.2519998550415,
0, 5.16099977493286, 0.18299999833107, 7.19999980926514,
2.49699997901917, 0.0810000002384186, 0, 0, 2.36199998855591,
1.93499994277954, 1.96700000762939, 1.25999999046326, 3.13899993896484,
4.81500005722046, 7.63000011444092, 7.83099985122681, 5.25899982452393,
2.90599989891052, 3.27999997138977, 2.06200003623962, 1.60500001907349,
2.32500004768372, 2.88000011444092), label = "growth rate of GDP deflator during first 15 quarters of admin", format.stata = "%9.0g"),
goodnews = structure(c(9, 2, 3, 7, 6, 7, 5, 8, 8, 3, 0, 10,
7, 4, 9, 8, 0, 0, 7, 5, 5, 10, 7, 4, 5, 5, 8, 4, 2, 4, 8,
1, 3), label = "number of quarters in first 15 with real GDP per capita growth > 3.2", format.stata = "%9.0g")), notes = c("more complete variable definitions in fair.def",
"1"), .Names = c("year", "vote", "party", "person", "duration",
"war", "growth", "inflation", "goodnews"), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -33L))
So here is the stargazer code that is giving me trouble:
presidential <- read_dta("~/Directory/fair4.dta")
pres.lm = lm(vote ~ growth, data = subset(presidential,
presidential$year >= 1916)
stargazer(pres.lm,
type = "text",
intercept.bottom = T,
digits = 5,
report = "vc*stp",
ci = T
)
confint(pres.lm, level = 0.95)
Consider the difference in the confidence intervals.
(0.52948, 1.24241) # in R, Stargazer
0.5087671 1.263126 # in R, confint(pres.lm)
.5087671 1.263126 # in Stata
I also calculated by hand for the confidence intervals and the confit() and the Stata numbers check out. The t-critical value for this dataset should be t_(N-2 , prob) = t(22,.0025) = -2.073873.
In addition, I made sure to create an entirely new data frame. That is, instead of subsetting within the the lm() argument, I subset it first. When comparing this method to the previous one, I still get the same exact (incorrect) confidence intervals.
# subset into a new dataframe
presidential.1 = subset(presidential, presidential$year >= 1916)
# create the model
pres.lm.2 = lm(vote ~ growth, data = presidential.1)
# compare the two
stargazer(pres.lm,pres.lm.2,
type = "text",
intercept.bottom = F,
digits = 5,
report = "vc*stp",
ci = T,
t.auto = T)
(1) (2)
-----------------------------------------------------------------------
Constant 50.84840*** 50.84840***
(48.86384, 52.83295) (48.86384, 52.83295)
t = 50.21835 t = 50.21835
p = 0.00000 p = 0.00000
growth 0.88595*** 0.88595***
(0.52948, 1.24241) (0.52948, 1.24241)
t = 4.87126 t = 4.87126
p = 0.00008 p = 0.00008
# correct intervals from Stata and R's confint()
growth 0.5087671 1.263126
Am I running the code incorrectly? It really isn't a big deal for me to run the stargazer command and print only the coefficients and the t-stats, but it is kind of disappointing that I would have to run confint() as a separate command given that the output for Stargazer is gorgeous. It is quite odd because the coefficient estimates and the t-statistics are perfect. The confidence intervals are off by varying degrees, and I would like to know what the cause of this might be. Any advice would be greatly appreciated.
The simple answer is that stata and confint calculate confidence intervals using the t-distribution, while stargazer's internal method uses the normal distribution. The result is that the former two are more conservative in their estimates and thus have wider CI compared to stargazer. (Well, I'm assuming with stata here, but since it gives the same results as confint I feel it is a safe assumption).
Looking deep into the source code for stargazer (line 688ff) we can find how CIs are calculated:
z.value <- qnorm((1 + .format.ci.level.use)/2)
coef <- .global.coefficients[.global.coefficient.variables[which.variable],i]
se <- .global.std.errors[.global.coefficient.variables[which.variable],i]
ci.lower.bound <- coef - z.value * se
ci.upper.bound <- coef + z.value * se
It uses qnorm to set the critical value.
Compare to confint:
a <- (1 - level)/2
a <- c(a, 1 - a)
fac <- qt(a, object$df.residual) ##Relevant line, uses T-distribution
pct <- format.perc(a, 3)
ci <- array(NA, dim = c(length(parm), 2L), dimnames = list(parm,
pct))
ses <- sqrt(diag(vcov(object)))[parm]
ci[] <- cf[parm] + ses %o% fac
Compare:
#Using normal/z distribution
> pres.lm$coefficients[2] + sqrt(diag(vcov(pres.lm)))[2] %o% c(-qnorm((1 + 0.95)/2), qnorm((1 + 0.95)/2))
[,1] [,2]
growth 0.5294839 1.242409
#Using t-distribution with df degrees of freedom
> df <- pres.lm$df.residual
> pres.lm$coefficients[2] + sqrt(diag(vcov(pres.lm)))[2] %o% c(-qt((1 + 0.95)/2, df), qt((1 + 0.95)/2, df))
[,1] [,2]
growth 0.5087671 1.263126
Probably the easiest way to handle this if you are committed to stargazer is to use the ci.custom argument:
> stargazer(pres.lm, type = "text", ci.custom = list(confint(pres.lm)))
===============================================
Dependent variable:
---------------------------
vote
-----------------------------------------------
growth 0.886***
(0.509, 1.263)
Constant 50.848***
(48.749, 52.948)
-----------------------------------------------
Observations 24
R2 0.519
Adjusted R2 0.497
Residual Std. Error 4.798 (df = 22)
F Statistic 23.729*** (df = 1; 22)
===============================================
Note: *p<0.1; **p<0.05; ***p<0.01
Once the sample size is sufficiently large, the t-distribution converges on the z-distribution and the differences between the CIs become much smaller.
set.seed(432)
x1 <- rnorm(10000, 100, 50)
u <- 2 * rnorm(10000)
y <- 50 + x1 * 0.752 * u
fit <- lm(y ~ x1)
> confint(fit)
2.5 % 97.5 %
(Intercept) 39.29108955 54.1821315
x1 -0.02782141 0.1061173
> stargazer(fit, type= "text", ci = T)
===============================================
Dependent variable:
---------------------------
y
-----------------------------------------------
x1 0.039
(-0.028, 0.106)
Constant 46.737***
(39.292, 54.181)
-----------------------------------------------
Observations 10,000
R2 0.0001
Adjusted R2 0.00003
Residual Std. Error 168.194 (df = 9998)
F Statistic 1.313 (df = 1; 9998)
===============================================
Note: *p<0.1; **p<0.05; ***p<0.01
With a sample size of 24, the t-distribution with 22 degrees of freedom has much fatter tails than the z!

Create new column with percentages in data frame

I have the following dataframe:
dput(df1)
structure(list(month = c(1, 1, 2, 2, 3, 4), transaction_type = c("AAA",
"BBB", "BBB", "CCC",
"DDD", "AAA"), max_wt_per_month = c(54.9,
51.6833333333333, 52.3333333333333, 49.4666666666667, 49.85,
48.5833333333333), min_wt_per_month = c(0, 0, 0, 0, 0, 0), avg_wt_per_month = c(8.41701333107861,
7.65211141060198, 6.44184012508551, 7.74798927613941, 7.4360566888844,
7.50611319574734), prop = c(Inf, Inf, Inf, Inf, Inf, Inf)), .Names = c("month",
"transaction_type", "max_wt_per_month", "min_wt_per_month", "avg_wt_per_month",
"prop"), row.names = c(NA, -6L), class = c("grouped_df", "tbl_df",
"tbl", "data.frame"), vars = list(month), drop = TRUE, indices = list(
0:5), group_sizes = 6L, biggest_group_size = 6L, labels = structure(list(
month = 1), row.names = c(NA, -1L), class = "data.frame", vars = list(
month), drop = TRUE, .Names = "month"))
I want to create column prop that would contain the percentage of maximum waiting time with respect to each month. If I run this code, then I get Inf values in most of the rows... (especially it is evident in the real dataset):
my_fun=function(vec){
100*as.numeric(vec[3]) /
sum(with(data_merged_transactions, ifelse(month == vec[1], max_wt_per_month, 0))) }
data_merged_transactions$prop=apply(data_merged_transactions , 1 , my_fun)
I then finally need to create the filled area chart so that each area would be a percentage out of 100%:
ggplot(data_merged_transactions, aes(x=month, y=prop, fill=transaction_type)) +
geom_area(alpha=0.6 , size=1, colour="black")
Why do I get Inf if the sum is not equal to 0?
Moreover, is it possible to create filled area chart with months being factors (Jan, Feb,etc.), not numbers? I tried to substitute month id's by month names, but then I got very thin bars instead of a filled area.
Is this what you were looking for?
library(tidyverse)
df1_tidy <- df1 %>%
group_by(month) %>%
summarise(SUM = sum(max_wt_per_month)) %>%
full_join(df1) %>%
mutate(prop = max_wt_per_month / SUM)
ggplot(data = df1_tidy,
aes(x = month,
y = prop,
fill = transaction_type)) +
geom_area(alpha = 0.6,
size = 1,
colour = "black") +
scale_x_continuous(labels = c("Jan", "Feb", "Mar", "Apr"))

Resources