I have the following data frame, df1:
Name Y_N FIPS score1 score2
1: Alabama 0 1 2633 8
2: Alaska 0 2 382 1
3: Arizona 1 4 2695 41
4: Arkansas 1 5 2039 10
5: California 1 6 27813 524
6: Colorado 0 8 8609 133
7: Connecticut 1 9 5390 111
8: Delaware 0 10 858 3
9: Florida 1 12 14172 215
10: Georgia 1 13 9847 308
11: Hawaii 0 15 720 0
12: Idaho 1 16 845 7
I would like to perform a t-test to see whether score1 differs based on Y_N, and then plot the two groups against each other. I have made a boxplot (image omitted).
Instead, I now want to change from a boxplot to a plot that shows all of the individual points, plus a horizontal line at each group mean with 95% confidence bars. How is this done? I would also like to add the p-value as text in a corner of the graph.
I might try:
text(x = max(df1$Y_N)+1,
y = min(df1$score1)+20000,
labels = paste0(
"\np-value = ",
round(coef_lm[2,4],5),
pos = 4)
But I realize that coef_lm[2, 4] is a test statistic from a linear model, not from a t-test. How do I access the outputs of a t-test?
I'm not sure why you added that extra point in your code. But on your original data, you might use ggplot2 and ggpubr.
Edit
Now more like your paint drawing.
library(ggplot2)
library(ggpubr)
ggplot(df1, aes(x = as.factor(Y_N), y = score1)) +
  geom_jitter(position = position_jitter(0.1)) +
  # mean_cl_normal needs the Hmisc package installed
  stat_summary(fun.data = "mean_cl_normal", geom = "errorbar", width = 0.3) +
  # a zero-height errorbar draws a horizontal line at the group mean
  stat_summary(fun = "mean", geom = "errorbar", aes(ymax = after_stat(y), ymin = after_stat(y)), col = "red", width = 0.5) +
  stat_compare_means(method = "t.test") +
  xlab("Group") + ylab("Score 1")
Original Data
df1 <- structure(list(Name = structure(1:12, .Label = c("Alabama", "Alaska",
"Arizona", "Arkansas", "California", "Colorado", "Connecticut",
"Delaware", "Florida", "Georgia", "Hawaii", "Idaho"), class = "factor"),
Y_N = c(0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 1L, 0L, 1L),
FIPS = c(1L, 2L, 4L, 5L, 6L, 8L, 9L, 10L, 12L, 13L, 15L,
16L), score1 = c(2633L, 382L, 2695L, 2039L, 27813L, 8609L,
5390L, 858L, 14172L, 9847L, 720L, 845L), score2 = c(8L, 1L,
41L, 10L, 524L, 133L, 111L, 3L, 215L, 308L, 0L, 7L)), class = "data.frame", row.names = c("1:",
"2:", "3:", "4:", "5:", "6:", "7:", "8:", "9:", "10:", "11:",
"12:"))
Alternatively, without having to install ggpubr, you can calculate the p-value outside of ggplot2 and use the annotate() function to add it to the plot:
pval <- t.test(score1~Y_N,data = df)$p.value
library(ggplot2)
ggplot(df, aes(x = as.factor(Y_N), y = score1, fill = as.factor(Y_N), color = as.factor(Y_N)))+
geom_boxplot(alpha = 0.3, color = "black", outlier.shape = NA)+
geom_jitter(show.legend = FALSE)+
annotate(geom = "text", label = paste("p.value: ",round(pval,3)), x = 1.5, y = max(df$score1)*0.9)
EDIT: Without a boxplot
Alternatively to the boxplot, if you want to have individual points and a bar representing the mean, you can first calculate the mean per group in a new dataset (here I'm using the dplyr package to do it):
library(dplyr)
Mean_df <- df %>% group_by(Y_N) %>% summarise(Mean = mean(score1))
# A tibble: 2 x 2
Y_N Mean
<int> <dbl>
1 0 2640.
2 1 8972.
Then, you can plot individual points using geom_jitter and the mean using geom_errorbar by calling the new dataset Mean_df:
library(ggplot2)
ggplot(df, aes(x = as.factor(Y_N), y = score1))+
geom_jitter(show.legend = FALSE, width = 0.2)+
geom_errorbar(inherit.aes = FALSE, data = Mean_df,
aes(x = as.factor(Y_N),ymin = Mean, ymax = Mean),
color = "red",width = 0.2)+
annotate(geom = "text", label = paste("p.value: ",round(pval,3)),
x = 1.5, y = max(df$score1)*0.9)
Reproducible example
df <- structure(list(Name = c("Alabama", "Alaska", "Arizona", "Arkansas",
"California", "Colorado", "Connecticut", "Delaware", "Florida",
"Georgia", "Hawaii", "Idaho"), Y_N = c(0L, 0L, 1L, 1L, 1L, 0L,
1L, 0L, 1L, 1L, 0L, 1L), FIPS = c(1L, 2L, 4L, 5L, 6L, 8L, 9L,
10L, 12L, 13L, 15L, 16L), score1 = c(2633L, 382L, 2695L, 2039L,
27813L, 8609L, 5390L, 858L, 14172L, 9847L, 720L, 845L), score2 = c(8L,
1L, 41L, 10L, 524L, 133L, 111L, 3L, 215L, 308L, 0L, 7L)), row.names = c(NA,
-12L), class = c("data.table", "data.frame"))
dd <- structure(list(Name = c("Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut", "Delaware", "Florida", "Georgia", "Hawaii", "Idaho"), Y_N = c(0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 1L, 0L, 1L), FIPS = c(1L, 2L, 4L, 5L, 6L, 8L, 9L, 10L, 12L, 13L, 15L, 16L), score1 = c(2633L, 382L, 2695L, 2039L, 27813L, 8609L, 5390L, 858L, 14172L, 9847L, 720L, 845L), score2 = c(8L, 1L, 41L, 10L, 524L, 133L, 111L, 3L, 215L, 308L, 0L, 7L)), row.names = c(NA, -12L), class = c("data.table", "data.frame"))
## frame
boxplot(score1 ~ Y_N, dd, border = NA)
## middle 95% of the data (2.5%/97.5% quantiles), medians
sp <- split(dd$score1, dd$Y_N)
sapply(seq_along(sp), function(ii) {
x <- sp[[ii]]
arrows(ii, quantile(x, 0.025), ii, quantile(x, 0.975), code = 3, angle = 90, length = 0.1)
segments(ii - 0.05, median(x), ii + 0.05, col = 'red', lwd = 2)
})
points(dd$Y_N + 1, dd$score1, col = dd$Y_N + 1)
## t-test
lbl <- sprintf('p = %s', format.pval(t.test(score1 ~ Y_N, dd)$p.value, digits = 2))
mtext(lbl, at = par('usr')[2], adj = 1)
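Note that the quantile() whiskers above span the middle 95% of the raw data rather than a confidence interval for the mean. If you want the latter, a sketch reusing the sp list with a per-group t.test():
sapply(seq_along(sp), function(ii) {
  x <- sp[[ii]]
  ci <- t.test(x)$conf.int  # 95% CI of the group mean
  arrows(ii, ci[1], ii, ci[2], code = 3, angle = 90, length = 0.1)
  segments(ii - 0.05, mean(x), ii + 0.05, col = 'red', lwd = 2)
})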
One of your questions relates to how to access the t.test statistics. Here's an answer to that question. Suppose you have this type of data:
set.seed(12)
YN <- sample(0:1, 100, replace = T)
score1 <- sample(500:1500, 100, replace = T)
df <- data.frame(YN, score1)
And suppose further that you run and store the t.test like this:
test <- tapply(df$score1, df$YN, t.test)
Then you can access the test statistics bit by bit like this, illustrated here for the factor level 0:
test$`0`$p.value # p-value
test$`0`$conf.int # confidence interval
test$`0`$estimate # estimate
test$`0`$statistic # statistic
Now obviously you will not want to do it manually bit by bit but in a more automated and systematic way. This is how you can achieve this:
df1 <- do.call(rbind, lapply(test, function(x) c(
statistic = unname(x$statistic),
ci = unname(x$conf.int),
est = unname(x$estimate),
pval = unname(x$p.value))))
The output is this:
statistic ci1 ci2 est pval
0 22.31155 837.3901 1003.263 920.3265 5.484012e-27
1 22.91558 870.5426 1037.810 954.1765 3.543693e-28
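If you don't mind an extra dependency, the broom package (an assumption here, it isn't used above) tidies "htest" objects into a data frame with the same information:
library(broom)
do.call(rbind, lapply(test, tidy))
# each row: estimate, statistic, p.value, conf.low, conf.high, ...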
Related
I want to compare the means of the variables in a barplot.
This is a portion of my dataframe.
Group Gender Age Anxiety_score Depression_score IUS
1 Anxiety 0 25 32 29 12
2 Anxiety 1 48 34 28 11
3 Anxiety 0 32 48 32 12
4 Anxiety 1 24 43 26 12
5 Anxiety 1 18 44 26 15
6 Control 0 45 12 11 3
7 Control 0 44 11 11 5
8 Control 1 26 21 10 5
9 Control 1 38 12 NA 2
10 Control 0 18 13 10 1
I'd like to create a barplot where each variable (Gender, Age, Anxiety_score, depression_score, IUS, ...) represents a bar and I'd like to have this for each group (anxiety vs control next to each other, not stacked) on the same graph. The height of the bar would represent the mean. For gender, I'd like to have the gender ratio. I also want to map the variables on the y axis. How do I do this in R?
This type of problem generally has to do with reshaping the data: it is in wide format and needs to be in long format. See this post on how to reshape data from wide to long format.
Then, group by Group and name, compute the means and plot. (Since Gender is coded 0/1, its mean is the proportion coded 1, which serves as the gender ratio asked about.)
library(dplyr)
library(tidyr)
library(ggplot2)
df1 %>%
pivot_longer(-Group) %>%
group_by(Group, name) %>%
summarise(value = mean(value, na.rm = TRUE), .groups = "drop") %>%  # Depression_score has an NA
ggplot(aes(name, value, fill = Group)) +
geom_col(position = position_dodge()) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Data
df1 <-
structure(list(Group = c("Anxiety", "Anxiety", "Anxiety", "Anxiety",
"Anxiety", "Control", "Control", "Control", "Control", "Control"
), Gender = c(0L, 1L, 0L, 1L, 1L, 0L, 0L, 1L, 1L, 0L), Age = c(25L,
48L, 32L, 24L, 18L, 45L, 44L, 26L, 38L, 18L), Anxiety_score = c(32L,
34L, 48L, 43L, 44L, 12L, 11L, 21L, 12L, 13L), Depression_score = c(29L,
28L, 32L, 26L, 26L, 11L, 11L, 10L, NA, 10L), IUS = c(12L, 11L,
12L, 12L, 15L, 3L, 5L, 5L, 2L, 1L)), class = "data.frame",
row.names = c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10"))
Are you looking for something like this?
library(tidyverse)
df1 %>%
pivot_longer(
-Group
) %>%
group_by(Group, name) %>%
summarise(Mean=mean(value, na.rm=TRUE)) %>%
ggplot(aes(x=factor(Group), y=Mean, fill=name))+
geom_col(aes(group=name), position = "dodge") +
geom_text(
aes(label = Mean, y = Mean + 0.05),
position = position_dodge(0.9),
vjust = 0
)
I was wondering how to perform multiple independent linear regressions on a combination of levels for two or more factor variables.
Let's say our dataset has one dependent continuous variable, and then two factor independent variables and one continuous independent variable.
Then let's say our regression formula in R is this:
model <- lm(weight ~ city + diet + height)
Or, to write it in pseudocode, I'm trying to do this:
lm(weight ~ height) %>% group by city
lm(weight ~ height) %>% group by diet
lm(weight ~ height) %>% group by city & diet
I know that we could run a linear regression for each city and diet one by one, but do you know of a way we could create a loop so that we do an independent regression for each city and diet in our dataset?
To illustrate this better, I've made a fake dataset (image omitted) and listed the three types of outputs I would want. However, I don't want to do them manually one by one, but would rather use a loop.
Does anyone know how to do this in R?
We can define the model specification in a list and then use lapply() over the list of desired models.
Code
models <- list("m1" = c("weight", "height"),
"m2" = c("weight", "height", "city"),
"m3" = c("weight", "height", "diet"),
"m4" = c("weight", "height", "diet", "city"))
lapply(models, function(x){
lm(weight ~ ., data = df[, x])
})
# $m1
#
# Call:
# lm(formula = weight ~ ., data = df[, x])
#
# Coefficients:
# (Intercept) height
# -0.2970 0.1219
#
#
# $m2
#
# Call:
# lm(formula = weight ~ ., data = df[, x])
#
# Coefficients:
# (Intercept) height cityHouston
# -0.3705 0.1259 0.1205
#
#
# $m3
#
# Call:
# lm(formula = weight ~ ., data = df[, x])
#
# Coefficients:
# (Intercept) height dietVegan dietVegetarian
# -0.1905 0.1270 -0.1288 -0.1757
#
#
# $m4
#
# Call:
# lm(formula = weight ~ ., data = df[, x])
#
# Coefficients:
# (Intercept) height dietVegan dietVegetarian cityHouston
# -0.2615 0.1310 -0.1417 -0.1663 0.1197
Data
df <- data.frame("weight" = rnorm(100),
"height" = rexp(100),
"diet" = as.factor(sample(c("Vegan", "Vegetarian", "Meat"), replace = TRUE, 100)),
"city" = as.factor(sample(c("Houston", "Chicago"), replace = TRUE, 100)))
First define a small regfun that computes the desired summary statistics. Then, using by, apply it group-wise. For the combination of two groups we may paste the columns together, or use the interaction operator : for factors.
regfun <- function(x) summary(lm(w ~ h, x))$coe[2, c(1, 4)]
do.call(rbind, by(d, d$city, regfun))
# Estimate Pr(>|t|)
# a -0.1879530 0.4374580
# b -0.2143780 0.4674864
# c -0.2866948 0.5131854
do.call(rbind, by(d, d$diet, regfun))
# Estimate Pr(>|t|)
# y -0.1997162 0.3412652
# z -0.3512349 0.4312766
# do.call(rbind, by(d, Reduce(paste, d[1:2]), regfun))
with(d, do.call(rbind, by(d, city:diet, regfun))) ## credits to @G.Grothendieck
# Estimate Pr(>|t|)
# a y -0.2591764 0.5576043
# a z -0.1543536 0.8158689
# b y -0.1966501 0.7485405
# b z -0.4354839 0.7461538
# c y -0.5000000 0.3333333
# c z -1.0671642 0.7221495
Edit
If we have an unbalanced panel, i.e. with(d, city:diet) gives "impossible" combinations that aren't actually in the data, we have to code this slightly differently. You can think of by as a combination of first split, then lapply, so let's do that. Because we'll get errors, we may use tryCatch to provide a placeholder in their place.
s <- with(d2, split(d2, city:diet))
do.call(rbind, lapply(s, function(x)
tryCatch(regfun(x),
error=function(e) cbind.data.frame(Estimate=NA, `Pr(>|t|)`=NA))))
# Estimate Pr(>|t|)
# a:y -0.2591764 0.5576043
# a:z NA NA
# b:y 5.2500000 NaN
# b:z NA NA
# c:y -0.5000000 0.3333333
# c:z 9.5000000 NaN
# d:y NA NA
# d:z 1.4285714 NaN
# e:y NA NA
# e:z -7.0000000 NaN
# f:y NA NA
# f:z 2.0000000 NaN
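If you would rather drop the genuinely empty combinations than carry NA rows, split() can discard unused factor combinations via its drop argument; the tryCatch is still needed for groups with too few points to fit a slope. A sketch:
s <- with(d2, split(d2, city:diet, drop = TRUE))
do.call(rbind, lapply(s, function(x)
  tryCatch(regfun(x),
           error = function(e) cbind.data.frame(Estimate = NA, `Pr(>|t|)` = NA))))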
Data:
d <- structure(list(city = structure(c(1L, 2L, 3L, 1L, 2L, 3L, 1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), .Label = c("a",
"b", "c"), class = "factor"), diet = structure(c(1L, 1L, 1L,
2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L), .Label = c("y",
"z"), class = "factor"), id = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L), w = c(66L, 54L, 50L,
74L, 59L, 53L, 67L, 75L, 66L, 64L, 73L, 56L, 53L, 74L, 54L, 63L,
69L, 75L), h = c(152L, 190L, 174L, 176L, 185L, 186L, 180L, 194L,
154L, 169L, 183L, 177L, 189L, 152L, 182L, 191L, 173L, 179L)), out.attrs = list(
dim = c(city = 3L, diet = 2L, id = 3L), dimnames = list(city = c("city=a",
"city=b", "city=c"), diet = c("diet=y", "diet=z"), id = c("id=1",
"id=2", "id=3"))), row.names = c(NA, -18L), class = "data.frame")
d2 <- structure(list(city = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 1L,
2L, 3L, 4L, 5L, 3L, 1L, 6L, 3L, 6L, 2L, 3L), .Label = c("a",
"b", "c", "d", "e", "f"), class = "factor"), diet = structure(c(1L,
1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L,
2L), .Label = c("y", "z"), class = "factor"), id = c(1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L
), w = c(66L, 54L, 50L, 74L, 59L, 53L, 67L, 75L, 66L, 64L, 73L,
56L, 53L, 74L, 54L, 63L, 69L, 75L), h = c(152L, 190L, 174L, 176L,
185L, 186L, 180L, 194L, 154L, 169L, 183L, 177L, 189L, 152L, 182L,
191L, 173L, 179L)), out.attrs = list(dim = c(city = 3L, diet = 2L,
id = 3L), dimnames = list(city = c("city=a", "city=b", "city=c"
), diet = c("diet=y", "diet=z"), id = c("id=1", "id=2", "id=3"
))), row.names = c(NA, -18L), class = "data.frame")
To expand on my comments about the efficiency of the pooled analysis over that of the subgroup analyses...
Using starwars as (a less than ideal) starting point:
library(dplyr)  # provides the starwars data and the pipe verbs
library(tidyr)  # provides replace_na()
d <- starwars %>%
filter(mass < 1000) %>% # Exclude Jabba
mutate(maleOrNot=ifelse(sex=="male", sex, "other")) %>%
replace_na(list(maleOrNot="other"))
For the sake of argument, say we want to regress a character's mass based only on whether they are male or not and their height and then obtain the standard error of the predicted mass at the mean height.
pData <- d %>%
group_by(maleOrNot) %>%
summarise(height=mean(height), .groups="drop")
pData
# A tibble: 2 x 2
maleOrNot height
* <chr> <dbl>
1 male 178.
2 other 162.
By group analyses
lapply(
d %>% pull(maleOrNot) %>% unique(),
function(x) {
m <- lm(mass ~ height, d %>% filter(maleOrNot == x))
predict(m, pData %>% filter(maleOrNot == x), se.fit=TRUE)$se.fit
}
)
[[1]]
[1] 2.656427
[[2]]
[1] 5.855176
Now the pooled analysis:
m <- lm(mass ~ maleOrNot + height, d)
predict(m, pData, se.fit=TRUE)$se.fit
1 2
2.789770 4.945734
The prediction for the males is slightly (5%) less precise, but for non-males, precision is improved by 15.5%.
But the model isn't particularly good. Perhaps an interaction model will improve things:
m <- lm(mass ~ maleOrNot:height, d)
predict(m, pData, se.fit=TRUE)$se.fit
1 2
2.776478 4.880154
Now the figures are 4.5% worse and 16.7% better. Including other terms in the model may well improve the precision even more.
In general terms (though there are exceptions), fitting a pooled model is unlikely to reduce precision compared to fitting several subgroup models and can substantially improve it. This is because all groups contribute to the estimation of the (common) variance.
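One way to see this with the data above (a sketch, comparing the residual degrees of freedom available for estimating the error variance):
# pooled fit: one common sigma estimated from all observations
df.residual(lm(mass ~ maleOrNot + height, d))
# subgroup fits: each sigma estimated from its own group only
sapply(split(d, d$maleOrNot), function(g) df.residual(lm(mass ~ height, g)))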
In terms of computation time:
library(microbenchmark)
byGroup <- function() {
lapply(
d %>% pull(maleOrNot) %>% unique(),
function(x) {
m <- lm(mass ~ height, d %>% filter(maleOrNot == x))
predict(m, pData %>% filter(maleOrNot == x), se.fit=TRUE)$se.fit
}
)
}
pooled <- function() {
m <- lm(mass ~ maleOrNot + height, d)
predict(m, pData, se.fit=TRUE)$se.fit
}
microbenchmark(byGroup(), pooled(), times = 100)
Unit: nanoseconds
expr min lq mean median uq max neval
byGroup 44 45.5 55.22 47 48 891 100
pooled 42 44.0 60.27 46 47 1434 100
So for this simple case, there's virtually no difference. More complex examples may give different answers.
I have two pairs of columns, and I want to merge each pair into one column. I am a beginner in R, so I do not know if what I'm asking is considered simple or not, but I didn't manage to find what I was searching for on the Internet.
The first pair of columns are "issue_d" (month and year of loan issue) and "earliest_cr_line" (month and year of earliest credit line being opened by the borrower). They are both filled with values that look this way "Dec-2011". I want to create a new joint column to replace them both by subtracting the date of "earliest_cr_line" from "issue_d" for each row, so that my new column "credit_history_length" represents how much time passed since each borrower opened their first credit before issuing their respective loan. How can I do that? By the way, str(dataframe) shows that the values in both columns are "chr", which stands for "character", I believe; I was somewhat confused by the whole factor/character problem in several posts I have seen here, so I thought I better provide this piece of information just in case.
Additionally, I have a second pair of columns, "fico_range_high" and "fico_range_low"; both contain "int" values like "600" and "608". I want to replace them with a single column showing the arithmetic mean for each row, like "604" for the example above.
How can I create the two desired columns?
Edit: As MrSmithGoesToWashington kindly advised, I've put a reproducible sample here:
structure(list(X.1 = 1:5, X = 1:5, id = c(1077501L, 1077430L,
1077175L, 1076863L, 1075358L), loan_amnt = c(5000L, 2500L, 2400L,
10000L, 3000L), term = c(" 36 months", " 60 months", " 36 months",
" 36 months", " 60 months"), grade = c("B", "C", "C", "C", "B"
), sub_grade = c("B2", "C4", "C5", "C1", "B5"), emp_length = c("10+ years",
"< 1 year", "10+ years", "10+ years", "1 year"), home_ownership = c("RENT",
"RENT", "RENT", "RENT", "RENT"), annual_inc = c(24000, 30000,
12252, 49200, 80000), verification_status = c("Verified", "Source Verified",
"Not Verified", "Source Verified", "Source Verified"), issue_d = c("Dec-2011",
"Dec-2011", "Dec-2011", "Dec-2011", "Dec-2011"), loan_status = c("Fully Paid",
"Charged Off", "Fully Paid", "Fully Paid", "Fully Paid"), purpose = c("credit_card",
"car", "small_business", "other", "other"), dti = c(27.65, 1,
8.72, 20, 17.94), delinq_2yrs = c(0L, 0L, 0L, 0L, 0L), earliest_cr_line = c("Jan-1985",
"Apr-1999", "Nov-2001", "Feb-1996", "Jan-1996"), fico_range_low = c(735L,
740L, 735L, 690L, 695L), fico_range_high = c(739L, 744L, 739L,
694L, 699L), inq_last_6mths = c(1L, 5L, 2L, 1L, 0L), mths_since_last_delinq = c(NA,
NA, NA, 35L, 38L), open_acc = c(3L, 3L, 2L, 10L, 15L), pub_rec = c(0L,
0L, 0L, 0L, 0L), revol_util = c("83.7%", "9.4%", "98.5%", "21%",
"53.9%"), total_acc = c(9L, 4L, 10L, 37L, 38L), acc_now_delinq = c(0L,
0L, 0L, 0L, 0L), chargeoff_within_12_mths = c(0L, 0L, 0L, 0L,
0L), delinq_amnt = c(0L, 0L, 0L, 0L, 0L), pub_rec_bankruptcies = c(0L,
0L, 0L, 0L, 0L), tax_liens = c(0L, 0L, 0L, 0L, 0L)), row.names = c(NA,
5L), class = "data.frame")
Naming your input data dat, we could use the following:
library(tidyverse)
dat %>%
mutate(issue_d_date = as.Date(paste0(issue_d, "-01"), format = "%b-%Y-%d"),
earliest_cr_line_date = as.Date(paste0(earliest_cr_line, "-01"), format = "%b-%Y-%d"),
credit_history_length = issue_d_date - earliest_cr_line_date,
fico_mean = apply(across(c(fico_range_high, fico_range_low)), 1, mean, na.rm = TRUE))
Note: since you want to calculate the difference between dates, they also need a day component, so I first appended the first day of each month to your two date variables, and then calculated their difference.
Note 2: it seems your fico_range_high is always 4 points higher than fico_range_low. So instead of calculating the mean, a possible alternative could be to simply add 2 to fico_range_low.
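A minimal sketch of that shortcut, assuming the 4-point gap holds across the whole dataset:
dat %>%
  mutate(fico_mean = fico_range_low + 2)  # midpoint, since high = low + 4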
Here's a snapshot of the relevant variables:
issue_d earliest_cr_line fico_range_high fico_range_low issue_d_date
1 Dec-2011 Jan-1985 739 735 2011-12-01
2 Dec-2011 Apr-1999 744 740 2011-12-01
3 Dec-2011 Nov-2001 739 735 2011-12-01
4 Dec-2011 Feb-1996 694 690 2011-12-01
5 Dec-2011 Jan-1996 699 695 2011-12-01
earliest_cr_line_date credit_history_length fico_mean
1 1985-01-01 9830 days 737
2 1999-04-01 4627 days 742
3 2001-11-01 3682 days 737
4 1996-02-01 5782 days 692
5 1996-01-01 5813 days 697
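As an aside, a reasonably recent lubridate (an assumption, it isn't used above) can parse the "Dec-2011" month-year format directly with my(), which returns the first day of the month:
library(lubridate)
dat %>%
  mutate(credit_history_length = my(issue_d) - my(earliest_cr_line))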
This should work:
library(tidyverse)
df <- structure(list(X.1 = 1:5, X = 1:5, id = c(1077501L, 1077430L,
1077175L, 1076863L, 1075358L), loan_amnt = c(5000L, 2500L, 2400L,
10000L, 3000L), term = c(" 36 months", " 60 months", " 36 months",
" 36 months", " 60 months"), grade = c("B", "C", "C", "C", "B"
), sub_grade = c("B2", "C4", "C5", "C1", "B5"), emp_length = c("10+ years",
"< 1 year", "10+ years", "10+ years", "1 year"), home_ownership = c("RENT",
"RENT", "RENT", "RENT", "RENT"), annual_inc = c(24000, 30000,
12252, 49200, 80000), verification_status = c("Verified", "Source Verified",
"Not Verified", "Source Verified", "Source Verified"), issue_d = c("Dec-2011",
"Dec-2011", "Dec-2011", "Dec-2011", "Dec-2011"), loan_status = c("Fully Paid",
"Charged Off", "Fully Paid", "Fully Paid", "Fully Paid"), purpose = c("credit_card",
"car", "small_business", "other", "other"), dti = c(27.65, 1,
8.72, 20, 17.94), delinq_2yrs = c(0L, 0L, 0L, 0L, 0L), earliest_cr_line = c("Jan-1985",
"Apr-1999", "Nov-2001", "Feb-1996", "Jan-1996"), fico_range_low = c(735L,
740L, 735L, 690L, 695L), fico_range_high = c(739L, 744L, 739L,
694L, 699L), inq_last_6mths = c(1L, 5L, 2L, 1L, 0L), mths_since_last_delinq = c(NA,
NA, NA, 35L, 38L), open_acc = c(3L, 3L, 2L, 10L, 15L), pub_rec = c(0L,
0L, 0L, 0L, 0L), revol_util = c("83.7%", "9.4%", "98.5%", "21%",
"53.9%"), total_acc = c(9L, 4L, 10L, 37L, 38L), acc_now_delinq = c(0L,
0L, 0L, 0L, 0L), chargeoff_within_12_mths = c(0L, 0L, 0L, 0L,
0L), delinq_amnt = c(0L, 0L, 0L, 0L, 0L), pub_rec_bankruptcies = c(0L,
0L, 0L, 0L, 0L), tax_liens = c(0L, 0L, 0L, 0L, 0L)), row.names = c(NA,
5L), class = "data.frame")
df_new <- df %>% mutate(earliest_cr_line = parse_date(paste0("01-", earliest_cr_line), "%d-%b-%Y")) %>%
mutate(issue_d = parse_date(paste0("01-", issue_d), "%d-%b-%Y")) %>%
mutate(c_hist = (issue_d - earliest_cr_line)) %>%
mutate(c_hist_months = round(as.numeric(c_hist)/30, 0)) %>%
mutate(fico_mean = (fico_range_low + fico_range_high)/2) %>%
select(earliest_cr_line, issue_d, c_hist, c_hist_months, fico_mean)
df_new
#> earliest_cr_line issue_d c_hist c_hist_months fico_mean
#> 1 1985-01-01 2011-12-01 9830 days 328 737
#> 2 1999-04-01 2011-12-01 4627 days 154 742
#> 3 2001-11-01 2011-12-01 3682 days 123 737
#> 4 1996-02-01 2011-12-01 5782 days 193 692
#> 5 1996-01-01 2011-12-01 5813 days 194 697
Created on 2021-01-17 by the reprex package (v0.3.0)
I'm trying to change the color of my statebins map. I'm super new to R and still learning a lot, and googling around hasn't really helped.
This is what the sample looks like:
fipst stab state color
1 37 NC North Carolina 2
2 1 AL Alabama 2
3 28 MS Mississippi 2
4 5 AR Arkansas 2
5 47 TN Tennessee 2
6 45 SC South Carolina 1
7 23 ME Maine 2
49 32 NV Nevada 1
50 15 HI Hawaii 2
51 11 DC District of Columbia 2
digitaltax <- structure(list(fipst = c(37L, 1L, 28L, 5L, 47L, 45L, 23L, 32L,15L, 11L), stab = c("NC", "AL", "MS", "AR", "TN", "SC", "ME","NV", "HI", "DC"), state = c("North Carolina", "Alabama", "Mississippi","Arkansas", "Tennessee", "South Carolina", "Maine", "Nevada","Hawaii", "District of Columbia"), color = c(2L, 2L, 2L, 2L,2L, 1L, 2L, 1L, 2L, 2L)), row.names = c(1L, 2L, 3L, 4L, 5L, 6L,7L, 49L, 50L, 51L), class = "data.frame")
library(dplyr)
library(ggplot2)
library(statebins)
mutate(
digitaltax,
share = cut(color, breaks = 3, labels = c("No sales tax", "Exempts digital goods", "Taxes digital goods"))
) %>%
statebins(
value_col = "share", font_size = 2.5,
ggplot2_scale_function = scale_fill_brewer,
name = ""
) +
labs(title = "Which states tax digital products?") + theme_statebins()
This produces a map with a range of blues. How can I change the color? No matter what I've tried and found on Google, it always throws this error:
Error in ggplot2_scale_function(...) : could not find function "ggplot2_scale_function"
Any help at all would be super appreciated. Thank you!
One approach is to use named RColorBrewer palettes with brewer_pal =:
library(statebins)
statebins(digitaltax,
          value_col = "color",
          # note: digitaltax has no `share` column; derive the legend from `color`
          breaks = length(unique(digitaltax$color)),
          labels = c("No sales tax", "Exempts digital goods"),
          brewer_pal = "Dark2") +
  labs(title = "Which states tax digital products?")
Execute this command to see all palettes:
library(RColorBrewer)
display.brewer.all()
Using your data and most of your code, I changed DC's color to 3 so that the plot shows all three categories.
library(dplyr)
library(ggplot2)
library(statebins)
# changes DC to be color 3
digitaltax <- structure(list(fipst = c(37L, 1L, 28L, 5L, 47L, 45L, 23L, 32L,15L, 11L), stab = c("NC", "AL", "MS", "AR", "TN", "SC", "ME","NV", "HI", "DC"), state = c("North Carolina", "Alabama", "Mississippi","Arkansas", "Tennessee", "South Carolina", "Maine", "Nevada","Hawaii", "District of Columbia"), color = c(2L, 2L, 2L, 2L,2L, 1L, 2L, 1L, 2L, 3L)), row.names = c(1L, 2L, 3L, 4L, 5L, 6L,7L, 49L, 50L, 51L), class = "data.frame")
mutate(
digitaltax,
share = cut(color, breaks = 3, labels = c("No sales tax", "Exempts digital goods", "Taxes digital goods"))
) %>%
statebins(
value_col = "share", font_size = 2.5,
ggplot2_scale_function = scale_fill_brewer,
name = ""
) +
labs(title = "Which states tax digital products?") +
theme_statebins()
Created on 2020-05-12 by the reprex package (v0.3.0)
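To change the palette itself with this approach, it should be possible to pass a palette argument through statebins() to scale_fill_brewer, since extra arguments such as name = "" above appear to be forwarded to the scale function; treat this as a sketch based on that assumption:
mutate(
  digitaltax,
  share = cut(color, breaks = 3, labels = c("No sales tax", "Exempts digital goods", "Taxes digital goods"))
) %>%
  statebins(
    value_col = "share", font_size = 2.5,
    ggplot2_scale_function = scale_fill_brewer,
    palette = "Dark2",  # assumption: passed on to scale_fill_brewer
    name = ""
  ) +
  labs(title = "Which states tax digital products?") +
  theme_statebins()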
I have a geom_area plot (image omitted).
The x-axis is a time series, and I want to color the fill of each facet by groups of the variable "estacion" (seasons of the year). Here's a sample of my data:
año censo estacion tipoEuro censEu censTot pCensEu
2010 2010-01-01 Invierno HA frisona 13 32 40.62500
2010 2010-01-01 Invierno Bovinos jovenes 10 32 31.25000
2010 2010-01-02 Invierno HA frisona 13 32 40.62500
---
2014 2014-12-30 Invierno Bovinos jovenes 15 26 57.69231
2014 2014-12-31 Invierno HA frisona 3 26 11.53846
2014 2014-12-31 Invierno Terneros 8 26 30.76923
Here's the code I'm using to make the plot:
ggplot(censTot1,aes(x=censo, y=pCensEu,group=tipoEuro)) +
geom_area() +
geom_line()+ facet_grid(tipoEuro ~ .)
and this is the code I intend to use, and the error generated:
ggplot(censTot1,aes(x=censo,y=pCensEu,group=tipoEuro,fill=estacion)) +
geom_area() +
geom_line()+ facet_grid(tipoEuro ~ .)
Error: Aesthetics can not vary with a ribbon
I'm not sure about the desired output. Would this solve your problem?
library(ggplot2)
ggplot(df, aes(x=censo, y=pCensEu,group=tipoEuro))+
geom_area(aes(fill=estacion))+
geom_line()+
facet_grid(tipoEuro ~ .)
The data used
df <- structure(list(año = c(2010L, 2010L, 2010L, 2014L, 2014L, 2014L
), censo = structure(c(1L, 1L, 2L, 3L, 4L, 4L), .Label = c("01/01/2010",
"02/01/2010", "30/12/2014", "31/12/2014"), class = "factor"),
estacion = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "Invierno", class = "factor"),
tipoEuro = structure(c(2L, 1L, 2L, 1L, 2L, 3L), .Label = c("Bovinos jovenes",
"HA frisona", "Terneros"), class = "factor"), censEu = c(13L,
10L, 13L, 15L, 3L, 8L), censTot = c(32L, 32L, 32L, 26L, 26L,
26L), pCensEu = c(40.625, 31.25, 40.625, 57.69231, 11.53846,
30.76923)), .Names = c("año", "censo", "estacion", "tipoEuro",
"censEu", "censTot", "pCensEu"), class = "data.frame", row.names = c(NA,
-6L))
You can also try this: adding estacion to the group aesthetic via interaction() keeps the color constant within each ribbon, which avoids the "Aesthetics can not vary with a ribbon" error:
ggplot(df, aes(x=censo, y=pCensEu, color=estacion, group=interaction(tipoEuro,estacion))) +
geom_area() +
geom_line()+ facet_grid(tipoEuro ~ .)