Related
I'm new to RStudio and have a question.
I want to make a stacked bar chart of species composition at 10 sites and add the counts inside the bars.
The result like this.
I want to put the frequency (count) of each species inside its segment of the bar. I have tried adding geom_text, but the result is not what I want.
I hope there's an answer to fix this. Thank you so much.
Here is my data, also the coding that I run in R.
# Counts with one named column per site (10) and one row per species code (12).
# Built as a data frame so each column is named, then converted to a numeric
# matrix, which is what barplot() needs to draw stacked bars.
data <- as.matrix(data.frame(
  Bng = c(0, 0, 0, 41, 0, 9, 6, 25, 11, 2, 5, 7),
  Krs = c(0, 25, 0, 82, 0, 0, 0, 0, 23, 0, 0, 0),
  Bny = c(0, 0, 0, 0, 0, 0, 0, 23, 16, 0, 10, 0),
  Kmb = c(1, 0, 0, 0, 20, 0, 0, 25, 8, 1, 0, 0),
  Sgk = c(0, 0, 0, 18, 0, 2, 0, 11, 0, 0, 0, 0),
  Lwb = c(1, 0, 2, 73, 0, 5, 0, 7, 5, 0, 0, 0),
  Lws = c(0, 0, 0, 4, 0, 0, 0, 4, 0, 4, 1, 0),
  Krp = c(0, 0, 0, 115, 0, 0, 2, 0, 2, 0, 0, 0),
  Hrt = c(4, 0, 0, 0, 2, 22, 0, 7, 4, 2, 3, 0),
  Gmb = c(0, 2, 0, 42, 2, 0, 0, 1, 6, 4, 3, 0)
))
rownames(data) <- c(
  "Cbr", "Csx", "Rax", "Hdd", "Hlv", "Mst",
  "Mps", "Mbr", "Rfs", "Rbn", "Rct", "Rps"
)
data

# Stacked bars of the raw counts, one bar per column.
barplot(data)

# Stacked bars of proportions: margin 2 rescales each column to sum to 1.
barplot(prop.table(data, 2))
library(reshape2)
library(ggplot2)

# Reshape the matrix to long format: one row per (row name, column) pair,
# with the original row name carried along in `subgroup`.
data_long <- as.data.frame(data)
data_long[["subgroup"]] <- rownames(data_long)
data_long <- melt(data_long, id.vars = "subgroup")

# 100% stacked bars: position = "fill" rescales every stack to a height of 1.
ggp <- ggplot(
  data = data_long,
  mapping = aes(x = variable, y = value, fill = subgroup)
) +
  geom_bar(stat = "identity", position = "fill") +
  theme_bw() +
  scale_fill_grey()
ggp

# The same chart with the y axis labelled as percentages.
ggp + scale_y_continuous(labels = scales::percent_format())
You may try
library(dplyr)

# 100% stacked bars with the raw count printed in the middle of each segment.
# The counts are left untouched: position = "fill" already rescales every
# stack to 1, so rescaling `value` beforehand would change the composition
# that is drawn. Zero counts are dropped so they do not produce "0" labels on
# zero-height segments.
data_long %>%
  filter(value != 0) %>%
  ggplot(aes(x = variable, y = value, fill = subgroup)) +
  geom_bar(position = "fill", stat = "identity") +
  theme_bw() +
  scale_fill_grey() +
  scale_y_continuous(labels = scales::percent_format()) +
  # position_fill(vjust = 0.5) stacks the labels like the bars and centres
  # each one vertically inside its segment.
  geom_text(aes(label = value), position = position_fill(vjust = 0.5))
The first five entries (out of twenty) of my dataset:
>head(data)
Name SDC
<chr> <Period>
1 Feuerman 1M 37S
2 Solis 1M 52S
3 Osborne 1M 47S
4 Frizzell 1M 58S
5 Moran 1M 59S
Also:
> dput(head(data))
structure(list(Name = c("Feuerman", "Solis", "Osborne", "Frizzell",
"Moran", "Seth"), Deadlift = c(320, 250, 340, 250, 250, 200),
Medicine_Ball = c(11.6, 8.8, 12.5, 9.2, 9.7, 9.1), HRP = c(46,
39, 36, 33, 42, 31), SDC = new("Period", .Data = c(37, 52,
47, 58, 59, 15), year = c(0, 0, 0, 0, 0, 0), month = c(0,
0, 0, 0, 0, 0), day = c(0, 0, 0, 0, 0, 0), hour = c(0, 0,
0, 0, 0, 0), minute = c(1, 1, 1, 1, 1, 2)), Leg_Tuck = c(20,
13, 4, 10, 13, 13), Run = new("Period", .Data = c(48, 59,
10, 53, 0, 29), year = c(0, 0, 0, 0, 0, 0), month = c(0,
0, 0, 0, 0, 0), day = c(0, 0, 0, 0, 0, 0), hour = c(0, 0,
0, 0, 0, 0), minute = c(13, 12, 17, 16, 0, 16)), Total = c(570,
508, 513, 470, 410, 452), Pass_Fail = structure(c(1L, 1L,
2L, 1L, 2L, 1L), .Label = c("Pass", "Fail"), class = "factor"),
Date = structure(c(18522, 18522, 18522, 18522, 18522, 18522
), class = "Date")), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
As you can see, SDC is in minutes:seconds format. I achieved this using ms(data$SDC) to change the column type. I am trying to create a plot using geom_col that orders SDC from lowest to highest times. I am facing two problems:
When using the reorder command, the times are not properly reordered (see plot below).
The axes are being formatted by hour:minute:second but I want it to be formatted in only minute:second format (also see plot below).
Here is my code to generate the plot:
# Horizontal bar chart of SDC time per Name, filled by Pass_Fail and
# annotated with each SDC value just past the end of the bar.
ggplot(data, aes(x = reorder(Name, -SDC), y = SDC, fill = Pass_Fail)) +
  geom_col() +
  geom_text(aes(label = SDC), size = 3, hjust = -0.04) +
  scale_y_time(limits = c(0, 200)) +
  scale_fill_manual(values = c("#00BFC4", "#F8766D")) +
  labs(
    title = "Sprint Drag Carry Scores",
    x = "Soldier",
    y = "Sprint Drag Carry Time",
    fill = "Passed/Failed ACFT"
  ) +
  coord_flip() +
  theme_classic()
It produces the following plot:
As you can see, the reordering is incorrect and the axes are not formatted the way I want them to be. Thanks in advance for your help.
I think reorder has trouble working with Period objects. We can arrange the factor levels according to the value of SDC to get the bars in increasing order.
We can pass custom function for y-axis to get only minutes and seconds in labels.
library(tidyverse)

# Order the bars by total elapsed seconds. A lubridate Period stores minutes
# and seconds in separate slots, so the ordering is done on an explicit
# numeric column rather than on the Period object itself.
data %>%
  mutate(sdc_seconds = lubridate::period_to_seconds(SDC)) %>%
  arrange(sdc_seconds) %>%
  # Freeze the sorted order in the factor levels so ggplot keeps it.
  mutate(Name = factor(Name, levels = unique(Name))) %>%
  ggplot() + aes(x = Name, y = SDC, fill = Pass_Fail) +
  scale_y_time(
    limits = c(0, 200),
    # Label the breaks as zero-padded MM:SS. "%02d" is used on integers
    # because the "0" flag is only defined for numbers; with "%s" its effect
    # is platform-dependent.
    labels = function(x) {
      secs <- as.numeric(x)
      sprintf("%02d:%02d", as.integer(secs %/% 60), as.integer(secs %% 60))
    }
  ) +
  scale_fill_manual(values = c("#00BFC4", "#F8766D")) +
  labs(
    x = "Soldier", y = "Sprint Drag Carry Time",
    fill = "Passed/Failed ACFT", title = "Sprint Drag Carry Scores"
  ) +
  geom_col() +
  geom_text(size = 3, aes(label = SDC), hjust = -0.04) +
  coord_flip() +
  theme_classic()
I'm trying to use ggplot, and am hoping to create a boxplot that has four categories on the x axis for suspension data (low, lowish, highish, high) and farms on the y-axis.
I think I have broken the suspension column into four groups, but ggplot is upset with me. Here is the error:
```
Error in if (is.double(data$x) && !has_groups(data) && any(data$x != data$x[1L])) { : missing value where TRUE/FALSE needed
```
Here is my code:
```{r}
# Goal: split suspension_rate_total_pct into groups for a clearer plot.
# Start by coercing the column to numeric and printing its max and min.
merged_data[["suspension_rate_total_pct"]] <-
  as.numeric(merged_data[["suspension_rate_total_pct"]])
max(merged_data[["suspension_rate_total_pct"]], na.rm = TRUE)
min(merged_data[["suspension_rate_total_pct"]], na.rm = TRUE)

# One logical vector per band (open intervals on both ends). These are
# stand-alone vectors; the plot below does not reference them.
low_suspension <- with(
  merged_data,
  suspension_rate_total_pct > 0 & suspension_rate_total_pct < 0.5
)
low_ish_suspension <- with(
  merged_data,
  suspension_rate_total_pct > 0.5 & suspension_rate_total_pct < 1
)
high_ish_suspension <- with(
  merged_data,
  suspension_rate_total_pct > 1 & suspension_rate_total_pct < 1.5
)
high_suspension <- with(
  merged_data,
  suspension_rate_total_pct > 1.5 & suspension_rate_total_pct < 2
)

# Boxplot with the raw numeric rate on x and farms_pct on y.
ggplot(
  data = merged_data,
  mapping = aes(x = suspension_rate_total_pct, y = farms_pct)
) +
  geom_boxplot()
```
Here is the Data:
# Six-row sample of merged_data as a tibble (dput output): one row per schid
# with count columns, suspension_* columns and derived *_pct columns.
# The first row is NA in every suspension_* column.
merged_data <- structure(list(schid = c("1030642", "1030766", "1030774", "1030840",
"1130103", "1230150"), enrollment = c(159, 333, 352, 430, 102,
193), farms = c(132, 116, 348, 406, 68, 130), foster = c(2, 0,
1, 8, 1, 4), homeless = c(14, 0, 8, 4, 1, 4), migrant = c(0,
0, 0, 0, 0, 0), ell = c(18, 12, 114, 45, 7, 4), suspension_rate_total = c(NA,
20, 0, 0, 95, 5), suspension_violent = c(NA, 9, 0, 0, 20, 2),
suspension_violent_no_injury = c(NA, 6, 0, 0, 47, 1), suspension_weapon = c(NA,
0, 0, 0, 8, 0), suspension_drug = c(NA, 0, 0, 0, 9, 1), suspension_defiance = c(NA,
1, 0, 0, 9, 1), suspension_other = c(NA, 4, 0, 0, 2, 0),
farms_pct = c(0.830188679245283, 0.348348348348348, 0.988636363636364,
0.944186046511628, 0.666666666666667, 0.673575129533679),
foster_pct = c(0.0125786163522013, 0, 0.00284090909090909,
0.0186046511627907, 0.00980392156862745, 0.0207253886010363
), migrant_pct = c(0, 0, 0, 0, 0, 0), ell_pct = c(0.113207547169811,
0.036036036036036, 0.323863636363636, 0.104651162790698,
0.0686274509803922, 0.0207253886010363), homeless_pct = c(0.0880503144654088,
0, 0.0227272727272727, 0.00930232558139535, 0.00980392156862745,
0.0207253886010363), suspension_rate_total_pct = c(NA, 2,
1, 1, 2, 2)), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
If you can, please help me appease ggplot so that it will give me with beautiful visualization. Currently, this feels like a one-sided, emotional rollercoaster of a relationship.
Just a short answer; I am sure you can figure out the rest by yourself (otherwise, post a follow-up question).
Since the data you provided has some NAs in the first row in several columns, I can only demonstrate the principle of how to get your desired result, using the merged_data$homeless value as the grouping input for the boxplots; the data (y-value) will still be farms.
# First create the groups of low, middle & high homeless counts.
# The nested ifelse() is evaluated element-wise; an NA count stays NA.
# Explicit factor levels keep the groups in low -> middle -> high order
# instead of the alphabetical order a character column would be plotted in.
merged_data2 <- merged_data %>%
  mutate(
    homelessgroup = factor(
      ifelse(homeless < 4, "low", ifelse(homeless <= 8, "middle", "high")),
      levels = c("low", "middle", "high")
    )
  )

## Then plot farms per group; mapping the group to x gives every box its own
## labelled position on the axis.
ggplot(merged_data2, aes(x = homelessgroup, y = farms, fill = homelessgroup)) +
  geom_boxplot()
I think you can just use cut() with your data to partition into 4 groups. Then you can use that variable with the plot
# Bin suspension_rate_total_pct into four labelled intervals with edges at
# 0, 0.5, 1, 1.5 and 2; include.lowest = TRUE puts a value of exactly 0 in
# the first bin.
merged_data <- transform(
  merged_data,
  group = cut(
    x = suspension_rate_total_pct,
    breaks = seq(0, 2, by = 0.5),
    labels = c("low", "lowish", "highish", "high"),
    include.lowest = TRUE
  )
)

# One box of farms_pct per suspension group.
ggplot(data = merged_data, mapping = aes(x = group, y = farms_pct)) +
  geom_boxplot()
I am trying to display a data frame using kable function. The dataframe consists of 29 columns but only 9 columns are displayed and the remaining columns are repeated.
The dataframe used is
structure(list(Name = c("Grand Total", "B", "C", "D", "E", "F"
), GrandTotal = c(3416, 297, 410, 326, 125, 29), English = c(1096,
18, 64, 0, 55, 0), Science = c(211, 5, 39, 0, 55, 0), Language = c(149,
5, 0, 0, 10, 0), Maths = c(22, 0, 0, 0, 0, 0), Social = c(0,
0, 0, 0, 0, 0), English = c(211, 5, 39, 0, 55, 0), Science = c(149,
5, 0, 0, 10, 0), Maths = c(0, 0, 0, 0, 0, 0), Social = c(22,
0, 0, 0, 0, 0), English = c(1096, 18, 64, 0, 55, 0), Science = c(211,
5, 39, 0, 55, 0), Language = c(149, 5, 0, 0, 10, 0), Maths = c(22,
0, 0, 0, 0, 0), Social = c(0, 0, 0, 0, 0, 0), English = c(211,
5, 39, 0, 55, 0), Science = c(149, 5, 0, 0, 10, 0), ACIntern = c(0,
0, 0, 0, 0, 0), PAM = c(22, 0, 0, 0, 0, 0), Maths = c(1096, 18,
64, 0, 55, 0), Social = c(211, 5, 39, 0, 55, 0), English = c(149,
5, 0, 0, 10, 0), Science = c(22, 0, 0, 0, 0, 0), Language = c(0,
0, 0, 0, 0, 0), Maths = c(211, 5, 39, 0, 55, 0), Social = c(149,
5, 0, 0, 10, 0), English = c(0, 0, 0, 0, 0, 0), Science = c(22,
0, 0, 0, 0, 0)), row.names = c(NA, 6L), class = "data.frame")
The code used for displaying the data frame as a table format is as follows
monthSelected <- c("April", "May", "June")

# Header labels of the form "<month> <year>"; `yearSelected` must already be
# defined before this point.
month1 <- paste(monthSelected[1], yearSelected)
month2 <- paste(monthSelected[2], yearSelected)
month3 <- paste(monthSelected[3], yearSelected)

# Top header row: 2 unlabelled columns followed by 9 columns per month.
# The names are set once, directly from the month labels.
myHeader <- setNames(c(2, 9, 9, 9), c(" ", month1, month2, month3))

# seq_len() is used instead of 1:ncol(df) so a zero-column df does not
# produce the index c(1, 0).
kable(df[seq_len(ncol(df))], "html") %>%
  kable_styling(c("striped", "bordered")) %>%
  # Second header row: 2 + (5 + 4) * 3 columns.
  add_header_above(c(
    " " = 2,
    "IND" = 5, "US" = 4,
    "IND" = 5, "US" = 4,
    "IND" = 5, "US" = 4
  )) %>%
  add_header_above(header = myHeader)
The output displayed is as follows
I can't figure out where I went wrong. Can anyone help me out with this issue?
In addition to it, is it possible to freeze first two columns when the table is scrolled horizontally?
Thanks in advance!!
I've created a decision tree using rpart and the code below:
# The response is coded 0/1 (numeric), so request a classification tree
# explicitly; predict(..., type = "class") is only valid for a
# classification tree.
res.tree <- rpart(myformula, data = credit_train, method = "class")
my data has been subset into 2 parts. The training part at 70% and a testing part at 30%.
This part works well and my tree is created. Where I'm getting stuck is with the prediction so that I can calculate my confusion matrix and ROC curves.
I'm using this code: tree_pred = predict(res.tree, credit_test, type = "class")
but I get this message:
Error in predict.rpart(res.tree, credit_test, type = "class") : Invalid prediction for "rpart" object
In addition:
Warning message:
'newdata' had 271 rows but variables found have 729 rows
I can't figure out whether I'm missing a library or what is causing it not to recognize the type (which so many resources say I need to use), or why I'm getting a mismatch in the rows.
The 'newdata' at 271 rows is what my testing data set has and my training data-set has 729 rows.
Is the decision tree creation causing my problem or could it be the prediction code?
Responding to comments:
I'm using the following libraries:
library(readxl)
library(dplyr)
library(factoextra)
library(corrplot)
library(rpart)
library(rpart.plot)
library(RColorBrewer)
library(pROC)
library(Hmisc)
library(fBasics)
library(rattle)
library(caret)
A sample of my data:
structure(list(CHK_ACCT = c(0, 1, 0, 0), DURATION = c(6, 48,
42, 24), HISTORY = c(4, 2, 2, 3), NEW_CAR = c(0, 0, 0, 1), USED_CAR = c(0,
0, 0, 0), FURNITURE = c(0, 0, 1, 0), `RADIO/TV` = c(1, 1, 0,
0), EDUCATION = c(0, 0, 0, 0), RETRAINING = c(0, 0, 0, 0), AMOUNT = c(1169,
5951, 7882, 4870), SAV_ACCT = c(4, 0, 0, 0), EMPLOYMENT = c(4,
2, 3, 2), INSTALL_RATE = c(4, 2, 2, 3), MALE_DIV = c(0, 0, 0,
0), MALE_SINGLE = c(1, 0, 1, 1), MALE_MAR_or_WID = c(0, 0, 0,
0), `CO-APPLICANT` = c(0, 0, 0, 0), GUARANTOR = c(0, 0, 1, 0),
PRESENT_RESIDENT = c(4, 2, 4, 4), REAL_ESTATE = c(1, 1, 0,
0), PROP_UNKN_NONE = c(0, 0, 0, 1), AGE = c(67, 22, 45, 53
), OTHER_INSTALL = c(0, 0, 0, 0), RENT = c(0, 0, 0, 0), OWN_RES = c(1,
1, 0, 0), NUM_CREDITS = c(2, 1, 1, 2), JOB = c(2, 2, 2, 2
), NUM_DEPENDENTS = c(1, 1, 2, 2), TELEPHONE = c(1, 0, 0,
0), FOREIGN = c(0, 0, 0, 0), DEFAULT = c(0, 1, 0, 1), CHK_ACCT_rec = c(1,
2, 1, 1), SAV_ACCT_rec = c(0, 1, 1, 1)), .Names = c("CHK_ACCT",
"DURATION", "HISTORY", "NEW_CAR", "USED_CAR", "FURNITURE", "RADIO/TV",
"EDUCATION", "RETRAINING", "AMOUNT", "SAV_ACCT", "EMPLOYMENT",
"INSTALL_RATE", "MALE_DIV", "MALE_SINGLE", "MALE_MAR_or_WID",
"CO-APPLICANT", "GUARANTOR", "PRESENT_RESIDENT", "REAL_ESTATE",
"PROP_UNKN_NONE", "AGE", "OTHER_INSTALL", "RENT", "OWN_RES",
"NUM_CREDITS", "JOB", "NUM_DEPENDENTS", "TELEPHONE", "FOREIGN",
"DEFAULT", "CHK_ACCT_rec", "SAV_ACCT_rec"), row.names = c(NA,
-4L), class = c("tbl_df", "tbl", "data.frame"))
# Model formula for the credit-default tree.
# Variables are written as bare column names (not credit_train$...), so
# rpart() and predict() look them up in whichever data frame they are given.
# With credit_train$ prefixes, predict() always reads the training columns
# and ignores `newdata`. Names that are not syntactic are back-quoted.
# Every continuation line ends with `+`: a line that starts with `+` is
# parsed as a separate statement, which would end the formula early.
myformula <- DEFAULT ~ CHK_ACCT_rec + DURATION + HISTORY + NEW_CAR +
  USED_CAR + FURNITURE + `RADIO/TV` + EDUCATION + RETRAINING + AMOUNT +
  SAV_ACCT_rec + EMPLOYMENT + INSTALL_RATE + MALE_DIV + MALE_SINGLE +
  MALE_MAR_or_WID + `CO-APPLICANT` + GUARANTOR + PRESENT_RESIDENT +
  REAL_ESTATE + PROP_UNKN_NONE + AGE + OTHER_INSTALL + RENT + OWN_RES +
  NUM_CREDITS + JOB + NUM_DEPENDENTS + TELEPHONE + FOREIGN
#calimo I hope this is what you needed.