Format numbers on axes and tooltips in ggplotly - r

I have used sprintf and formatC to take a double value and round it to two decimal places. However, when I use the result in ggplot and ggplotly, my visuals misbehave.
Dput:
structure(list(Date = structure(c(18328, 18329, 18330, 18331,
18332, 18333), class = "Date"), State = c("Louisiana", "Louisiana",
"Louisiana", "Louisiana", "Louisiana", "Louisiana"), variablename1 = c(0,
0, 1, 1, 6, 14), variablename2 = c(5, 5, 5, 11, 37, 37), death = c(0,
0, 0, 0, 0, 0), variablename3 = c(5, 5, 6, 12, 43, 51), variablename4 = c(0,
0, 0, 0, 0, 0), variablename5 = c(0, 0, 0, 0, 0, 0), variablename6 = c(0,
0, 0, 6, 26, 0), variablename7 = c(0, 0, 1, 0, 5, 8), variablename8 = c(0,
5, 1, 6, 31, 8), Percent = c(0, 0, 16.6666666666667, 8.33333333333333,
13.953488372093, 27.4509803921569)), row.names = c(NA, -6L), groups = structure(list(
State = "Louisiana", .rows = list(1:6)), row.names = c(NA,
-1L), class = c("tbl_df", "tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
Edit: In my quest of finding tricks, I have found the 'non-short' way of doing things more efficient. Just use round(variable, 2) to round a value to the second decimal place and use that. For now.

This can be easily achieved via scales::percent or scales::percent_format. As an example I made a line plot of Percent by Date where I use scales::percent_format to format y-axis labels and scales::percent to format the percent value in the tooltip:
library(ggplot2)
library(plotly)

# Line plot of Percent by Date, one line per State.
# scale = 1 is passed so the Percent values are not multiplied by 100 before
# formatting; accuracy = .01 rounds the labels to two decimal places.
p <- ggplot(
  ungroup(df),
  aes(
    x = Date, y = Percent, color = State, group = State,
    # Custom hover text: one "<br>"-separated line per field.
    text = sprintf(
      "State: %s<br>Date: %s<br>Percent: %s",
      State, Date, scales::percent(Percent, scale = 1, accuracy = .01)
    )
  )
) +
  geom_line() +
  scale_y_continuous(
    labels = scales::percent_format(scale = 1, accuracy = .01)
  )

# Restrict the tooltip to the custom text aesthetic built above.
ggplotly(p, tooltip = 'text')

Related

ggplot2 Reorder Data that is in minutes:seconds Format

The first five entries (out of twenty) of my dataset:
>head(data)
Name SDC
<chr> <Period>
1 Feuerman 1M 37S
2 Solis 1M 52S
3 Osborne 1M 47S
4 Frizzell 1M 58S
5 Moran 1M 59S
Also:
> dput(head(data))
structure(list(Name = c("Feuerman", "Solis", "Osborne", "Frizzell",
"Moran", "Seth"), Deadlift = c(320, 250, 340, 250, 250, 200),
Medicine_Ball = c(11.6, 8.8, 12.5, 9.2, 9.7, 9.1), HRP = c(46,
39, 36, 33, 42, 31), SDC = new("Period", .Data = c(37, 52,
47, 58, 59, 15), year = c(0, 0, 0, 0, 0, 0), month = c(0,
0, 0, 0, 0, 0), day = c(0, 0, 0, 0, 0, 0), hour = c(0, 0,
0, 0, 0, 0), minute = c(1, 1, 1, 1, 1, 2)), Leg_Tuck = c(20,
13, 4, 10, 13, 13), Run = new("Period", .Data = c(48, 59,
10, 53, 0, 29), year = c(0, 0, 0, 0, 0, 0), month = c(0,
0, 0, 0, 0, 0), day = c(0, 0, 0, 0, 0, 0), hour = c(0, 0,
0, 0, 0, 0), minute = c(13, 12, 17, 16, 0, 16)), Total = c(570,
508, 513, 470, 410, 452), Pass_Fail = structure(c(1L, 1L,
2L, 1L, 2L, 1L), .Label = c("Pass", "Fail"), class = "factor"),
Date = structure(c(18522, 18522, 18522, 18522, 18522, 18522
), class = "Date")), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
As you can see, SDC is in minutes:seconds format. I achieved this using ms(data$SDC) to change the column type. I am trying to create a plot using geom_col that orders SDC from lowest to highest times. I am facing two problems:
When using the reorder command, the times are not properly reordered (see plot below).
The axes are being formatted by hour:minute:second but I want it to be formatted in only minute:second format (also see plot below).
Here is my code to generate the plot:
# Bar chart of Sprint Drag Carry (SDC) time per soldier, filled by pass/fail
# status and flipped so the names run down the vertical axis.
# NOTE(review): reorder() is given -SDC, where SDC is a lubridate Period; the
# reported mis-ordering presumably comes from how Period values are
# compared/summarised -- confirm before relying on this ordering.
ggplot(data=data,
aes(x=reorder(Name, -SDC), y=SDC, fill=Pass_Fail)) +
# Time scale with fixed limits; no custom labels are supplied here, so the
# scale's default label format is used.
scale_y_time(limits=c(0,200)) +
# Manual fill colours, matched to the levels of Pass_Fail in order.
scale_fill_manual(values=c('#00BFC4', '#F8766D')) +
labs(x='Soldier', y='Sprint Drag Carry Time', fill='Passed/Failed ACFT', title='Sprint Drag Carry Scores') +
geom_col() +
# Print each SDC value as text next to its bar.
geom_text(size=3, aes(label = SDC), hjust=-0.04) +
coord_flip() +
theme_classic()
It produces the following plot:
As you can see, the reordering is incorrect and the axes are not formatted the way I want them to be. Thanks in advance for your help.
I think reorder() has trouble working with a Period object. Instead, we can arrange the factor levels according to the value of SDC to get the bars in increasing order.
We can pass a custom function for the y-axis labels to show only minutes and seconds.
library(tidyverse)
library(lubridate) # minute(), second(), period_to_seconds(); not attached by tidyverse < 2.0.0

# Horizontal bar chart of Sprint Drag Carry time per soldier, ordered from the
# shortest to the longest time, with mm:ss axis labels.
data %>%
  # Sort by total seconds so the order does not depend on how a Period is
  # compared, then freeze that row order into the factor levels of Name.
  arrange(period_to_seconds(SDC)) %>%
  mutate(Name = factor(Name, levels = unique(Name))) %>%
  ggplot() +
  aes(x = Name, y = SDC, fill = Pass_Fail) +
  scale_y_time(
    limits = c(0, 200),
    # Zero-padded "mm:ss" labels. %02d on integers is used because the effect
    # of the 0 flag on %s is platform dependent.
    labels = function(x) {
      sprintf("%02d:%02d", as.integer(minute(x)), as.integer(second(x)))
    }
  ) +
  scale_fill_manual(values = c('#00BFC4', '#F8766D')) +
  labs(
    x = 'Soldier', y = 'Sprint Drag Carry Time',
    fill = 'Passed/Failed ACFT', title = 'Sprint Drag Carry Scores'
  ) +
  geom_col() +
  # Print each SDC value as text next to its bar.
  geom_text(size = 3, aes(label = SDC), hjust = -0.04) +
  coord_flip() +
  theme_classic()

Error in if (is.double(data$x) && !has_groups(data) && any(data$x != data$x[1L])) { : missing value where TRUE/FALSE needed

I'm trying to use ggplot, and am hoping to create a boxplot that has four categories on the x axis for suspension data (low, lowish, highish, high) and farms on the y-axis.
I have I think broken the suspension column into four groups. But ggplot is upset with me. Here is the error:
```
Error in if (is.double(data$x) && !has_groups(data) && any(data$x != data$x[1L])) { : missing value where TRUE/FALSE needed
```
Here is my code:
```{r}
# Goal: break suspension_rate_total_pct into groups for clearer visualization.
# First coerce the column to numeric and inspect its range (ignoring NAs).
merged_data$suspension_rate_total_pct <-
as.numeric(merged_data$suspension_rate_total_pct)
max(merged_data$suspension_rate_total_pct, na.rm=TRUE)
min(merged_data$suspension_rate_total_pct, na.rm=TRUE)
# Four logical masks, one per intended group. Both bounds are strict, so rows
# equal to exactly 0, 0.5, 1, 1.5 or 2 fall into none of the groups, and rows
# with NA yield NA in every mask.
low_suspension <- merged_data$suspension_rate_total_pct > 0 & merged_data$suspension_rate_total_pct < 0.5
low_ish_suspension <- merged_data$suspension_rate_total_pct > 0.5 & merged_data$suspension_rate_total_pct < 1
high_ish_suspension <- merged_data$suspension_rate_total_pct > 1 & merged_data$suspension_rate_total_pct < 1.5
high_suspension <- merged_data$suspension_rate_total_pct > 1.5 & merged_data$suspension_rate_total_pct < 2
# NOTE(review): the masks above are never used by the plot; x is still the raw
# numeric column (which contains NA), not a four-level grouping variable.
ggplot(merged_data, aes(x = suspension_rate_total_pct , y = farms_pct)) +
geom_boxplot()
```
Here is the Data:
merged_data <- structure(list(schid = c("1030642", "1030766", "1030774", "1030840",
"1130103", "1230150"), enrollment = c(159, 333, 352, 430, 102,
193), farms = c(132, 116, 348, 406, 68, 130), foster = c(2, 0,
1, 8, 1, 4), homeless = c(14, 0, 8, 4, 1, 4), migrant = c(0,
0, 0, 0, 0, 0), ell = c(18, 12, 114, 45, 7, 4), suspension_rate_total = c(NA,
20, 0, 0, 95, 5), suspension_violent = c(NA, 9, 0, 0, 20, 2),
suspension_violent_no_injury = c(NA, 6, 0, 0, 47, 1), suspension_weapon = c(NA,
0, 0, 0, 8, 0), suspension_drug = c(NA, 0, 0, 0, 9, 1), suspension_defiance = c(NA,
1, 0, 0, 9, 1), suspension_other = c(NA, 4, 0, 0, 2, 0),
farms_pct = c(0.830188679245283, 0.348348348348348, 0.988636363636364,
0.944186046511628, 0.666666666666667, 0.673575129533679),
foster_pct = c(0.0125786163522013, 0, 0.00284090909090909,
0.0186046511627907, 0.00980392156862745, 0.0207253886010363
), migrant_pct = c(0, 0, 0, 0, 0, 0), ell_pct = c(0.113207547169811,
0.036036036036036, 0.323863636363636, 0.104651162790698,
0.0686274509803922, 0.0207253886010363), homeless_pct = c(0.0880503144654088,
0, 0.0227272727272727, 0.00930232558139535, 0.00980392156862745,
0.0207253886010363), suspension_rate_total_pct = c(NA, 2,
1, 1, 2, 2)), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
If you can, please help me appease ggplot so that it will give me a beautiful visualization. Currently, this feels like a one-sided, emotional rollercoaster of a relationship.
Just a short answer; I am sure you can figure out the rest by yourself (otherwise, post a follow-up question).
Since the data you provided has NAs in the first row of several columns, I can only demonstrate the principle of how to get your desired result, using the merged_data$homeless value as the grouping input for the boxplots; the data (y-value) will still be farms.
# Bucket each row by its homeless count:
#   below 4 -> "low", 4 up to 8 -> "middle", above 8 -> "high"
# (rows where homeless is NA stay NA).
merged_data2 <- mutate(
  merged_data,
  homelessgroup = ifelse(
    homeless < 4,
    "low",
    ifelse(homeless <= 8, "middle", ifelse(homeless > 8, "high", NA))
  )
)

# One box of farms values per homeless group, distinguished by fill colour.
ggplot(merged_data2, aes(y = farms, fill = homelessgroup)) +
  geom_boxplot()
I think you can just use cut() with your data to partition into 4 groups. Then you can use that variable with the plot
# Bin the suspension rate into four labelled intervals:
# [0, 0.5], (0.5, 1], (1, 1.5], (1.5, 2]; include.lowest keeps an exact 0
# inside the first bin instead of turning it into NA.
merged_data <- transform(
  merged_data,
  group = cut(
    x = suspension_rate_total_pct,
    breaks = c(0, .5, 1, 1.5, 2),
    labels = c("low", "lowish", "highish", "high"),
    include.lowest = TRUE
  )
)

# The binned factor goes on the x axis, giving one box per group.
ggplot(data = merged_data, mapping = aes(x = group, y = farms_pct)) +
  geom_boxplot()

Ggplot2: plot avgSessionDuration from GA as a bar plot? Formal class 'Period' [package "lubridate"]

I've a df with months and Avg. Session Duration from Google Analytics.
Definition of Avg. Session Duration: The average duration (in seconds) of users' sessions.
This column I've converted from seconds to minutes using lubridate:
library(lubridate)
df$avgSessionDuration <- seconds_to_period(df$avgSessionDuration)
But when I try to plot this as a simple bar chart, I'm getting an empty plot:
ggplot(df, aes(x=mes, y = avgSessionDuration, label = avgSessionDuration)) +
geom_bar(stat = "identity", position = "dodge")
dput:
structure(list(mes = c("Apr", "Aug", "Dec", "Feb", "Jan", "Jul",
"Jun", "Mar", "May", "Nov", "Oct", "Sep"), avgSessionDuration = new("Period",
.Data = c(40.3422263169386, 15.0267821554301, 58.2952632067657,
23.8959081801318, 48.6127812066904, 52.4427333243062, 28.2065306406772,
7.08722136893016, 53.9021568123918, 6.21260764666937, 45.1247832993034,
35.057865957604), year = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0), month = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), day = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), hour = c(0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0), minute = c(6, 7, 6, 11, 12, 6, 7, 7,
6, 8, 7, 7))), row.names = c(NA, -12L), totals = list(list(
sessions = "52429423", sessionDuration = "2.6206981539E10",
avgSessionDuration = "499.8525644464941")), minimums = list(
list(sessions = "45875", sessionDuration = "1.6438276E7",
avgSessionDuration = "264.72602881964076")), maximums = list(
list(sessions = "932025", sessionDuration = "6.17676652E8",
avgSessionDuration = "981.1245110604924")), isDataGolden = TRUE, rowCount = 365L, class = c("tbl_df",
"tbl", "data.frame"))
You can use the following code
library(tidyverse)
library(lubridate) # period_to_seconds(); not attached by tidyverse < 2.0.0

# Bar chart of the average session duration per month.
# The Period column is converted to a plain number of minutes so that ggplot
# gets a continuous y value; splitting the Period into text pieces instead
# would put strings such as "6M" on the axis and drop the seconds.
df %>%
  mutate(avg_minutes = period_to_seconds(avgSessionDuration) / 60) %>%
  ggplot(aes(x = mes, y = avg_minutes)) +
  geom_col() +
  labs(y = "Avg. session duration (minutes)")

Decision tree and error matrix calculations

I've created a decision tree using rpart and the code below:
res.tree <- rpart(myformula, data = credit_train)
my data has been subset into 2 parts. The training part at 70% and a testing part at 30%.
This part works well and my tree is created. Where I'm getting stuck is with the prediction so that I can calculate my confusion matrix and ROC curves.
I'm using this code tree_pred = predict(res.tree, credit_test, type = "class")
but I get this message:
Error in predict.rpart(res.tree, credit_test, type = "class") : Invalid prediction for "rpart" object
In addition:
Warning message:
'newdata' had 271 rows but variables found have 729 rows
I can't figure out whether I'm missing a library, what is causing it not to recognize the type (which so many resources say I need to use), or why I'm getting a mismatch in the rows.
The 'newdata' at 271 rows is what my testing data set has and my training data-set has 729 rows.
Is the decision tree creation causing my problem or could it be the prediction code?
Responding to comments:
I'm using the following libraries:
library(readxl)
library(dplyr)
library(factoextra)
library(corrplot)
library(rpart)
library(rpart.plot)
library(RColorBrewer)
library(pROC)
library(Hmisc)
library(fBasics)
library(rattle)
library(caret)
A sample of my data:
structure(list(CHK_ACCT = c(0, 1, 0, 0), DURATION = c(6, 48,
42, 24), HISTORY = c(4, 2, 2, 3), NEW_CAR = c(0, 0, 0, 1), USED_CAR = c(0,
0, 0, 0), FURNITURE = c(0, 0, 1, 0), `RADIO/TV` = c(1, 1, 0,
0), EDUCATION = c(0, 0, 0, 0), RETRAINING = c(0, 0, 0, 0), AMOUNT = c(1169,
5951, 7882, 4870), SAV_ACCT = c(4, 0, 0, 0), EMPLOYMENT = c(4,
2, 3, 2), INSTALL_RATE = c(4, 2, 2, 3), MALE_DIV = c(0, 0, 0,
0), MALE_SINGLE = c(1, 0, 1, 1), MALE_MAR_or_WID = c(0, 0, 0,
0), `CO-APPLICANT` = c(0, 0, 0, 0), GUARANTOR = c(0, 0, 1, 0),
PRESENT_RESIDENT = c(4, 2, 4, 4), REAL_ESTATE = c(1, 1, 0,
0), PROP_UNKN_NONE = c(0, 0, 0, 1), AGE = c(67, 22, 45, 53
), OTHER_INSTALL = c(0, 0, 0, 0), RENT = c(0, 0, 0, 0), OWN_RES = c(1,
1, 0, 0), NUM_CREDITS = c(2, 1, 1, 2), JOB = c(2, 2, 2, 2
), NUM_DEPENDENTS = c(1, 1, 2, 2), TELEPHONE = c(1, 0, 0,
0), FOREIGN = c(0, 0, 0, 0), DEFAULT = c(0, 1, 0, 1), CHK_ACCT_rec = c(1,
2, 1, 1), SAV_ACCT_rec = c(0, 1, 1, 1)), .Names = c("CHK_ACCT",
"DURATION", "HISTORY", "NEW_CAR", "USED_CAR", "FURNITURE", "RADIO/TV",
"EDUCATION", "RETRAINING", "AMOUNT", "SAV_ACCT", "EMPLOYMENT",
"INSTALL_RATE", "MALE_DIV", "MALE_SINGLE", "MALE_MAR_or_WID",
"CO-APPLICANT", "GUARANTOR", "PRESENT_RESIDENT", "REAL_ESTATE",
"PROP_UNKN_NONE", "AGE", "OTHER_INSTALL", "RENT", "OWN_RES",
"NUM_CREDITS", "JOB", "NUM_DEPENDENTS", "TELEPHONE", "FOREIGN",
"DEFAULT", "CHK_ACCT_rec", "SAV_ACCT_rec"), row.names = c(NA,
-4L), class = c("tbl_df", "tbl", "data.frame"))
# Model formula for the rpart tree: DEFAULT as the response, the remaining
# credit attributes as predictors.
# NOTE(review): every term is written as credit_train$<column>, so the terms
# refer to the credit_train vectors directly rather than to column names that
# could be looked up in another data frame. This is presumably why predict()
# warns "'newdata' had 271 rows but variables found have 729 rows" -- consider
# bare column names (DEFAULT ~ CHK_ACCT_rec + ...) together with
# data = credit_train.
myformula = credit_train$DEFAULT ~ credit_train$CHK_ACCT_rec +
credit_train$DURATION + credit_train$HISTORY + credit_train$NEW_CAR +
credit_train$USED_CAR + credit_train$FURNITURE + credit_train$`RADIO/TV` +
credit_train$EDUCATION + credit_train$RETRAINING + credit_train$AMOUNT +
credit_train$SAV_ACCT_rec + credit_train$EMPLOYMENT +
credit_train$INSTALL_RATE + credit_train$MALE_DIV + credit_train$MALE_SINGLE
# NOTE(review): the line above does not end with "+". When run as a top-level
# statement, the assignment is complete at that point, and the lines below are
# parsed as a separate expression starting with a unary "+", so their terms
# are not part of myformula -- confirm against the original script.
+ credit_train$MALE_MAR_or_WID + credit_train$`CO-APPLICANT` +
credit_train$GUARANTOR + credit_train$PRESENT_RESIDENT +
credit_train$REAL_ESTATE + credit_train$PROP_UNKN_NONE + credit_train$AGE +
credit_train$OTHER_INSTALL + credit_train$RENT + credit_train$OWN_RES +
credit_train$NUM_CREDITS + credit_train$JOB + credit_train$NUM_DEPENDENTS +
credit_train$TELEPHONE + credit_train$FOREIGN
@calimo, I hope this is what you needed.

ggplot2: how do I add a second plot line

Ok,
[R3.4.2 + ggplot2]
Using the data example listed below, how do I add a second data plot? I tried this example which I found on this site;
library(ggplot2)

# ** This is part of the original code ****
# Read the two input tables; the first row of each file holds column names.
rpt <- read.csv(file = "rpt.csv", header = TRUE)
rpt1 <- read.csv(file = "rpt1.csv", header = TRUE)

# *** code starts here *****
# Two line layers sharing the x/y mapping given in ggplot(); the constant
# strings mapped to color become the legend entries.
ggplot(rpt, aes(JulianDate, w)) +
  geom_line(aes(color = "First line")) +
  geom_line(data = rpt1, aes(color = "Second line")) +
  labs(color = "Legend text")
The first plot has x=rpt$JulianDate, y=rpt1$w; and the second plot has x1=rpt1$JDay and y2=rpt1$wolf.
The data (use dget(_) to read it):
structure(list(
JDay = c(57023, 57024, 57027, 57028, 57029, 57031, 57032, 57035, 57037),
Obs = c(1, 1, 1, 1, 1, 1, 1, 1, 1),
w = c(71, 105, 64, 44, 45, 38, 66, 49, 28),
WStd = c(0, 0, 0, 0, 0, 0, 0, 0, 0),
wolf = c(91.59, 135.45, 82.56, 56.76, 58.05, 49.02, 85.14, 63.21, 36.12),
Adj = c(0, 0, 0, 0, 0, 0, 0, 0, 0)),
.Names = c("JDay", "Obs", "w", "WStd", "wolf", "Adj"),
class = "data.frame",
row.names = c(NA, -9L))
In your comment, you say that rpt and rpt1 have the same data. Therefore, I think this is what you are asking for
library(ggplot2)

# Both series come from the same data frame, so x is mapped once in ggplot()
# and each line layer supplies its own y column. The constant strings mapped
# to color become the legend entries; labs() sets the legend title.
ggplot(data = rpt, mapping = aes(x = JDay)) +
  geom_line(mapping = aes(y = w, color = "First line")) +
  geom_line(mapping = aes(y = wolf, color = "Second line")) +
  labs(color = "Legend text")

Resources