Improve a ggplot visualization - R

I have a data frame from a bike-share (Bici) service. It looks like this, where Origen_Id is the station's number and Num_Viaje_Ori is the total number of trips that start at that station:
Origen_Id Num_F Num_M Num_Viaje_Ori Destino_Id Num_F_d Num_M_d Num_Viaje_Des
       11  1616  3973          5589         11    1395    3855          5250
       34   962  3232          4194         34    1340    4236          5576
       35  1321  3993          5314         35    1418    4239          5657
       50  1797  4293          6090         50    1785    4314          6099
       51  1891  5186          7077         51    3084    7771         10855
       52  1379  4320          5699         52    1299    3913          5212
       54  1275  3950          5225         54    1373    4046          5419
       75  1332  2939          4271         75    1202    2763          3965
      194  1346  3792          5138        194     632    1845          2477
      271  1511  3640          5151        271    1483    3750          5233
When I run

s <- ggplot(most, aes(x = Origen_Id, y = Num_Viaje_Ori)) + geom_bar(stat = "identity")

I get a plot whose bars are spread far apart along a continuous x-axis (screenshot omitted). How can I fix it? I mean, how can I make the bars sit closer together?

Implementing the commented suggestions (convert Origen_Id to a factor so ggplot uses a discrete x-axis instead of spreading the bars across a continuous one, and use geom_col() instead of geom_bar(stat = "identity")), you should get:
library(tidyverse)
library(ggthemes)

most <- tibble::tribble(
  ~Origen_Id, ~Num_F, ~Num_M, ~Num_Viaje_Ori, ~Destino_Id, ~Num_F_d, ~Num_M_d, ~Num_Viaje_Des,
         11L,  1616L,  3973L,          5589L,         11L,    1395L,    3855L,          5250L,
         34L,   962L,  3232L,          4194L,         34L,    1340L,    4236L,          5576L,
         35L,  1321L,  3993L,          5314L,         35L,    1418L,    4239L,          5657L,
         50L,  1797L,  4293L,          6090L,         50L,    1785L,    4314L,          6099L,
         51L,  1891L,  5186L,          7077L,         51L,    3084L,    7771L,         10855L,
         52L,  1379L,  4320L,          5699L,         52L,    1299L,    3913L,          5212L,
         54L,  1275L,  3950L,          5225L,         54L,    1373L,    4046L,          5419L,
         75L,  1332L,  2939L,          4271L,         75L,    1202L,    2763L,          3965L,
        194L,  1346L,  3792L,          5138L,        194L,     632L,    1845L,          2477L,
        271L,  1511L,  3640L,          5151L,        271L,    1483L,    3750L,          5233L
)

most %>%
  mutate(Origen_Id = as.factor(Origen_Id)) %>%
  ggplot(aes(x = Origen_Id, y = Num_Viaje_Ori)) +
  geom_col(fill = "darkslateblue") +
  ggthemes::theme_economist_white()
Created on 2021-11-23 by the reprex package (v2.0.1)
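If you prefer not to add a mutate step, an equivalent sketch (same data, untested variant) coerces Origen_Id to a factor inside aes(); a factor gives ggplot a discrete x-axis, which is what pulls the bars together:

# Coerce to factor inside the aesthetic mapping: the x-axis becomes
# discrete, so bars sit side by side instead of at positions 11..271
ggplot(most, aes(x = factor(Origen_Id), y = Num_Viaje_Ori)) +
  geom_col(fill = "darkslateblue")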

Related

Sorting a column in a data frame in R

I wanted to arrange the column "TotalConfirmedCases" in descending order, but it sorted in a weird way, e.g. 965 was placed first.
Code in R: new_Cor_table[rev(order(new_Cor_table$TotalConfirmedCases)),]
Output: (screenshot of the incorrectly sorted table omitted)
Update: thanks to input from @onyambu:
We could use order with decreasing=TRUE:
newdata <- df[order(df$TotalConfirmedCases, decreasing = TRUE),]
OR
If we want to do it with rev then here is the syntax:
newdata <- df[rev(order(df$TotalConfirmedCases)),]
newdata
County TotalConfirmedCases Totalprobablecases Totalcases Totaldeaths
3 Dakota 95277 23,252 118,529 792
7 Anoka 83623 20,459 104,082 808
26 Washington 57910 14,193 72,103 490
30 Stearns 50672 2,622 53,294 372
34 Olmsted 44718 1,048 45,766 191
36 St. Louis 43103 8,153 51,256 541
2 Douglas 9534 1,962 11,496 118
5 Isanti 8892 1,645 10,537 119
4 Morrison 8892 616 9,508 105
6 Freeborn 8753 679 9,432 77
8 Nicollet 8244 385 8,629 66
9 Becker 7877 1,292 9,169 95
11 Polk 7319 1,852 9,171 109
12 Carlton 7203 2,451 9,654 100
13 Mille Lacs 6962 578 7,540 116
15 Cass 6687 668 7,355 83
16 Todd 6605 486 7,091 61
17 Lyon 6503 759 7,262 74
18 Brown 6460 330 6,790 81
19 Le Sueur 6294 449 6,743 51
21 Pine 6141 1,319 7,460 68
22 Nobles 6025 1,044 7,069 60
23 Dodge 5916 144 6,060 22
24 Meeker 5803 361 6,164 75
25 Wabasha 5795 172 5,967 19
28 Waseca 5314 424 5,738 39
29 Martin 5273 549 5,822 65
31 Fillmore 4953 117 5,070 24
32 Hubbard 4579 556 5,135 60
33 Houston 4498 320 4,818 20
35 Roseau 4327 281 4,608 45
37 Faribault 3759 213 3,972 54
38 Redwood 3661 417 4,078 54
39 Wadena 3636 754 4,390 56
1 Kittson 965 109 1,074 28
10 Lake\tof the Woods 771 34 805 6
14 Red Lake 692 269 961 13
20 Cook 620 12 632 4
27 Traverse 577 313 890 10
data:
structure(list(County = c("Kittson", "Douglas", "Dakota", "Morrison",
"Isanti", "Freeborn", "Anoka", "Nicollet", "Becker", "Lake\tof the Woods",
"Polk", "Carlton", "Mille Lacs", "Red Lake", "Cass", "Todd",
"Lyon", "Brown", "Le Sueur", "Cook", "Pine", "Nobles", "Dodge",
"Meeker", "Wabasha", "Washington", "Traverse", "Waseca", "Martin",
"Stearns", "Fillmore", "Hubbard", "Houston", "Olmsted", "Roseau",
"St. Louis", "Faribault", "Redwood", "Wadena"), TotalConfirmedCases = c(965L,
9534L, 95277L, 8892L, 8892L, 8753L, 83623L, 8244L, 7877L, 771L,
7319L, 7203L, 6962L, 692L, 6687L, 6605L, 6503L, 6460L, 6294L,
620L, 6141L, 6025L, 5916L, 5803L, 5795L, 57910L, 577L, 5314L,
5273L, 50672L, 4953L, 4579L, 4498L, 44718L, 4327L, 43103L, 3759L,
3661L, 3636L), Totalprobablecases = c("109", "1,962", "23,252",
"616", "1,645", "679", "20,459", "385", "1,292", "34", "1,852",
"2,451", "578", "269", "668", "486", "759", "330", "449", "12",
"1,319", "1,044", "144", "361", "172", "14,193", "313", "424",
"549", "2,622", "117", "556", "320", "1,048", "281", "8,153",
"213", "417", "754"), Totalcases = c("1,074", "11,496", "118,529",
"9,508", "10,537", "9,432", "104,082", "8,629", "9,169", "805",
"9,171", "9,654", "7,540", "961", "7,355", "7,091", "7,262",
"6,790", "6,743", "632", "7,460", "7,069", "6,060", "6,164",
"5,967", "72,103", "890", "5,738", "5,822", "53,294", "5,070",
"5,135", "4,818", "45,766", "4,608", "51,256", "3,972", "4,078",
"4,390"), Totaldeaths = c(28L, 118L, 792L, 105L, 119L, 77L, 808L,
66L, 95L, 6L, 109L, 100L, 116L, 13L, 83L, 61L, 74L, 81L, 51L,
4L, 68L, 60L, 22L, 75L, 19L, 490L, 10L, 39L, 65L, 372L, 24L,
60L, 20L, 191L, 45L, 541L, 54L, 54L, 56L)), class = "data.frame", row.names = c(NA,
-39L))
I suggest using the rank function; with a negative sign it will reverse the order:
new_Cor_table[order(-rank(new_Cor_table$TotalConfirmedCases)), ]
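Since TotalConfirmedCases is numeric in the dput above, negating the column itself inside order() is an equally valid sketch of the same idea:

# Descending sort without rank(): negate the numeric column directly
new_Cor_table[order(-new_Cor_table$TotalConfirmedCases), ]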

Calculate average based on columns in 2 dataframes and their values via mutate in R?

I have a dataframe structure that calculates the count of each Response.Status per month with this mutate/pivot pipeline:
DF1 <- complete_df %>%
  mutate(Month = format(as.Date(date, format = "%Y/%m/%d"), "%m/%Y"),
         UNSUBSCRIBE = if_else(UNSUBSCRIBE == "TRUE", "UNSUBSCRIBE", NA_character_)) %>%
  pivot_longer(c(Response.Status, UNSUBSCRIBE), values_to = "Response.Status") %>%
  drop_na() %>%
  count(Month, Response.Status) %>%
  pivot_wider(names_from = Month, names_sep = "/", values_from = n)
# A tibble: 7 x 16
Response.Status `01/2020` `02/2020` `03/2020` `04/2020` `05/2020` `06/2020` `07/2020` `08/2020` `09/2019` `09/2020` `10/2019` `10/2020` `11/2019` `11/2020` `12/2019`
<chr> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
1 EMAIL_OPENED 1068 3105 4063 4976 2079 1856 4249 3638 882 4140 865 2573 1167 684 862
2 NOT_RESPONDED 3187 9715 13164 15239 5458 4773 12679 10709 2798 15066 2814 8068 3641 1931 2647
3 PARTIALLY_SAVED 5 34 56 8 28 22 73 86 11 14 7 23 8 8 2
4 SUBMITTED 216 557 838 828 357 310 654 621 214 1001 233 497 264 122 194
5 SURVEY_OPENED 164 395 597 1016 245 212 513 625 110 588 123 349 202 94 120
6 UNDELIVERED_OR_BOUNCED 92 280 318 260 109 127 319 321 63 445 69 192 93 39 74
7 UNSUBSCRIBE 397 1011 1472 1568 727 737 1745 2189 372 1451 378 941 429 254 355
What I would like to do is take the values in that table and calculate an average based on the number of people in each Response.Status group.
structure(list(Response.Status = c("EMAIL_OPENED", "NOT_RESPONDED",
"PARTIALLY_SAVED", "SUBMITTED", "SURVEY_OPENED", "UNDELIVERED_OR_BOUNCED"
), `01/2020` = c(1068L, 3187L, 5L, 216L, 164L, 92L), `02/2020` = c(3105L,
9715L, 34L, 557L, 395L, 280L), `03/2020` = c(4063L, 13164L, 56L,
838L, 597L, 318L), `04/2020` = c(4976L, 15239L, 8L, 828L, 1016L,
260L), `05/2020` = c(2079L, 5458L, 28L, 357L, 245L, 109L), `06/2020` = c(1856L,
4773L, 22L, 310L, 212L, 127L), `07/2020` = c(4249L, 12679L, 73L,
654L, 513L, 319L), `08/2020` = c(3638L, 10709L, 86L, 621L, 625L,
321L), `09/2019` = c(882L, 2798L, 11L, 214L, 110L, 63L), `09/2020` = c(4140L,
15066L, 14L, 1001L, 588L, 445L), `10/2019` = c(865L, 2814L, 7L,
233L, 123L, 69L), `10/2020` = c(2573L, 8068L, 23L, 497L, 349L,
192L), `11/2019` = c(1167L, 3641L, 8L, 264L, 202L, 93L), `11/2020` = c(684L,
1931L, 8L, 122L, 94L, 39L), `12/2019` = c(862L, 2647L, 2L, 194L,
120L, 74L)), row.names = c(NA, -6L), class = c("tbl_df", "tbl",
"data.frame"))
I made a separate table that contains the number of people per group name:
Response.Status
EMAIL_OPENED : 451
NOT_RESPONDED : 1563
PARTIALLY_SAVED : 4
SUBMITTED : 71
SURVEY_OPENED : 53
UNDELIVERED_OR_BOUNCED: 47
UNSUBSCRIBE: 135
If I understood your problem correctly, you have 2 data.frames/tibbles: one shown in the "structure" part and one that gives the quantity of people/users per response status. Now you want to get the value per person. If so, this is a possible solution:
# people/users data set
df2 <- data.frame(
  Response.Status = c("EMAIL_OPENED", "NOT_RESPONDED", "PARTIALLY_SAVED", "SUBMITTED",
                      "SURVEY_OPENED", "UNDELIVERED_OR_BOUNCED", "UNSUBSCRIBE"),
  PEOPLE = c(451, 1563, 4, 71, 53, 47, 135)
)

df %>% # this is your "structure"
  tidyr::pivot_longer(-Response.Status, names_to = "DATE", values_to = "nmbr") %>%
  dplyr::group_by(Response.Status) %>%
  dplyr::summarise(SUM = sum(nmbr)) %>%
  dplyr::inner_join(df2, by = "Response.Status") %>%
  dplyr::mutate(MEAN_PP = SUM / PEOPLE)
Response.Status SUM PEOPLE MEAN_PP
<chr> <int> <dbl> <dbl>
1 EMAIL_OPENED 36207 451 80.3
2 NOT_RESPONDED 111889 1563 71.6
3 PARTIALLY_SAVED 385 4 96.2
4 SUBMITTED 6906 71 97.3
5 SURVEY_OPENED 5353 53 101
6 UNDELIVERED_OR_BOUNCED 2801 47 59.6
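If you would rather avoid the reshape, a rowSums() sketch produces the same totals, assuming (as in the dput) that every column except Response.Status is a month count:

# Row-wise totals without pivoting; df is the tibble from the dput above
df %>%
  dplyr::mutate(SUM = rowSums(dplyr::across(-Response.Status))) %>%
  dplyr::select(Response.Status, SUM) %>%
  dplyr::inner_join(df2, by = "Response.Status") %>%
  dplyr::mutate(MEAN_PP = SUM / PEOPLE)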

Finding Unique per group with filter

My data looks like this:
date schedule_id food_truck_id building_id truck_status last_confirmed_date dsle
2018-04-26 422 58 30 accepted_event 0 31
2018-04-26 422 59 30 accepted_event 2018-02-27 11
2018-04-26 422 65 30 accepted_event 2018-03-15 12
2018-04-26 422 88 30 accepted_event 2018-02-20 7
2018-04-26 422 89 30 accepted_event 2018-03-22 13
2018-04-26 422 101 30 accepted_event 2018-02-06 16
2018-04-26 422 120 30 accepted_event 2018-03-06 14
2018-04-26 422 135 30 accepted_event 2018-03-13 21
2018-04-26 399 42 33 accepted_event 2018-03-15 8
2018-04-26 399 58 33 accepted_event 0 31
2018-04-26 399 59 33 accepted_event 2018-03-01 11
2018-04-26 399 65 33 accepted_event 2018-02-27 12
2018-04-26 399 88 33 accepted_event 0 7
Can be reproduced using:
structure(list(date = structure(c(17647, 17647, 17647, 17647,
17647, 17647, 17647, 17647, 17647, 17647, 17647, 17647, 17647,
17647, 17647, 17647, 17647), class = "Date"), schedule_id = c(422L,
422L, 422L, 422L, 422L, 422L, 422L, 422L, 399L, 399L, 399L, 399L,
399L, 399L, 399L, 399L, 399L), food_truck_id = c(58L, 59L, 65L,
88L, 89L, 101L, 120L, 135L, 42L, 58L, 59L, 65L, 88L, 89L, 101L,
120L, 135L), building_id = c(30L, 30L, 30L, 30L, 30L, 30L, 30L,
30L, 33L, 33L, 33L, 33L, 33L, 33L, 33L, 33L, 33L), truck_status = c("accepted_event",
"accepted_event", "accepted_event", "accepted_event", "accepted_event",
"accepted_event", "accepted_event", "accepted_event", "accepted_event",
"accepted_event", "accepted_event", "accepted_event", "accepted_event",
"accepted_event", "accepted_event", "accepted_event", "accepted_event"
), last_confirmed_date = c("0", "2018-02-27", "2018-03-15", "2018-02-20",
"2018-03-22", "2018-02-06", "2018-03-06", "2018-03-13", "2018-03-15",
"0", "2018-03-01", "2018-02-27", "0", "2018-03-06", "2018-03-13",
"0", "2018-02-22"), dsle = c(31, 11, 12, 7, 13, 16, 14, 21, 8,
31, 11, 12, 7, 13, 16, 14, 21)), .Names = c("date", "schedule_id",
"food_truck_id", "building_id", "truck_status", "last_confirmed_date",
"dsle"), row.names = c(142L, 223L, 379L, 455L, 495L, 589L, 806L,
877L, 63L, 155L, 215L, 287L, 452L, 483L, 667L, 809L, 894L), class = "data.frame")
My goal is to select the food_truck_id with max(dsle), but it should be unique per date. For instance, for schedule_id 422 the food_truck_id with max(dsle) is 58, and it is also 58 for schedule_id 399.
What I want is: for 422 it is 58, but for 399 it should be the next max(dsle) other than 58.
I have tried the following, but it doesn't give what I want:
testxx %>%
  group_by(schedule_id) %>%
  distinct(food_truck_id, date, dsle) %>%
  filter(dsle == max(dsle))
The result I want is the following:
date schedule_id food_truck_id
2018-04-26 422 58
2018-04-26 399 135
because, after 58, 135 has the max(dsle).
Updated to account for date
This might be one of those occasions where a loop is the best/easiest solution. However, it does a join operation inside the loop, so there are optimisations that could be made.
The idea is to loop over each schedule_id and keep track of which food_trucks have already been used on which date.
Some pre-arranging of the data before the loop makes things easier:
library(dplyr)

df <- df %>%
  arrange(schedule_id, -dsle)

## pre-allocate a result data.frame
ids <- unique(df$schedule_id)
df_res <- data.frame(schedule_id = ids,
                     food_truck_id = NA)

## running log of trucks already assigned on each date
usedTrucks <- data.frame(date = as.Date(NA),
                         schedule_id = ids,
                         food_truck_id = NA_integer_)

counter <- 1
for (i in ids) {
  possibleTrucks <- df[df$schedule_id %in% i, c("date", "food_truck_id")]
  ## possibleTrucks is already sorted by dsle (descending), as we
  ## pre-arranged the data; use the first one that hasn't already
  ## been used on the given date
  possibleTrucks <- anti_join(possibleTrucks, usedTrucks, by = c("date", "food_truck_id"))
  thisTruck <- possibleTrucks[1, c("food_truck_id", "date")]
  df_res[counter, "food_truck_id"] <- thisTruck$food_truck_id
  usedTrucks[counter, "food_truck_id"] <- thisTruck$food_truck_id
  usedTrucks[counter, "date"] <- thisTruck$date
  counter <- counter + 1
}

df_res
# schedule_id food_truck_id
# 1 399 58
# 2 422 135
If speed is an issue on a larger data set this can be re-written in Rcpp to make it much faster.
An alternative dplyr approach: take the top row for the highest schedule_id first, then the best remaining row that shares neither its schedule_id nor its dsle:

p <- df %>%
  arrange(desc(schedule_id), desc(dsle)) %>%
  slice(1) %>%
  select(date, dsle, schedule_id, food_truck_id)

df %>%
  subset(!(schedule_id %in% c(p))) %>%
  subset(!(dsle %in% c(p))) %>%
  select(date, dsle, schedule_id, food_truck_id) %>%
  arrange(desc(dsle)) %>%
  slice(1) %>%
  rbind(p, .) %>%
  select(-dsle)

Output:
# A tibble: 2 x 3
date schedule_id food_truck_id
<date> <int> <int>
1 2018-04-26 422 58
2 2018-04-26 399 135
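For the special case of exactly two schedule_ids, a more explicit sketch of the same greedy idea is to pick the globally best (by dsle) row first and then the best row for the other schedule that does not reuse that truck; untested, assuming the dput'd df above:

library(dplyr)

# Globally best (schedule, truck) pair by dsle
best <- df %>%
  arrange(desc(dsle)) %>%
  slice(1)

# Best truck for the other schedule, excluding the truck already taken
second <- df %>%
  filter(schedule_id != best$schedule_id,
         food_truck_id != best$food_truck_id) %>%
  arrange(desc(dsle)) %>%
  slice(1)

bind_rows(best, second) %>%
  select(date, schedule_id, food_truck_id)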

Vectorizing a for-loop that eliminates duplicate data in dataframe R

I am working with a difficult data manipulation question in R. I am currently using a for-loop to approach the problem; however, I would like to vectorize it so it scales better. I have the following dataframe to work with:
dput(mydf)
structure(list(team_id = c(14L, 14L, 7L, 7L, 21L, 21L, 15L, 15L
), opp_team_id = c(7L, 7L, 14L, 14L, 15L, 15L, 21L, 21L), pg = c(3211L,
3211L, 786L, 786L, 3914L, 644L, 1524L, 593L), sg = c(653L, 4122L,
1512L, 1512L, 2593L, 10L, 54L, 54L), sf = c(4122L, 1742L, 2347L,
2347L, 1352L, 3378L, 2843L, 1062L), pf = c(1742L, 886L, 79L,
1134L, 687L, 1352L, 1376L, 1376L), c = c(3014L, 2604L, 2960L,
2960L, 21L, 3216L, 1256L, 3017L), opp_pg = c(3982L, 3982L, 3211L,
4005L, 1524L, 1524L, 3914L, 644L), opp_sg = c(786L, 2347L, 653L,
653L, 54L, 802L, 2593L, 10L), opp_sf = c(1134L, 1134L, 4122L,
1742L, 1062L, 1062L, 3105L, 3105L), opp_pf = c(183L, 183L, 1742L,
886L, 3017L, 1376L, 3216L, 2135L), opp_c = c(2475L, 2960L, 3138L,
3138L, 1256L, 3017L, 21L, 1957L)), .Names = c("team_id", "opp_team_id",
"pg", "sg", "sf", "pf", "c", "opp_pg", "opp_sg", "opp_sf", "opp_pf",
"opp_c"), row.names = c(NA, -8L), class = "data.frame")
mydf
team_id opp_team_id pg sg sf pf c opp_pg opp_sg opp_sf opp_pf opp_c
1 14 7 3211 653 4122 1742 3014 3982 786 1134 183 2475
2 14 7 3211 4122 1742 886 2604 3982 2347 1134 183 2960
3 7 14 786 1512 2347 79 2960 3211 653 4122 1742 3138
4 7 14 786 1512 2347 1134 2960 4005 653 1742 886 3138
5 21 15 3914 2593 1352 687 21 1524 54 1062 3017 1256
6 21 15 644 10 3378 1352 3216 1524 802 1062 1376 3017
7 15 21 1524 54 2843 1376 1256 3914 2593 3105 3216 21
8 15 21 593 54 1062 1376 3017 644 10 3105 2135 1957
Based on my problem at hand, rows 3-4 and 7-8 are duplicates in this dataframe. Rows 3-4 are duplicates of rows 1-2, and rows 7-8 are duplicates on rows 5-6. This is sports data, and rows 3-4 are essentially rows 1 and 2 except with the team_id and opp_team_id switched, and the same for the other 10 columns (for the most part).
Here is my for-loop for removing duplicates, which I think is quite creative, but is a for-loop nonetheless:
indices <- c(1)
TFSwitch <- TRUE
for (i in 2:nrow(mydf)) {
  last_row <- mydf$team_id[i - 1]
  this_row <- mydf$team_id[i]
  TFSwitch <- ifelse(last_row != this_row, !TFSwitch, TFSwitch)
  if (TFSwitch == TRUE) {
    indices <- c(indices, i)
  }
}
This for-loop checks whether the team_id column changes from row to row; if it does, it toggles TFSwitch between TRUE and FALSE. It then collects the indices I want to keep in a vector.
I would like to vectorize this - any thoughts would be greatly appreciated!
This is very similar to previous problems involving pairwise duplicate removal like: (pair-wise duplicate removal from dataframe). So following a similar procedure, and adding a little merge() back to get the indices, you can do:
vars <- c("team_id", "opp_team_id")
mx <- do.call(pmax, mydf[vars])
mn <- do.call(pmin, mydf[vars])
merge(
  cbind(mydf[vars], ind = seq_len(nrow(mydf))),
  mydf[!duplicated(data.frame(mx, mn)), vars]
)[, "ind"]
# [1] 1 2 5 6
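The heart of the trick is that (pmin, pmax) is an orientation-independent key for each pairing, so duplicated() can spot mirrored rows. A base-R sketch of the same idea without the merge(), keeping every row whose team_id matches the first orientation seen for its pair:

# Build an unordered-pair key, then keep rows matching each
# pair's first-seen orientation
key <- paste(pmin(mydf$team_id, mydf$opp_team_id),
             pmax(mydf$team_id, mydf$opp_team_id))
first_team <- ave(mydf$team_id, key, FUN = function(x) x[1])
which(mydf$team_id == first_team)
# [1] 1 2 5 6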
Here is the same solution using data.table. My understanding is that you want to remove duplicates by pairs, not just find unique indices.
library(data.table)
setDT(mydf)
mydf[, c("id1", "id2") := list(pmax(team_id, opp_team_id), pmin(team_id, opp_team_id))]
setkey(mydf, team_id, opp_team_id)[unique(mydf, by = c("id1", "id2"))]

Boxplot using summary instead of raw data

I am still new to ggplot2. I want to plot a box plot, but instead of the raw data I have summary statistics.
Page_Type ID Count min 5% 25% 50% 75% 95% Max Avg
3 24559 173 408 479.45 615.25 800.5 1547.25 4436.8 7068 1350.138462
3 24560 101 0 480 631 871 1762 5183 65177 2702.245902
6 24559 69 490 664 1181 1807 3221 4845.5 6397 2287.45098
6 24560 10 1086 1254.4 1928 1970 2007 5236.6 6044 2607
46 24559 49 217 252.45 438.75 595 1198 2647.15 4316 939.6666667
46 24560 31 266 337 467 640 1123 2531.6 5232 989.2758621
69 24559 424 644 761.8 957 1292 2212 4938.6 11246 1881.785467
69 24560 216 601 848.85 1060.25 1488.5 2465 5314.7 7981 2094.007692
82 24559 62 922 1018.2 1305 1534 1966 3313.8 22461 2325.810811
82 24560 137 630 926.6 1156 1468 2281 3764.6 11364 1922.252632
the dput output is as follows:
structure(list(Page_Type = c(3L, 3L, 6L, 6L, 46L, 46L, 69L, 69L,
82L, 82L), ID = c(24559L, 24560L, 24559L, 24560L, 24559L, 24560L,
24559L, 24560L, 24559L, 24560L), Count = c(173L, 101L, 69L, 10L,
49L, 31L, 424L, 216L, 62L, 137L), min = c(408L, 0L, 490L, 1086L,
217L, 266L, 644L, 601L, 922L, 630L), X5. = c(479.45, 480, 664,
1254.4, 252.45, 337, 761.8, 848.85, 1018.2, 926.6), X25. = c(615.25,
631, 1181, 1928, 438.75, 467, 957, 1060.25, 1305, 1156), X50. = c(800.5,
871, 1807, 1970, 595, 640, 1292, 1488.5, 1534, 1468), X75. = c(1547.25,
1762, 3221, 2007, 1198, 1123, 2212, 2465, 1966, 2281), X95. = c(4436.8,
5183, 4845.5, 5236.6, 2647.15, 2531.6, 4938.6, 5314.7, 3313.8,
3764.6), Max = c(7068L, 65177L, 6397L, 6044L, 4316L, 5232L, 11246L,
7981L, 22461L, 11364L), Avg = c(1350.138462, 2702.245902, 2287.45098,
2607, 939.6666667, 989.2758621, 1881.785467, 2094.007692, 2325.810811,
1922.252632)), .Names = c("Page_Type", "ID", "Count", "min",
"X5.", "X25.", "X50.", "X75.", "X95.", "Max", "Avg"), class = "data.frame", row.names = c(NA,
-10L))
There are 5 page types and each page type has 2 IDs. I want to show the various summary metrics (min, 5%, 25%, ...) as a box plot. I am OK with skipping the 5% and 95% data points to fit the more traditional look. How do I create a box plot from this data?
There is also a Count column that shows how many points were used to compute the summary. If this can be overlaid on the same plot, great; otherwise it can be a separate plot as well.
You can make a boxplot with geom_boxplot() by providing your own ymin, lower, middle, upper and ymax values; in this case you should add stat="identity" inside geom_boxplot().
ggplot(df, aes(x = as.factor(Page_Type),
               ymin = min, lower = X25., middle = X50., upper = X75., ymax = Max,
               fill = as.factor(ID))) +
  geom_boxplot(stat = "identity")
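To overlay the Count column mentioned in the question, one possible sketch (untested, same df) is a dodged text layer placed just above each box:

# Label each box with its Count just above the top whisker
ggplot(df, aes(x = as.factor(Page_Type), fill = as.factor(ID))) +
  geom_boxplot(aes(ymin = min, lower = X25., middle = X50.,
                   upper = X75., ymax = Max),
               stat = "identity", position = position_dodge(width = 0.9)) +
  geom_text(aes(y = Max, label = Count, group = as.factor(ID)),
            position = position_dodge(width = 0.9), vjust = -0.5)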
