GGplot boxplot and dotplot side by side [duplicate] - r

This question already has answers here:
How to plot a hybrid boxplot: half boxplot with jitter points on the other half?
(3 answers)
Closed 4 years ago.
I'm using R and ggplot2, and I want to make a graph in which the boxes (from geom_boxplot) and the points (from geom_dotplot) sit next to one another, not on top of one another.
The picture shows what I've managed to do for now, which is pretty much what I'd like to have except that, instead of the pink boxes, I want just the points from the geom_dotplot
# Asker's current attempt: one box per Organ/Crop combination, split into
# one panel per Csoil value, with the y axis on a log10 scale titled "BCF".
# No rows are filtered, so every Crop value present in df2 gets a box.
df2%>%ggplot(aes(factor(Organ), value,fill=Crop)) +
geom_boxplot() +
# The dotplot layer is still disabled here:
#geom_dotplot(binaxis='y',stackdir = 'center')
scale_y_log10("BCF") + facet_wrap(~Csoil)+theme(axis.title.x=element_blank())
This is a subset of the data that I'm using:
df2=structure(list(variable = structure(c(8L, 2L, 6L, 14L, 5L, 14L,
3L, 8L, 5L, 6L, 2L, 13L, 5L, 7L, 13L, 3L, 8L, 10L, 10L, 2L, 2L,
8L, 6L, 9L, 10L, 14L, 14L, 1L, 4L, 13L, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA), .Label = c("BCF leaf (CsoilMax) - Ryegrass",
"BCF root (CsoilMax) - Ryegrass", "BCF root (CsoilMed) - Ryegrass",
"BCF stem (CsoilMax) - Ryegrass", "BCF stem (CsoilMed) - Ryegrass",
"BCF leaf (CsoilMed) - Ryegrass", "BCF fruit (CsoilMax) - Maize",
"BCF fruit (CsoilMed) - Maize", "BCF leaf (CsoilMax) - Maize",
"BCF leaf (CsoilMed) - Maize", "BCF root (CsoilMax) - Maize",
"BCF root (CsoilMed) - Maize", "BCF stem (CsoilMax) - Maize",
"BCF stem (CsoilMed) - Maize"), class = "factor"), value = c(0.209606259766772,
0.506401960143269, 0.000660265316109507, 6.51930210129075, 0.0207635735841085,
0.765736181143394, 1.40601301731962, 0.00743553520916094, 0.040587560806454,
1.60441689044071, 0.522063333823462, 0.540632385379595, 0.0497467701708571,
0.0573435549206478, 0.0109218792177719, 0.608263306408034, 0.911628697794879,
0.0874473327218576, 0.105726788757446, 0.267232334133824, 1.3297503892306,
0.292440363142525, 0.037969380254565, 0.000157551659778798, 0.237462116816578,
0.0447547790805731, 1.29271738284098, 0.000673871813931306, 0.00388673588576815,
0.0223297222656565, 6.82, 11.53, 5.39, 5, 5, 0.005, 107.984,
26.987, 0.005, 0.005, 132.28), Crop = c("Maize", "Ryegrass",
"Ryegrass", "Maize", "Ryegrass", "Maize", "Ryegrass", "Maize",
"Ryegrass", "Ryegrass", "Ryegrass", "Maize", "Ryegrass", "Maize",
"Maize", "Ryegrass", "Maize", "Maize", "Maize", "Ryegrass", "Ryegrass",
"Maize", "Ryegrass", "Maize", "Maize", "Maize", "Maize", "Ryegrass",
"Ryegrass", "Maize", "Literature", "Literature", "Literature",
"Literature", "Literature", "Literature", "Literature", "Literature",
"Literature", "Literature", "Literature"), Csoil = c("Median soil C",
"Maximum soil C", "Median soil C", "Median soil C", "Median soil C",
"Median soil C", "Median soil C", "Median soil C", "Median soil C",
"Median soil C", "Maximum soil C", "Maximum soil C", "Median soil C",
"Maximum soil C", "Maximum soil C", "Median soil C", "Median soil C",
"Median soil C", "Median soil C", "Maximum soil C", "Maximum soil C",
"Median soil C", "Median soil C", "Maximum soil C", "Median soil C",
"Median soil C", "Median soil C", "Maximum soil C", "Maximum soil C",
"Maximum soil C", "Median soil C", "Median soil C", "Median soil C",
"Median soil C", "Median soil C", "Median soil C", "Median soil C",
"Median soil C", "Median soil C", "Median soil C", "Median soil C"
), Organ = c("Fruits", "Roots", "Leaves", "Stem", "Stem", "Stem",
"Roots", "Fruits", "Stem", "Leaves", "Roots", "Stem", "Stem",
"Fruits", "Stem", "Roots", "Fruits", "Leaves", "Leaves", "Roots",
"Roots", "Fruits", "Leaves", "Leaves", "Leaves", "Stem", "Stem",
"Leaves", "Stem", "Stem", "Leaves lit.", "Leaves lit.", "Roots lit.",
"Roots lit.", "Roots lit.", "Fruits lit.", "Fruits lit.", "Fruits lit.",
"Fruits lit.", "Fruits lit.", "Fruits lit.")), .Names = c("variable",
"value", "Crop", "Csoil", "Organ"), row.names = c(NA, -41L), class = "data.frame")

The solution was just filtering the data within ggplot.
This is the code of the graph:
# Boxes for every crop except "Literature", black dots for the "Literature"
# rows. Each layer receives its own filtered copy of the plot data.
df2 %>%
  ggplot(aes(factor(Organ), value, fill = Crop)) +
  geom_boxplot(data = . %>% filter(Crop != "Literature")) +
  geom_dotplot(
    data = . %>% filter(Crop == "Literature"),
    binaxis = "y",
    stackdir = "center",
    dotsize = 0.35,
    fill = "black"
  ) +
  scale_y_log10("BCF") +
  facet_wrap(~Csoil) +
  theme(axis.title.x = element_blank())

Not a real answer but a suggestion: I tend to prefer geom_jitter(size = 0.5, width = 0.2) to overlay dots on boxplots, and use the outlier.shape = NA argument in geom_boxplot (rather than outlier.size = NULL, which does not hide the outlier points) so that the dots are not duplicated.

Related

How to replace NA values in dataframe's column with my own values in R?

I want to replace the NA values in the column "Outlet_Size" with the mode of "Outlet_Size" for the corresponding value of the column "Outlet_Type".
if it helps, I am using this dataset
edit:
> dput(rbind(head(data[is.na(data$Outlet_Type),]),head(data[!is.na(data$Outlet_Type),])))
structure(list(Item_Identifier = c("FDW58", "FDW14", "NCN55",
"FDQ58", "FDY38", "FDH56"), Item_Weight = c(20.75, 8.3, 14.6,
7.315, 12.6956333687566, 9.8), Item_Fat_Content = c("Low Fat",
"Regular", "Low Fat", "Low Fat", "Regular", "Regular"), Item_Visibility = c(0.007564836,
0.038427677, 0.099574908, 0.015388393, 0.118599314, 0.063817206
), Item_Type = c("Snack Foods", "Dairy", "Others", "Snack Foods",
"Dairy", "Fruits and Vegetables"), Item_MRP = c(107.8622, 87.3198,
241.7538, 155.034, 234.23, 117.1492), Outlet_Identifier = c("OUT049",
"OUT017", "OUT010", "OUT017", "OUT027", "OUT046"), Outlet_Establishment_Year = c(1999L,
2007L, 1998L, 2007L, 1985L, 1997L), Outlet_Size = c("Medium",
NA, NA, NA, "Medium", "Small"), Outlet_Location_Type = c("Tier 1",
"Tier 2", "Tier 3", "Tier 2", "Tier 3", "Tier 1"), Outlet_Type = c("Supermarket Type1",
"Supermarket Type1", "Grocery Store", "Supermarket Type1", "Supermarket Type3",
"Supermarket Type1")), row.names = c(NA, -6L), class = "data.frame")
> modes
Outlet_Type mode
<chr> <chr>
1 Grocery Store Small
2 Supermarket Type1 Small
3 Supermarket Type2 Medium
4 Supermarket Type3 Medium
I tried,
> data2$Outlet_Size[which(is.na(data2$Outlet_Size))]<-modes$Outlet_Size[data$Outlet_Type == modes$Outlet_Type]
Warning messages:
1: In data$Outlet_Type == modes$mode :
longer object length is not a multiple of shorter object length
2: In data2$Outlet_Size[which(is.na(data2$Outlet_Size))] <- modes$Outlet_Size[data$Outlet_Type == :
number of items to replace is not a multiple of replacement length
but it doesn't seem to work as intended. However, it does remove NA values by 2 every time I run this
> sum(is.na(data))
[1] 1606
> sum(is.na(data2))
[1] 1600

ggplot by group with filter()

I have big dataset with the following format:
structure(list(LOCATION = c("CAN", "CAN", "CAN", "CAN", "CAN",
"CAN", "CAN", "CAN", "CAN", "CAN"), Country = c("Canada", "Canada",
"Canada", "Canada", "Canada", "Canada", "Canada", "Canada", "Canada",
"Canada"), SUBJECT = c("ULABUL99", "ULABUL99", "ULABUL99", "ULABUL99",
"ULABUL99", "ULABUL99", "ULABUL99", "ULABUL99", "ULABUL99", "ULABUL99"
), Subject = c("Unit Labour Cost", "Unit Labour Cost", "Unit Labour Cost",
"Unit Labour Cost", "Unit Labour Cost", "Unit Labour Cost", "Unit Labour Cost",
"Unit Labour Cost", "Unit Labour Cost", "Unit Labour Cost"),
SECTOR = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), Sector = c("Total Economy",
"Total Economy", "Total Economy", "Total Economy", "Total Economy",
"Total Economy", "Total Economy", "Total Economy", "Total Economy",
"Total Economy"), MEASURE = c("ST", "ST", "ST", "ST", "ST",
"ST", "ST", "ST", "ST", "ST"), Measure = c("Level, ratio or national currency",
"Level, ratio or national currency", "Level, ratio or national currency",
"Level, ratio or national currency", "Level, ratio or national currency",
"Level, ratio or national currency", "Level, ratio or national currency",
"Level, ratio or national currency", "Level, ratio or national currency",
"Level, ratio or national currency"), FREQUENCY = c("A",
"A", "A", "A", "A", "A", "A", "A", "A", "A"), Frequency = c("Annual",
"Annual", "Annual", "Annual", "Annual", "Annual", "Annual",
"Annual", "Annual", "Annual"), TIME = 1970:1979, Time = 1970:1979,
Value = c(0.1304592, 0.1357066, 0.1430287, 0.1521136, 0.1752398,
0.2018611, 0.2193767, 0.2347496, 0.2470616, 0.2663881), Flag.Codes = c("E",
"E", "E", "E", "E", "E", "E", "E", "E", "E"), Flags = c("Estimated value",
"Estimated value", "Estimated value", "Estimated value",
"Estimated value", "Estimated value", "Estimated value",
"Estimated value", "Estimated value", "Estimated value")), row.names = c(NA,
10L), class = "data.frame")
And I want to draw time plot like the following (for each sector group in a particular country's particular subject, in this case, Germany's Labour Income Share)
I tried to code as follows:
# Asker's attempt, kept verbatim. According to the question it draws no plot.
# NOTE(review): filter() is used below in dplyr style, but only ggplot2 and
# tidyr are attached in this snippet - confirm dplyr is loaded elsewhere.
library(ggplot2)
library(tidyr)
df <- read.csv("/Users/ulc.csv", header = TRUE)
# Built from the unfiltered data, so it has one entry per row of the full df,
# not per row that survives the filter below.
fsector = factor(df$SECTOR)
df %>%
filter(df$MEASURE =="ST",
df$SUBJECT == "ULAIRU99",
df$LOCATION == "DEU") %>%
# The aesthetics read df$year and df$value from the original data frame rather
# than columns of the piped, filtered data; the sample shown in the question has
# TIME/Time and Value columns - TODO confirm year/value exist in the CSV.
ggplot(aes(x = df$year, y = df$value, color = fsector, linetype = fsector)) +
# Only scales, theme and axis titles follow; no geom_*() layer is added.
scale_color_manual(labels=c("Sec 1","Sec 2", "Sec 3", "Sec 4", "Sec 5", "Sec 6", "Sec 7", "Sec 8"), values = 1:8) +
scale_linetype_manual(labels=c("Sec 1","Sec 2", "Sec 3", "Sec 4", "Sec 5", "Sec 6", "Sec 7", "Sec 8"), values = 1:8) +
theme(legend.position = c(0.8, 0.3), legend.title = element_blank()) +
ylab("LIS of Germany by sector") + xlab("year")
But the result does not show any plot, and it seems like a lot of elements are missing in my code. Maybe I should add geom_line() for each sector? But there seems to be a much simpler way. Any help would be appreciated.
You can try the following code -
library(dplyr)
library(ggplot2)

# Legend entries, reused for both the colour and the linetype scale.
sector_labels <- c(
  "Sec 1", "Sec 2", "Sec 3", "Sec 4", "Sec 5", "Sec 6", "Sec 7", "Sec 8"
)

# Keep the rows with MEASURE "ST", SUBJECT "ULAIRU99" and LOCATION "DEU",
# turn SECTOR into a factor, then draw one line per sector over TIME.
df %>%
  filter(MEASURE == "ST", SUBJECT == "ULAIRU99", LOCATION == "DEU") %>%
  mutate(SECTOR = factor(SECTOR)) %>%
  ggplot(aes(x = TIME, y = Value, color = SECTOR, linetype = SECTOR)) +
  geom_line() +
  scale_color_manual(labels = sector_labels, values = 1:8) +
  scale_linetype_manual(labels = sector_labels, values = 1:8) +
  theme(legend.position = c(0.8, 0.3), legend.title = element_blank()) +
  ylab("LIS of Germany by sector") +
  xlab("year")

Apply a function to particular rows

I want to apply the following code to only the first 3 rows (if it's applied to the second 3, it fails to parse): netflix_and_disney$release_year <-year(dmy(netflix_and_disney$release_year))
Is there a way about doing this with this df?
structure(list(show_id = c("00147800", "07019028", "00115433", "70234439", "80058654", "80125979"), title = c("10 Things I Hate About You", "101 Dalmatian Street", "101 Dalmatians", "Transformers Prime", "Transformers: Robots in Disguise", "#realityhigh"), type = c("Movie", "Tv Show", "Movie", "Tv Show", "Tv Show", "Movie"), rating = c("PG-13", "N/A", "G", "TV-Y7-FV", "TV-Y7", "TV-14"), release_year = c("31 Mar 1999", "25 Mar 2019", "27 Nov 1996", "2013", "2016", "2017"), date_added = structure(c(18212, 18320, 18212, 17782, 17782, 17417), class = "Date"), duration = c("97 min", "N/A", "103 min", "1 Season", "1 Season", "99 min"), genre = c("Comedy, Drama, Romance", "Animation, Comedy, Family", "Adventure, Comedy, Crime, Family", "Kids' TV", "Kids' TV", "Comedies"), director = c("Gil Junger", "N/A", "Stephen Herek", NA, NA, "Fernando Lebrija"), country = c("USA", "UK, USA, Canada", "USA, UK", "United States", "United States", "United States"), imdb_rating = c("7.3", "6.2", "5.7", NA, NA, NA), platform = structure(c(1L, 1L, 1L, 2L, 2L, 2L), .Label = c("Disney", "Netflix"), class = "factor")), row.names = c(1L, 2L, 3L, 995L, 996L, 997L), class = "data.frame")
I have tried applying it to a subset of the df, as well as using the which() function, but neither worked.
It really all depends on your data and what function you want to apply. But, in principle, you can do this by subsetting your dataframe:
Data:
# Reproducible toy data: 20 rows with a normal draw (v1), a uniform draw (v2)
# and a random permutation of 1..20 (v3).
set.seed(123)
df <- data.frame(v1 = rnorm(20), v2 = runif(20), v3 = sample(20))
Here we apply the function mean to the first ten rows of df:
# MARGIN = 1 makes apply() work row by row; only rows 1 to 10 are passed in.
apply(df[seq_len(10), ], MARGIN = 1, FUN = mean)
1 2 3 4 5 6 7 8 9 10
4.5274415 0.7281229 2.9908109 1.8131179 6.7605775 6.6179570 4.2313168 3.4003004 3.1930399 5.8040552
# Parse and overwrite only rows 1 to 3, which hold full "day month year" dates.
# The remaining rows already contain bare years, which dmy() cannot parse.
# Because release_year is a character column, the extracted years are stored
# back as strings (e.g. "1999").
netflix_and_disney$release_year[1:3] <- year(dmy(netflix_and_disney$release_year[1:3]))

Stacked Bar ordered by Sum of Fill with ggplot2

With the following data (an already melted data frame):
df1<-structure(list(Speciality = structure(27:32, .Label = c("Addiction Medicine",
"Anesthesiology", "Cardiac Electrophysiology", "Cardiology",
"Dermatology", "Emergency Medicine", "Family Medicine", "Gastroenterology",
"General Surgery", "Hematology & Oncology", "Hospitalist", "Internal Medicine",
"Nephrology", "Neurological Surgery", "Neurology", "Obstetrics & Gynecology",
"Otolaryngology", "Pain Medicine", "Pathology", "Pediatric Critical Care Medicine",
"Pediatric Hematology-Oncology", "Pediatric Pulmonology", "Pediatric Radiology",
"Pediatric Surgery", "Pediatrics", "Psychiatry", "Pulmonology",
"Radiation Oncology", "Radiology", "Surgical Oncology", "Urology",
"Vascular Surgery"), class = "factor"), PhysAge = structure(c(5L,
5L, 1L, 3L, 5L, 5L), .Label = c("25-34", "35-44", "45-54", "55-64",
"65+"), class = "factor"), value = c(0.0035, 0.0058, 0.0089, 0, 0.00512820512820513,
0.00512820512820513)), .Names = c("Speciality", "PhysAge", "value"
), row.names = 155:160, class = "data.frame")
How can I reorder the bars of a stacked bar chart in ggplot based on the sum of values for each Speciality? I've found some options where the value is spread over multiple columns, but in this case there is a single value column.
Currently plotting by:
# Asker's current plot: stacked bars of value per Speciality, filled by PhysAge.
# NOTE(review): the sample data above is assigned to df1 while this call reads
# df - presumably the same data; confirm.
ggplot(df,aes(x=Speciality,y=value,fill=PhysAge))+
geom_bar(stat="identity")
You could try
# Extend the sample with two extra PhysAge groups ("1" and "10") carrying
# random values, so each Speciality has several segments to stack.
# AgevsPractice.melt is presumably the melted data frame from the question -
# confirm.
set.seed(1)
df <- rbind(
  AgevsPractice.melt,
  transform(AgevsPractice.melt, PhysAge = "1", value = runif(6, 0, 0.01)),
  transform(AgevsPractice.melt, PhysAge = "10", value = runif(6, 0, 0.01))
)

# reorder() sorts the Speciality levels by the sum of value within each level,
# which fixes the order of the bars along the x axis.
ggplot(
  df,
  aes(x = reorder(Speciality, value, FUN = sum), y = value, fill = PhysAge)
) +
  geom_bar(stat = "identity")

Get percentage in a dataframe with ddply function

I have this dataframe mydf
structure(list(Driver = c("Crop agriculture", "Crop agriculture",
"Infrastructure", "Infrastructure", "Mining", "Mining", "Mixed Agriculture",
"Mixed Agriculture", "Other land use", "Other land use", "Pasture",
"Pasture", "Tree crops", "Tree crops", "Water", "Water"), Period = c("1990-2000",
"1990-2005", "1990-2000", "1990-2005", "1990-2000", "1990-2005",
"1990-2000", "1990-2005", "1990-2000", "1990-2005", "1990-2000",
"1990-2005", "1990-2000", "1990-2005", "1990-2000", "1990-2005"
), Total = c(120328.157829121, 301821.02190182, 12829.2774726025,
10727.4383383233, 1087.58971425679, 639.851573022215, 27213.5917382956,
19832.3424927037, 72326.7471322223, 64524.3243532213, 1064383.44273723,
1347648.2335736, 7814.32273630087, 7672.0730281537, 20332.6943805768,
17504.7712037337), n = c("n = 1669", "n = 783", "n = 298", "n = 151",
"n = 20", "n = 7", "n = 1355", "n = 925", "n = 1623", "n = 851",
"n = 10986", "n = 6039", "n = 316", "n = 211", "n = 466", "n = 244"
)), .Names = c("Driver", "Period", "Total", "n"), class = "data.frame", row.names = c(NA,
-16L))
The idea is to get the percentage of each driver for the period. I have tried the ddply function and get this line code.
# Asker's attempt: the data is split by both Driver and Period, and in the
# sample shown each Driver/Period pair occurs exactly once, so sum(Total)
# equals Total inside every group and the ratio is always 100.
Percentage<- ddply(mydf, c("Driver", "Period"), summarise,
percent= ((Total/sum(Total))*100))
However, I only get 100% values for all the cells. Does someone know what I am doing wrong?
In your call, when you do sum(Total) you are using the total value of the group, which when used with Total/sum(Total) simply produces 1 for this data/grouping. You could calculate the total sum from the entire data set by using df$Total in the sum() call. With ddply this would be
# Divide each group's Total by the grand total of the whole data frame
# (df$Total), not by the within-group sum.
ddply(df, c("Driver", "Period"), summarise, Pct = Total / sum(df$Total) * 100)
And here's the dplyr equivalent
library(dplyr)

# Same calculation with dplyr: group by Driver and Period, then express each
# Total as a percentage of the grand total taken from the full data frame.
df %>%
  group_by(Driver, Period) %>%
  summarise(Pct = Total / sum(df$Total) * 100)

Resources