I have a data set as follows:
data = structure(list(year = c(2021L, 2021L, 2021L, 2021L, 2021L, 2021L,
2021L, 2021L, 2021L, 2021L, 2021L, 2021L, 2021L, 2021L, 2021L,
2021L, 2021L, 2021L, 2021L, 2021L, 2021L, 2021L, 2021L, 2021L
), month = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L,
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L), Quarter = c(1L,
1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 1L, 1L, 1L, 2L, 2L,
2L, 3L, 3L, 3L, 4L, 4L, 4L), project = c("A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "B",
"B", "B", "B", "B", "B", "B", "B"), value = c(102.349429992044,
106.58161342807, 100.435891304271, 98.444807600341, 82.3101711995535,
59.6035963287678, 69.6231234694286, 90.5898095230998, 80.6258589573775,
115.639565579428, 104.73836165791, 107.508003106277, 90.4082358328098,
112.579438593004, 106.624680336326, 93.9307819392979, 75.4136657889693,
52.3110190297094, 70.3105808070076, 87.3448099614908, 68.2935766548446,
124.204436344695, 111.619576683155, 109.225885313817), Country = c("Denmark",
"Denmark", "Denmark", "Denmark", "Denmark", "Denmark", "Denmark",
"Denmark", "Denmark", "Denmark", "Denmark", "Denmark", "Germany",
"Germany", "Germany", "Germany", "Germany", "Germany", "Germany",
"Germany", "Germany", "Germany", "Germany", "Germany"), LongTermWI = c(121.960664674364,
104.723767102727, 109.956110038786, 94.7909742884892, 89.0611848528951,
83.0143004308842, 78.5554847511495, 82.1932844238529, 94.8317262446894,
109.741770216839, 109.224438221904, 121.94629475342, 124.912696115337,
106.137678558707, 111.196799677912, 90.7373556419141, 88.5814900982324,
78.4127049610748, 74.8773631279842, 81.5579488440033, 93.2896819041917,
114.322908768119, 114.660984633633, 121.312387668891), MinRef = c(89.0152351848971,
47.1805056248264, 72.920410008137, 66.0807724144165, 54.5679150901317,
53.7844552456038, 42.6401185444772, 52.546635367643, 69.2248217126283,
76.4144846076876, 89.4209199082177, 80.3882525480035, 90.4082358328098,
64.6192521242945, 85.1337944481354, 69.4221826905899, 50.3506836843003,
52.3110190297094, 40.4296442260575, 47.5775452531874, 68.2935766548446,
71.9901338300631, 93.2483160688902, 85.5467987151896), MaxRef = c(163.771100449271,
141.388975655703, 137.780711496641, 118.055928781909, 113.961805078013,
114.604519185711, 104.83540276271, 101.855462747317, 119.07394843672,
137.773221892607, 140.864382733085, 156.516066856324, 158.822912815973,
134.265032081886, 134.231205540578, 108.891671902872, 118.091190791042,
100.740245891658, 95.6179422824695, 101.998782325545, 132.191355352224,
137.281168224106, 153.155278763207, 152.772666775097), Delta = c(-19.61123468232,
1.85784632534323, -9.52021873451493, 3.6538333118518, -6.75101365334163,
-23.4107041021164, -8.93236128172087, 8.39652509924694, -14.2058672873119,
5.8977953625887, -4.48607656399371, -14.4382916471429, -34.5044602825272,
6.44176003429672, -4.5721193415857, 3.19342629738384, -13.1678243092631,
-26.1016859313654, -4.56678232097656, 5.78686111748746, -24.9961052493471,
9.88152757657565, -3.04140795047796, -12.0865023550742)), row.names = c(NA,
-24L), class = c("tbl_df", "tbl", "data.frame"))
I can plot the project data in a grouped bar chart in Plotly and add the LongTermWI as line.
I don't know how to plot the lines in a different color than the bars.
# Bar chart of `value` per month, coloured by project, with LongTermWI added
# as a line trace on the same axes.
# NOTE(review): `color = ~project` is declared in plot_ly() and is not
# overridden in add_trace(), so the line trace presumably picks up the same
# colour mapping as the bars -- consistent with the problem described here.
fig <- plot_ly(data , x = ~month, y = ~value, type = 'bar', color =~project)%>%
add_trace(x = ~month, y = ~LongTermWI, type = 'scatter', mode = 'lines')
# Display the figure.
fig
The `colors` argument did not help. Also, for the legend, I would like each entry to show project + country.
There are many different ways to accomplish your goal. It really depends on what the ultimate goal is. I'm going to show you two different ways that this could work.
In both of these approaches, I essentially make it so that each trace is independent.
In this method, I get to pick the exact color. If there were more than a few possible colors, I would use vectorization or a loop (i.e., lapply, for, etc.)
# Option 1: build the figure one trace at a time so that each LongTermWI line
# gets its own, explicitly chosen colour.
fig <- plot_ly()

# Bars: one colour per project, chosen by plotly.
fig <- add_bars(fig, data = data, x = ~month, y = ~value, color = ~project)

# Lines: one trace per project; I() passes the colour through as-is.
fig <- add_lines(
  fig,
  data = data[data[["project"]] == "A", ],
  x = ~month, y = ~LongTermWI, color = I("red")
)
fig <- add_lines(
  fig,
  data = data[data[["project"]] == "B", ],
  x = ~month, y = ~LongTermWI, color = I("black")
)
In this next option, I let Plotly choose the colors. Instead of designating a color variable for the lines (where Plotly could notice that the color groups have the same names as the bars' groups and reuse their colors), I use split.
# Option 2: let plotly pick the colours. The bars are coloured by project;
# the lines are separated into one trace per project with `split`, which does
# not share the bars' colour mapping.
add_lines(
  add_bars(plot_ly(), data = data, x = ~month, y = ~value, color = ~project),
  data = data, x = ~month, y = ~LongTermWI, split = ~project
)
Related
Dataframe "id" has the columns year, id, and matriline, where each row is an incident. I wanted to count the number of incidents by matriline per year, so I did:
# Count the rows of `id` (one row per incident) for every year/matriline
# combination. `.drop = FALSE` is passed to group_by() so that factor levels
# with no rows are kept as groups (they show up with n = 0 in the output).
events.bymatr =
id %>%
group_by(year, matr, .drop = FALSE) %>%
dplyr::summarise(n = n()) %>%
ungroup()
# Print the summary table.
events.bymatr
I plotted a line graph of the number of incidents over time, by matriline.
# One line per matriline: n (incident count) against year, coloured by matr.
ggplot(events.bymatr, aes(x=year, y=n, group=matr)) + geom_line(aes(color=matr))
My question is twofold:
Is there a way I could recreate this line graph where the thickness of the lines is determined by how many IDs there were, per matriline? I imagine this would involve reshaping my data above but when I tried to group_by(year,matr,id,.drop=FALSE) my data came out all wonky.
I want to change the color palette so that each color is very distinct - how do I attach a new color palette? I tried using this c25 palette with the code below, but it makes all my lines disappear.
# Attempt that makes the lines disappear: the palette object `c25` is placed
# inside aes(), i.e. it is mapped as if it were a data variable.
# NOTE(review): a palette is normally supplied through a colour scale rather
# than through aes() -- confirm against the accepted answer.
ggplot(events.bymatr, aes(x=year, y=n, group=matr)) + geom_line(aes(color=c25))
Thanks so much in advance!
Output of "id" (shortened to just the first five rows per column):
> dput(id)
structure(list(date = structure(c(8243, 8243, 8243, 8248, 8947,
class = "Date"), year = c(1992L, 1992L, 1992L, 1992L, 1994L),
event.id = c(8L, 8L, 8L, 10L, 11L), id = structure(c(51L, 55L, 59L,
46L, 51L), .Label = c("J11", "J16", "J17", "J2", "J22"),
class = "factor"), sex = structure(c(1L, 2L, 2L, 1L, 1L),
.Label = c("0", "1"), class = "factor"), age = c(28L, 12L, 6L, 42L,
30L), matr = structure(c(20L, 20L, 20L, 11L, 20L), .Label = c("J2",
"J4", "J7", "J9", "K11"), class = "factor"),
matralive = structure(c(2L, 2L, 2L, 2L, 2L),
.Label = c("0", "1"), class = "factor"), pod = structure(c(3L, 3L,
3L, 3L, 3L), .Label = c("J", "K", "L"), class = "factor")),
row.names = c(NA, -134L), class = c("tbl_df", "tbl", "data.frame"))
Output of events.bymatr:
> dput(events.bymatr)
structure(list(year = c(1992L, 1992L, 1992L, 1992L, 1992L),
matr = structure(c(1L, 2L, 3L, 4L, 5L), .Label = c("J2", "J4",
"J7", "J9", "K11"), class = "factor"), n = c(0L, 0L, 0L, 0L, 0L)),
row.names = c(NA, -380L), class = c("tbl_df", "tbl",
"data.frame"))
As @r2evans noted, it is surprisingly hard to distinguish clearly among more than a handful of colors. I used an example 20-color scale here that does a pretty good job, but even so a few can be tricky to distinguish. Here's an attempt using the storms dataset included with dplyr, in which line thickness shows how many records each name has overall.
library(dplyr)
library(ggplot2) # provides ggplot(), aes(), geom_line() and guides() used below

# Line thickness encodes how many records each storm name has overall;
# colour encodes the name itself.
storms %>%
  group_by(name, year) %>%
  summarize(n = n(), .groups = "drop") %>% # = number of records per name per year
  # Add explicit zero-count rows for name/year combinations without records,
  # so every line spans the whole year range.
  tidyr::complete(name, year = 1975:2015, fill = list(n = 0)) %>%
  group_by(name) %>%
  mutate(total = sum(n)) %>% # = number of records per name overall
  ungroup() %>%
  filter(total %% 12 == 0) %>% # Arbitrary, to reduce scope of data for example
  ggplot(aes(year, n, color = name, size = total, group = name)) +
  geom_line() +
  # Draw the colour legend keys at one fixed thickness so they stay readable.
  guides(color = guide_legend(override.aes = list(size = 3))) +
  ggthemes::scale_color_tableau(palette = "Tableau 20")
I'm plotting a monthly time series of abundance using a bar chart, and there are some months where I don't have data to show. Is it possible to add blank spaces to the categorical x-axis for those months?
This is my data:
library(dplyr)
library(ggplot2)
structure(list(spp = structure(c(9L, 10L, 1L, 2L, 3L), .Label = c("sp10", "sp15", "sp16", "sp20", "sp21", "sp22", "sp23", "sp24", "sp8", "sp9"), class = "factor"), abundance = c(0, 0, 0, 0, 11.19404656), estation = c(5L, 5L, 5L,5L, 5L), year = c(1995L, 1995L, 1995L, 1995L, 1995L), month = structure(c(2L, 2L, 2L, 2L, 2L), .Label = c("abr", "dic", "ene", "feb"), class = "factor"), date = structure(c(2L, 2L, 2L, 2L, 2L), .Label = c("abr-96", "dic-95", "ene-96", "feb-96"), class = "factor")), row.names = c(NA, 5L), class = "data.frame")
# Total abundance per date and species.
# NOTE(review): `subset` is presumably the data frame printed above; its
# assignment is not shown in this snippet.
subset_group <- subset %>% group_by(date,spp) %>% summarize(sum_pl = sum(abundance))
# Stacked bars of summed abundance per date, filled by species. The x-axis
# labels are recoded to single letters; note that the "M" entry in `labels`
# has no name, so it is not tied to any level of `date`.
ggplot(subset_group, aes(x = date, y = sum_pl, fill = spp)) + geom_bar(stat = "identity") + scale_x_discrete(labels = c("dic-95" = "D", "ene-96" = "E", "feb-96" = "F", "M", "abr-96" = "A"))
I used scale_x_discrete(), but with no positive results.
I am trying to produce some charts of the dummy data at the bottom of this message and have a few questions.
Would it be recommended to generate a new dataframe with summary stats so that the Year column becomes unique and the second column provides the total count or can I work with the data as is?
Related to this, if I do want to create a new dataframe, what is the best way to make it so that it has: Year, TotalCount, Counts per Term, Counts per Society?
My dummyyearcount dataframe has been created using:
# Count the rows of dummydata for each value of Year.
# NOTE(review): passing the column name as a quoted string suggests this is
# plyr::count() rather than dplyr::count() -- confirm which package is loaded.
dummyyearcount <- count(dummydata, 'Year')
Is there a way to do multiple counts within the one line of code? If so, how?
Regarding the plots, I am looking to draw a cumulative line plot; however, when running the code below, it asks for a y-axis value. Is there a way to make it count the number of publications within each year and then split that out by society or term, rather than me having to output a summary table and feed in the Total Count as the y-axis?
The code below is what I have for the line plot, which complains with:
"Error: geom_line requires the following missing aesthetics: y"
Also, how can I make this cumulative so in years of no publications it will just flat line?
# Attempted line plot of publications per year, one line per Term.
# Only x (Year), group and colour (Term) are mapped in aes(); no y aesthetic
# is supplied, which is what the geom_line error quoted above complains about.
ggplot() + aes(dummydata$Year, group=dummydata$Term, color=dummydata$Term) + geom_line(show.legend = TRUE) +
# Theme tweaks: black ticks and border, white panel, light grey major grid.
theme(axis.ticks=element_line(colour = 'black'), panel.background = element_rect('white'),
panel.grid.major = element_line(colour = 'gray85'), panel.border = element_rect(colour = 'black', fill = FALSE)) +
# No padding on either axis; the y-axis is fixed to the range 0-5.
scale_y_continuous(expand = c(0,0), limits = c(0,5)) + scale_x_continuous(expand = c(0,0))
Output from dput():
structure(list(Year = c(2017L, 2011L, 2012L, 2010L, 2011L, 2015L,
2011L, 2011L, 2012L, 1994L, 2005L, 2009L, 1976L, 2007L, 2014L,
2013L, 2007L), Title = structure(1:17, .Label = c("Title of paper A",
"Title of paper B", "Title of paper C", "Title of paper D", "Title of paper E",
"Title of paper F", "Title of paper G", "Title of paper H", "Title of paper I",
"Title of paper J", "Title of paper K", "Title of paper L", "Title of paper M",
"Title of paper N", "Title of paper O", "Title of paper P", "Title of paper Q"
), class = "factor"), Authors = structure(c(1L, 1L, 2L, 1L, 3L,
4L, 7L, 1L, 8L, 5L, 4L, 6L, 10L, 10L, 9L, 4L, 2L), .Label = c("Bloggs",
"Jones", "Jones and Bloggs", "Smith", "Smith and Jones", "Smith, Jones and Wilson",
"White", "White and Bloggs", "Wilson", "Wilson and Jones"), class = "factor"),
Society = structure(c(4L, 4L, 1L, 1L, 4L, 4L, 2L, 3L, 4L,
1L, 1L, 4L, 4L, 2L, 4L, 4L, 4L), .Label = c("ABC", "MNO",
"N", "XYZ"), class = "factor"), Term = structure(c(1L, 1L,
1L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 2L
), .Label = c("A", "B"), class = "factor")), .Names = c("Year",
"Title", "Authors", "Society", "Term"), class = "data.frame", row.names = c(NA,
-17L))
An example plot of the look I am eventually wanting to achieve:
I am still very new to R so any help would be appreciated.
I like doing it like this using data.table package because it is quite tractable to me (but this is not the only way):
# library() rather than require(): stop right away if a package is missing.
library(data.table)
library(ggplot2)

# Turn the data.frame into a data.table (by reference), keyed -- and therefore
# sorted -- by Term, then Year.
setDT(dummydata, key = c("Term", "Year"))

# Number of records in each Year/Term group (stored on every row of the group).
dummydata[, N := .N, by = .(Year, Term)]

# Keep one row per Term/Year so each group's count enters the running total
# exactly once, then accumulate within each Term. Computing cumsum(N) inside
# aes() would instead run over every row of the table, mixing the Terms and
# counting each group once per record.
yearly <- unique(dummydata[, .(Term, Year, N)])
yearly[, CumN := cumsum(N), by = Term]

# Plot: a step line stays flat across years without publications.
ggplot(yearly, aes(x = Year, y = CumN, colour = Term)) +
  geom_step()
Using count function from plyr package to Count the number of occurrences.
# Packages: plyr supplies count(), ggplot2 the plotting functions.
library(plyr)
library(ggplot2)

# Simulated publication records: 200 rows drawn at random (with replacement)
# for each column.
df <- data.frame(
  Year = sample(1984:2014, 200, replace = TRUE),
  Title = sample(
    c("Paper A", "Paper B", "Paper C", "Paper D", "Paper E", "Paper F", "Paper G"),
    200,
    replace = TRUE
  ),
  Authors = sample(
    c("Stuart", "Jerry", "Kevin", "Phil", "Gru", "Nefario", "Phil", "Josh"),
    200,
    replace = TRUE
  ),
  Society = sample(
    c("lab1", "lab2", "lab3", "lab4", "lab5"),
    200,
    replace = TRUE
  ),
  Term = sample(c("1st", "2nd", "3rd", "4th"), 200, replace = TRUE)
)

# Number of records for every Society/Year combination (column `freq`).
df.1 <- count(df, vars = c("Society", "Year"))

# One line (with points) per Society; an axis break at every observed Year.
p <- ggplot(
  df.1,
  aes(x = Year, y = freq, color = Society, group = Society)
) +
  geom_line() +
  geom_point() +
  scale_x_continuous(breaks = df.1[["Year"]])
p
Output Plot :
Additionally, if you want to add Term factor also in graph :
# Same counting as before, now broken down by Term as well.
df.2 <- count(df, vars = c("Society", "Year", "Term"))

# Lines are still grouped and coloured by Society; Term is shown through the
# shape and the size of the points.
p2 <- ggplot(
  df.2,
  aes(x = Year, y = freq, color = Society, group = Society, shape = Term)
) +
  geom_line() +
  geom_point(aes(size = Term)) +
  scale_x_continuous(breaks = df.2[["Year"]])
p2
I am trying to show country data on a map by drawing a point for each country. Here is the data frame:
> dput(countries)
structure(list(country = structure(c(5L, 6L, 3L, 4L, 10L, 8L,
11L, 7L, 1L, 13L, 9L, 12L, 2L), .Label = c("Australia", "China",
"France", "Georgia", "India", "Ireland", "Malaysia", "Poland",
"Qatar", "Singapore", "South Africa", "Spain", "USA"), class = "factor"),
Latitude = c(20.593684, 53.142367, 46.227638, 32.165622,
1.352083, 51.919438, -30.559482, 4.210484, -25.274398, 37.09024,
25.354826, 40.463667, 35.86166), Longitude = c(78.96288,
-7.692054, 2.213749, -82.900075, 103.819836, 19.145136, 22.937506,
101.975766, 133.775136, -95.712891, 51.183884, -3.74922,
104.195397), Value = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 1L, 2L, 2L)), .Names = c("country", "Latitude", "Longitude",
"Value"), class = "data.frame", row.names = c(NA, -13L))
The code from here:
library(maps)
library(ggplot2)
# World outline data returned by map_data().
# NOTE(review): map_data() presumably returns a plain data frame of polygon
# coordinates, not a ggplot object -- confirm in the ggplot2 documentation.
base_world <- map_data("world")
# Attempt to overlay one point per country, coloured by Value. The geom_point
# layer is added with `+` directly to `base_world`; no ggplot() call is
# involved. This is the statement that produces the error quoted below.
map_data_coloured <-
base_world +
geom_point(data=countries,
aes(x=Longitude, y=Latitude, colour=Value), size=5, alpha=I(0.7))
But I receive this error:
Error in as.vector(x, mode) :
cannot coerce type 'environment' to vector of type 'any'
You cannot add a layer to `base_world` directly: start from `ggplot()` and draw `base_world` with a `geom_polygon()` layer, then add the points on top:
# Draw the world polygons first, then put one point per country on top,
# coloured by Value.
ggplot() +
  geom_polygon(
    mapping = aes(x = long, y = lat, group = group),
    data = base_world
  ) +
  geom_point(
    mapping = aes(x = Longitude, y = Latitude, colour = Value),
    data = countries,
    size = 5,
    alpha = I(0.7)
  )
In the code below, `inA` in the third line from the bottom of the function is not recognized/evaluated properly. I get:
Error in parse(text = inA) : object 'inA' not found
inA is recognized just fine in the two other lines above that where it is used. I have tried a LOT of permutations.
# Summarise cases and population by two grouping columns plus BestYr, then
# plot the rate per 100,000 over BestYr.
#   inraw: data frame containing Cases, Pop, BestYr and the grouping columns
#   inA:   name (string) of the column used for line colour
#   inB:   name (string) of the column used for facets
# The plot is printed as a side effect.
gcplot2 <- function (inraw,inA,inB){
# Group by the columns named in inA and inB (turned into expressions with
# eval(parse())) and by BestYr; sum cases and population, then derive the rate.
inwork <- ddply(inraw,
.( eval(parse(text=inA)), eval(parse(text=inB)),BestYr),
summarize,
cases=sum(Cases,na.rm=TRUE),
pop=sum(Pop,na.rm=TRUE),
rate=round(100000*cases/pop,2))
# Restore readable names for the first two grouping columns.
names(inwork)[1] <- inA
names(inwork)[2] <- inB
#problem "inA" is here
# NOTE(review): the expression inside aes() is presumably evaluated later,
# when the plot is built, where the function argument inA is no longer
# visible -- which would explain "object 'inA' not found".
x <- ggplot(inwork,aes(BestYr,rate, color=eval(parse(text=inA))))
# Thick lines, one facet per value of the column named in inB.
x <- x + geom_line(size=1.5) + facet_wrap(as.formula(paste0("~ ",inB)))
print(x)
}
gcplot2(inraw=gc.full,"NewRace","Region")
Here is a small sample of the data frame that I hope can be used for a "reproducible example".
dput(temp2)
structure(list(LHJ = c("SACRAMENTO", "YOLO", "SAN BENITO", "COLUSA",
"STANISLAUS", "SAN DIEGO", "SHASTA", "TULARE", "MONTEREY", "KERN"
), BestYr = c(2010L, 2010L, 2010L, 2012L, 2012L, 2012L, 2011L,
2011L, 2010L, 2010L), Sex = structure(c(2L, 2L, 2L, 2L, 2L, 1L,
2L, 2L, 1L, 1L), .Label = c("F", "M"), class = "factor"), RaceEth = structure(c(3L,
4L, 6L, 2L, 6L, 4L, 4L, 2L, 4L, 4L), .Label = c("A", "B", "H",
"O", "U", "W"), class = "factor"), AgeGrp = structure(c(1L, 4L,
5L, 2L, 7L, 2L, 4L, 2L, 3L, 1L), .Label = c("0-9", "10-14", "15-19",
"20-24", "25-29", "30-34", "35-44", "45+", "Unk"), class = "factor"),
Cases = c(NA, 0, 0, 0, 15.652173913, NA, 0, 0, 0, 0), Pop = c(32752.30608,
538.17138648, 444.83561193, 11.107216039, 14186.950585, 5486.3069863,
338.26814356, 245.3890448, 535.23711331, 2278.6798429), NewRace = c("Hispanic",
"Other", "White", "Black", "White", "Other", "Other", "Black",
"Other", "Other"), Region = structure(c(6L, 6L, 3L, 5L, 3L,
8L, 5L, 3L, 2L, 3L), .Label = c("Bay Area", "Central Coast",
"Central Inland", "Los Angeles", "Northern", "Sacramento Area",
"San Francisco", "Southern"), class = "factor")), .Names = c("LHJ",
"BestYr", "Sex", "RaceEth", "AgeGrp", "Cases", "Pop", "NewRace",
"Region"), row.names = c(41377L, 67523L, 42571L, 7418L, 59857L,
45051L, 54102L, 64260L, 32612L, 17538L), class = "data.frame")
I would do this as follows, with aes_string (and specifying the ddply variables via a character vector rather than .()):
#' Plot yearly rates per 100,000, coloured by one column and faceted by another
#'
#' Sums `Cases` and `Pop` for every combination of the two grouping columns
#' and `BestYr`, derives the rate per 100,000 and draws it as lines.
#'
#' @param inraw Data frame with columns `BestYr`, `Cases`, `Pop` and the two
#'   columns named by `inA` and `inB`.
#' @param inA Single string: name of the column mapped to line colour.
#' @param inB Single string: name of the column used for the facets.
#' @return The ggplot object (it is displayed when printed, e.g. at top level).
gcplot2 <- function (inraw,inA,inB){
  # library() instead of require(): a missing package stops here with a clear
  # error instead of returning FALSE and failing later on.
  library("plyr")
  library("ggplot2")

  # Validate the inputs up front so mistakes surface with a readable message.
  stopifnot(
    is.data.frame(inraw),
    is.character(inA), length(inA) == 1L,
    is.character(inB), length(inB) == 1L
  )
  missing_cols <- setdiff(c(inA, inB, "BestYr", "Cases", "Pop"), names(inraw))
  if (length(missing_cols) > 0L) {
    stop(
      "`inraw` is missing column(s): ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  # Grouping columns are given as a character vector, so no eval(parse()) is
  # needed and the result keeps the original column names.
  inwork <- ddply(
    inraw,
    c(inA, inB, "BestYr"),
    summarize,
    cases = sum(Cases, na.rm = TRUE),
    pop = sum(Pop, na.rm = TRUE),
    rate = round(100000 * cases / pop, 2)
  )

  # aes_string() takes the colour column by name; the facet formula is built
  # from the name of the facet column.
  ggplot(inwork, aes(BestYr, rate)) +
    geom_line(aes_string(color = inA), size = 1.5) +
    facet_wrap(as.formula(paste0("~ ", inB)))
}
gcplot2(inraw=gc.full,"NewRace","Region")
I get warnings, but I think that's due to using a tiny subset of the data ...