I have a coronavirus data frame and I need to compare the Israel and UK data, starting from the date on which both countries had more than 10 confirmed patients. This is my code:
library(ggplot2)
library(dplyr)
# Data frame: country-level COVID-19 counts, read from the datasets/covid-19
# repository on GitHub.
df.raw <- read.csv(url('https://raw.githubusercontent.com/datasets/covid-19/master/data/countries-aggregated.csv'))
# Inspect the raw import. (Calling str(df) at this point would run before `df`
# is assigned; in a fresh session that name resolves to the function stats::df.)
str(df.raw)
df <- df.raw
# Convert Date so it can be compared and plotted as a date.
df$Date <- as.Date(df$Date)
str(df)

# Per country, keep only the rows with more than 10 confirmed cases.
df.israel <- df %>% filter(Country == 'Israel', Confirmed > 10)
df.uk <- df %>% filter(Country == 'United Kingdom', Confirmed > 10)

# Align both series to the later of the two start dates. min() is used rather
# than the first row so the result does not depend on the file's row order.
start.date <- max(min(df.israel$Date), min(df.uk$Date))
df.israel <- df.israel %>% filter(Date >= start.date)
df.uk <- df.uk %>% filter(Date >= start.date)

# One point layer per country: Israel in blue, United Kingdom in red.
ggplot() +
  geom_point(data = df.israel, aes(Date, Confirmed), color = 'blue') +
  geom_point(data = df.uk, aes(Date, Confirmed), color = 'red')
Now I need my x-axis to be numeric (1, 2, 3, etc.), but I don't know how to do that (I tried xlim and scale_x_continuous). Does anyone know how to do this?
My graph
You can use match to get numbers instead of Date. Also it is better to get data in long format instead of creating two separate dataframes.
library(dplyr)
library(ggplot2)
# Country is already a single column, so both countries can be plotted from
# one data frame without any reshaping.
df %>%
  filter(Country %in% c('Israel', 'United Kingdom'), Confirmed > 10) %>%
  # Keep only the dates on which BOTH countries exceed 10 confirmed cases,
  # so that day 1 is the first such common date.
  group_by(Date) %>%
  filter(n_distinct(Country) == 2) %>%
  ungroup() %>%
  arrange(Date) %>%
  # Day index: position of each date among the sorted distinct dates.
  mutate(day = match(Date, unique(Date))) %>%
  ggplot() + aes(day, Confirmed, color = Country) + geom_point() +
  scale_color_manual(values = c('blue', 'red'))
Related
Going to try this again with a better MRE...for context, here's the product I'm currently trying to improve
What I'm trying to do is get the lines from the endpoints to the labels to be the same color as the data lines.
For purposes of this question we can work with this script
library(ggplot2)
library(babynames)
library(dplyr)
library(ggrepel)
library(ggsci)
# Female records for the four names of interest.
data <- filter(
  babynames,
  name %in% c("Ashley", "Patricia", "Mary", "Minnie"),
  sex == "F"
)

# Per name: row-to-row change in n, its mean, and a two-line label made of
# the name and the rounded mean change.
data <- data %>%
  group_by(name) %>%
  mutate(
    change = n - lag(n),
    meanC = mean(change, na.rm = TRUE)
  ) %>%
  ungroup() %>%
  mutate(label = paste0(name, "\n", round(meanC, 0)))

minYear <- min(data$year)
maxYear <- max(data$year)

# Endpoint layer: the row for each name's latest year.
Endpoints <- data %>%
  select(year, name, n, label) %>%
  group_by(name) %>%
  filter(year == max(year)) %>%
  ungroup()

# Lines per name, a marker on each endpoint, and repelled labels placed in
# the extra space to the right of the last year. Label text is red when the
# label matches "-<digit>" and green otherwise.
namePlot <- ggplot(data, mapping = aes(x = year, y = n)) +
  geom_line(aes(color = name), show.legend = FALSE) +
  coord_cartesian(xlim = c(minYear, maxYear + 10)) +
  scale_color_ucscgb() +
  geom_point(
    data = Endpoints,
    aes(color = name, fill = name),
    size = 1.5, shape = 21, show.legend = FALSE
  ) +
  geom_label_repel(
    data = Endpoints,
    aes(label = label),
    color = ifelse(grepl("\\-\\d", Endpoints$label), "red", "forestgreen"),
    show.legend = FALSE,
    vjust = 0,
    xlim = c(maxYear + 3, maxYear + 10),
    size = 3,
    direction = 'y'
  )
print(namePlot)
which produces this plot
The color of the labels is controlled by color = c("forestgreen","red")[1+grepl("\\-\\d",Endpoints$label)], so that, in this case, data with a positive value in the label is green and data with a negative value is red. What I'd like to do is make the connecting lines from the endpoints to the label boxes the same color as the data lines, which are controlled by geom_line(aes(color=name), show.legend = FALSE).
In the ggrepel docs there is a segment.color parameter that can control the color of the line segment, but it is not an aesthetic. So it appears it has to be "hard-coded" like segment.color="red" which doesn't really help me. I also found this discussion about the issue that seemed to present a solution, but I have been unable to get it to work. Part of the issue there is that it involves scale_color_discrete(aesthetics = c("color", "segment.color")) and I already have scale_color_ucscgb() so I get a warning about replacing scales...
Any guidance would be most appreciated.
Working version based on guidance from @aosmith
library(ggplot2)
library(babynames)
library(dplyr)
library(ggrepel)
library(ggsci)
# Female records for the four names of interest.
data <- filter(
  babynames,
  name %in% c("Ashley", "Patricia", "Mary", "Minnie"),
  sex == "F"
)

# Per name: row-to-row change in n, its mean, and a two-line label made of
# the name and the rounded mean change.
data <- data %>%
  group_by(name) %>%
  mutate(
    change = n - lag(n),
    meanC = mean(change, na.rm = TRUE)
  ) %>%
  ungroup() %>%
  mutate(label = paste0(name, "\n", round(meanC, 0)))

minYear <- min(data$year)
maxYear <- max(data$year)

# Endpoint layer: the row for each name's latest year.
Endpoints <- data %>%
  select(year, name, n, label) %>%
  group_by(name) %>%
  filter(year == max(year)) %>%
  ungroup()

# segment.color is mapped to name inside aes(), and the discrete colour scale
# is registered for both "color" and "segment.color", so the connector
# segments share the scale used by the data lines. The label text colour is
# set outside aes(): red when the label matches "-<digit>", green otherwise.
namePlot <- ggplot(data, mapping = aes(x = year, y = n)) +
  geom_line(aes(color = name), show.legend = FALSE) +
  coord_cartesian(xlim = c(minYear, maxYear + 15)) +
  geom_point(
    data = Endpoints,
    aes(color = name, fill = name),
    size = 1.5, shape = 21, show.legend = FALSE
  ) +
  geom_label_repel(
    data = Endpoints,
    aes(label = label, segment.color = name),
    color = ifelse(grepl("\\-\\d", Endpoints$label), "red", "forestgreen"),
    show.legend = FALSE,
    force = 50,
    vjust = 0,
    xlim = c(maxYear + 5, maxYear + 12),
    size = 3,
    direction = 'y'
  ) +
  scale_color_discrete(aesthetics = c("color", "segment.color"))
print(namePlot)
produces
I would like to sort my ggplot facet_wrap panels by color.
For example, in this demo code, the color corresponds to groups A, B, C. I am looking to have all the red plots next to each other, and same for the blue and green plots.
I tried sorting my data by group but ggplot seems to switch the order when plotting.
library(tidyverse)
set.seed(42)
# Example data: 15 ids, each with 10 random values at dates 1..10 and one
# randomly drawn group label ('a', 'b' or 'c').
id <- 1:15
data <- lapply(id, function(i) rnorm(10))
date <- rep(list(1:10), length(id))
group <- vapply(
  id,
  function(i) sample(c('a', 'b', 'c'), size = 1),
  character(1)
)
df <- unnest(tibble(id, data, date, group), cols = c(data, date))
# One panel per id, coloured by group.
ggplot(
  arrange(df, group),
  mapping = aes(x = date, y = data, color = group)
) +
  geom_line() +
  geom_point() +
  facet_wrap(~ id)
This could help:
library(tidyverse)
set.seed(42)
# Generate example data frame
id <- 1:15
data <- map(id, ~rnorm(10))
date <- map(id, ~1:10)
group <- map_chr(id, ~sample(c('a','b','c'), size=1))
df <- tibble(id=id, data=data, date=date, group=group) %>% unnest(cols = c(data, date))
# facet_wrap() lays panels out in the order of the factor levels of `id`.
# Sort the rows by group (then id) and fix the levels in that order, so all
# panels of one group sit next to each other. Ordering the levels by group
# size instead would interleave groups that have the same number of ids.
df2 <- df %>%
  arrange(group, id) %>%
  mutate(id = fct_inorder(factor(id)))
# Generate plot
df2 %>%
  ggplot(mapping = aes(x = date, y = data, color = group)) +
  geom_line() +
  geom_point() +
  facet_wrap(~ id)
This would be a way (would have to get rid of the double title though):
# Facet on group and id together: panels are laid out group by group, and
# each strip shows both the group and the id.
ggplot(
  arrange(df, group),
  mapping = aes(x = date, y = data, color = group)
) +
  geom_line() +
  geom_point() +
  facet_wrap(~ group + id)
I am trying to use ggplot2 to plot a date column vs. a numeric column.
I have a dataframe that I am trying to manipulate with country as either china or not china, and successfully created the dataframe linked below with:
# Confirmed-case rows for China, with a running total of cases.
is_china <- mutate(
  group_by(
    filter(confirmed_cases_worldwide, country == "China", type == 'confirmed'),
    country
  ),
  cumu_cases = cumsum(cases)
)
# Confirmed-case rows for every other country, with one running total taken
# over all of those rows in their existing order.
# NOTE(review): that total runs across the rows of many countries; if dates
# repeat across countries it is not a per-date cumulative - confirm.
is_not_china <- mutate(
  filter(confirmed_cases_worldwide, country != "China", type == 'confirmed'),
  cumu_cases = cumsum(cases)
)
# Collapse all remaining countries under one label.
is_not_china[["country"]] <- "Not China"
china_vs_world <- rbind(is_china, is_not_china)
Now essentially I am trying to plot a line graph with cumu_cases and date between "china" and "not china"
I am trying to execute this code:
# Line chart of cumulative cases over time, one line and colour per country.
plt_china_vs_world <- ggplot(data = china_vs_world) +
  geom_line(
    mapping = aes(x = date, y = cumu_cases, group = country, color = country)
  ) +
  labs(y = "Cumulative confirmed cases")
Now I keep getting a graph looking like this:
Don't understand why this is happening, been trying to convert data types and other methods.
Any help is appreciated, I linked both csv below
https://github.com/king-sules/Covid
The 'date' values are repeated for the other countries, because their 'country' has now been changed to 'Not China'. This can be fixed either in the OP's 'is_not_china' step or, as below, in 'china_vs_world'.
library(ggplot2)
library(dplyr)
china_vs_world %>%
  # Total cases per country per date (collapses the repeated dates).
  group_by(country, date) %>%
  summarise(cumu_cases = sum(cases)) %>%
  # Running total within each country, in date order. The data must stay
  # grouped by country here: taking cumsum() on ungrouped data would carry
  # the first country's final total into the next country's series.
  group_by(country) %>%
  arrange(country, date) %>%
  mutate(cumu_cases = cumsum(cumu_cases)) %>%
  ungroup() %>%
  ggplot() +
  geom_line(aes(x = date, y = cumu_cases, group = country, color = country)) +
  ylab("Cumulative confirmed cases")
-output
NOTE: It is the scale that shows the China numbers to be small.
As @Edward mentioned, a log scale would make it easier to understand
china_vs_world %>%
  # Total cases per country per date (collapses the repeated dates).
  group_by(country, date) %>%
  summarise(cumu_cases = sum(cases)) %>%
  # Running total within each country, in date order. The data must stay
  # grouped by country here: taking cumsum() on ungrouped data would carry
  # the first country's final total into the next country's series.
  group_by(country) %>%
  arrange(country, date) %>%
  mutate(cumu_cases = cumsum(cumu_cases)) %>%
  ungroup() %>%
  ggplot() +
  geom_line(aes(x = date, y = cumu_cases, group = country, color = country)) +
  ylab("Cumulative confirmed cases") +
  # Log-transformed y axis so both series are readable on one panel.
  scale_y_continuous(trans = 'log')
Or with a facet_wrap
china_vs_world %>%
  # Total cases per country per date (collapses the repeated dates).
  group_by(country, date) %>%
  summarise(cumu_cases = sum(cases)) %>%
  # Running total within each country, in date order. The data must stay
  # grouped by country here: taking cumsum() on ungrouped data would carry
  # the first country's final total into the next country's series.
  group_by(country) %>%
  arrange(country, date) %>%
  mutate(cumu_cases = cumsum(cumu_cases)) %>%
  ungroup() %>%
  ggplot() +
  geom_line(aes(x = date, y = cumu_cases, group = country, color = country)) +
  ylab("Cumulative confirmed cases") +
  # One panel per country, each with its own y range.
  facet_wrap(~ country, scales = 'free_y')
data
# Read the example CSV (strings kept as character) and convert its `date`
# column to Date.
china_vs_world <- transform(
  read.csv(
    "https://raw.githubusercontent.com/king-sules/Covid/master/china_vs_world.csv",
    stringsAsFactors = FALSE
  ),
  date = as.Date(date)
)
My dataset looks like below,
# Example data: one row per record, with an ID, a year and a yes/no
# Acceptance flag. The flag values are quoted strings; bare no / yes would be
# looked up as (undefined) variables and raise "object not found".
dat <- data.frame(
  ID = c(150, 151, 155, 155, 155, 155, 150),
  year = c(1995, 2011, 2012, 2012, 2013, 2012, 2013),
  Acceptance = c("no", "yes", "yes", "yes", "yes", "no", "no")
)
I want to plot a bar chart for ID 155, with the year on the x-axis, counting only the rows where the third variable (Acceptance) is "yes".
I have tried the below code
# Count the accepted ("yes") records of ID 155 per year, for years after 2000.
# `year` is already a numeric column of dat, so it is compared directly;
# there is no `Date` column to extract a year from.
cl_d <- dat %>%
  filter(ID == 155, Acceptance == "yes", year > 2000) %>%
  group_by(ID, year) %>%
  summarise(count = n()) %>%
  ungroup()
# One bar per year, bar height = number of accepted records.
ggplot(cl_d, aes(year, count)) +
  geom_bar(stat = 'identity')
The bar plot should show the count of Acceptance for "Yes" over the Date greater than 2000 for the particular ID 155
Hey, this code should work. I always try to avoid plugins. If you have any questions left, just ask!
# Example data with the column names set directly in the constructor.
dat <- data.frame(
  ID = c(150, 151, 155, 155, 155, 155, 150),
  Date = c(1995, 2011, 2012, 2012, 2013, 2012, 2013),
  claim_count1 = c("no", "yes", "yes", "yes", "yes", "no", "no")
)
# Rows of ID 155 after 2000 whose flag is "yes".
NewData <- subset(dat, ID == 155 & Date > 2000 & claim_count1 == "yes")
# Bar height is the number of rows per Date value.
ggplot(NewData, aes(x = Date)) +
  geom_bar(stat = "count")
This?
# Accepted records of ID 155 after 2000, counted per year and drawn as bars.
dat %>%
  filter(ID == 155, Acceptance == "yes", year > 2000) %>%
  count(year) %>%
  ggplot(aes(x = year, y = n)) +
  geom_bar(stat = "identity")
It appears you want year to be in date format and the graph to also be in the date format. If this is the case see the code below:
dat <- data.frame(ID = c(150, 151, 155, 155, 155, 155, 150),
                  year = c(1995, 2011, 2012, 2012, 2013, 2012, 2013),
                  Acceptance = c("no", "yes", "yes", "yes", "yes", "no", "no"))
# Represent each year as the Date of its 1 January so a date axis can be used.
dat$year <- as.Date(ISOdate(dat$year, 1, 1))
# Count only the accepted ("yes") records of ID 155 after 2000; without the
# Acceptance filter the "no" rows would be counted as well.
cl_d <- dat %>%
  filter(ID == 155, Acceptance == "yes", year > as.Date("2000-01-01")) %>%
  group_by(ID, year) %>%
  summarise(count = n()) %>%
  ungroup()
# One bar per year, with yearly breaks labelled by four-digit year.
ggplot(cl_d, aes(year, count)) +
  geom_bar(stat = 'identity') +
  scale_x_date(date_labels = "%Y", date_breaks = "1 year")
Is this what you're after?
library(tidyverse);
# Accepted records of ID 155 per year, for years strictly after 2000 (the
# stated requirement is "greater than 2000", so 2000 itself is excluded).
dat %>%
  filter(ID == 155, year > 2000, Acceptance == "yes") %>%
  count(ID, year) %>%
  # Treat year as a category so each year gets exactly one labelled bar.
  ggplot(aes(as.factor(year), n)) +
  geom_bar(stat = "identity") +
  labs(x = "Year", y = "Count")
Sample data
# Sample data: seven records with an ID, a year and a yes/no Acceptance flag.
dat <- data.frame(
  ID = c(150, 151, rep(155, 4), 150),
  year = c(1995, 2011, 2012, 2012, 2013, 2012, 2013),
  Acceptance = rep(c("no", "yes", "no"), times = c(1, 4, 2))
)
I have an object with several values including cities, states, year and number of murders. I use dplyr to group it by city and calculate the total murders over all years for the top 10 cities like this:
# Attempt: total murders per city, keep the 10 largest totals, then plot.
# NOTE(review): after summarise() only `city` and `total` remain, so the
# Year and Murders columns referenced in aes() are no longer in the data.
MurderNb_reshaped2 %>%
  select(city, state, Year, Murders) %>%
  group_by(city) %>%
  summarise(total = sum(Murders)) %>%
  top_n(n = 10, wt = total) %>%
  ggplot(mapping = aes(x = Year, y = Murders, fill = "red")) +
  geom_histogram(stat = "identity") +
  facet_wrap(~city)
I would like to plot this for only the top ten cities, but 'x = year' is not found because it has been grouped by city. Can anyone explain how I can accomplish this?
EDIT: this the original source data https://interactive.guim.co.uk/2017/feb/09/gva-data/UCR-1985-2015.csv
And here is my code:
# Read the source CSV, keeping strings as character (FALSE spelled out: F is
# an ordinary variable that can be reassigned).
Deaths <- read.csv("UCR-1985-2015.csv", stringsAsFactors = FALSE)
# Two views of the table, each dropping one block of 31 columns.
# NOTE(review): presumably 5:35 are the raw counts and 36:66 the rates -
# confirm against the CSV header.
MurderRate <- Deaths[, -c(5:35)]
MurderNb <- Deaths[, -c(36:66)]
# Strip the leading "X" and the "_raw_murder_num" suffix from the column
# names. Both patterns are anchored so that an "X" elsewhere in a name is
# left untouched.
colnames(MurderNb) <- sub("^X", "", colnames(MurderNb))
colnames(MurderNb) <- sub("_raw_murder_num$", "", colnames(MurderNb))
# Reshape to long format: one row per city per year. The id.vars argument is
# spelled out in full instead of relying on partial matching of `id`.
# NOTE(review): melt() is assumed to come from reshape2 - confirm it is loaded.
MurderNb_reshaped <- melt(MurderNb, id.vars = c("city", "Agency", "state", "state_short"))
colnames(MurderNb_reshaped) <- c("city", "Agency", "state", "state_short", "Year", "Murders")
MurderNb_reshaped2 <- MurderNb_reshaped
# Attempt: total murders per city, keep the 10 largest totals, then plot.
# NOTE(review): after summarise() only `city` and `total` remain, so the
# Year and Murders columns referenced in aes() are no longer in the data.
MurderNb_reshaped2 %>%
  select(city, state, Year, Murders) %>%
  group_by(city) %>%
  summarise(total = sum(Murders)) %>%
  top_n(n = 10, wt = total) %>%
  ggplot(mapping = aes(x = Year, y = Murders, fill = "red")) +
  geom_bar(stat = "identity") +
  facet_wrap(~city)
Ok there were a couple minor issue. This should do the trick:
# The 10 cities with the largest total number of murders over all years.
topCities <- MurderNb_reshaped2 %>%
  select(city, state, Year, Murders) %>%
  group_by(city) %>%
  summarise(total = sum(Murders)) %>%
  top_n(10, total)
# Restrict the original long-format data to those cities. The per-year rows
# are kept, so Year and Murders are still available for plotting.
MurderNb_reshaped2 <- filter(MurderNb_reshaped2, city %in% topCities$city)
# One panel per city. The fill colour is set as a constant outside aes();
# inside aes() the string "red" would be treated as a data value, giving the
# default palette colour and a legend instead of red bars.
ggplot(data = MurderNb_reshaped2, aes(x = Year, y = Murders)) +
  geom_bar(stat = "identity", fill = "red") +
  facet_wrap(~city)