I am working on this week's Tidy Tuesday data and ran into a problem where geom_area appears to draw the categories on top of each other. If I facet_wrap the data, there are no missing values in any year, but as soon as I make a single area plot and fill it by category, the healthcare/education data seems to disappear.
Below are example plots of what I mean.
library(tidyverse)
# Download the Tidy Tuesday (2021-08-10) chain investment data.
chain_investment <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-08-10/chain_investment.csv')
# Build one total of gross_inv_chain per small_cat, year and category.
plottable_investment <- chain_investment %>%
# NOTE(review): `==` against a length-2 vector recycles c(12, 17) along the
# rows, so each row is compared with only one of the two values; this is not
# a set-membership test.
filter(group_num == c(12,17)) %>%
# Label the two group numbers; any other group_num would get NA.
mutate(small_cat = case_when(
group_num == 12 ~ "Transportation",
group_num == 17 ~ "Education/Health"
)) %>%
group_by(small_cat, year, category) %>%
# The summary column is left unnamed here ...
summarise(sum(gross_inv_chain)) %>%
ungroup %>%
# ... so it is renamed by position: it is the 4th column after the 3 group keys.
rename(gross_inv_chain = 4)
# One panel per category: every panel is filled, so NO education, health, or
# highway data is missing.
# The goal is to combine everything in one plot, filled by category.
ggplot(plottable_investment, mapping = aes(x = year, y = gross_inv_chain)) +
  geom_area() +
  facet_wrap(~ category)

# Filled by category: part of the health data seems to get lost here.
ggplot(
  plottable_investment,
  mapping = aes(x = year, y = gross_inv_chain, fill = category)
) +
  geom_area()

# Education and Health only, drawn unstacked: something still looks wrong.
plottable_investment %>%
  filter(category %in% c("Education", "Health")) %>%
  ggplot(mapping = aes(x = year, y = gross_inv_chain, fill = category)) +
  geom_area(position = "identity")

# Faceting the same two categories shows that the data is definitely there.
plottable_investment %>%
  filter(category %in% c("Education", "Health")) %>%
  ggplot(mapping = aes(x = year, y = gross_inv_chain)) +
  geom_area() +
  facet_wrap(~ category)
The issue is that you filtered your data using == instead of using %in%.
In your case `==` has a subtle side effect: the vector c(12, 17) is recycled along the rows, so each row is compared with only one of the two values. As a result, for some categories (e.g. Health) the filtered data contains observations only for even years, while for others (e.g. Education) it contains observations only for odd years. You therefore end up with "two" area charts which overlap each other.
This could be easily seen by switching to geom_col which gives you a "dodged" bar plot as we have only one category per year.
# Bars instead of areas: each year holds a single category, so the bars of the
# two categories alternate instead of stacking.
plottable_investment %>%
  filter(category %in% c("Education", "Health")) %>%
  ggplot(mapping = aes(x = year, y = gross_inv_chain, fill = category)) +
  geom_col()
Using %in% instead gives the desired stacked area chart with all observations per category:
# Same preparation, but `%in%` tests membership, so every row of both groups
# is kept.
plottable_investment1 <- chain_investment %>%
  filter(group_num %in% c(12, 17)) %>%
  mutate(
    small_cat = case_when(
      group_num == 12 ~ "Transportation",
      group_num == 17 ~ "Education/Health"
    )
  ) %>%
  group_by(small_cat, year, category) %>%
  summarise(gross_inv_chain = sum(gross_inv_chain)) %>%
  ungroup()
#> `summarise()` has grouped output by 'small_cat', 'year'. You can override using the `.groups` argument.

# Stacked area chart of the two categories.
plottable_investment1 %>%
  filter(category %in% c("Education", "Health")) %>%
  ggplot(mapping = aes(x = year, y = gross_inv_chain, fill = category)) +
  geom_area()
I am trying to use ggplot2 to plot a date column vs. a numeric column.
I have a dataframe that I am trying to manipulate with country as either china or not china, and successfully created the dataframe linked below with:
# China only: running total of its confirmed cases.
is_china <- confirmed_cases_worldwide %>%
  filter(country == "China", type == "confirmed") %>%
  group_by(country) %>%
  mutate(cumu_cases = cumsum(cases))

# All other countries: one running total taken over every remaining row,
# after which all of them are relabelled with a single country name.
is_not_china <- confirmed_cases_worldwide %>%
  filter(country != "China", type == "confirmed") %>%
  mutate(cumu_cases = cumsum(cases))
is_not_china[["country"]] <- "Not China"

# Stack the two tables into one.
china_vs_world <- rbind(is_china, is_not_china)
Now essentially I am trying to plot a line graph with cumu_cases and date between "china" and "not china"
I am trying to execute this code:
# One line per country label: cumulative cases against date.
plt_china_vs_world <- ggplot(data = china_vs_world) +
  geom_line(
    mapping = aes(x = date, y = cumu_cases, group = country, color = country)
  ) +
  ylab("Cumulative confirmed cases")
Now I keep getting a graph looking like this:
Don't understand why this is happening, been trying to convert data types and other methods.
Any help is appreciated, I linked both csv below
https://github.com/king-sules/Covid
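The dates for the other countries are repeated because every non-China 'country' has been relabelled 'Not China', so each 'date' now appears once per original country. The cases therefore need to be summed per date, either in the OP's 'is_not_china' step or afterwards on 'china_vs_world':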
library(ggplot2)
library(dplyr)
# Sum the cases per country and date, then accumulate them separately for each
# country so that every line starts from its own first value.
china_vs_world %>%
  group_by(country, date) %>%
  summarise(cumu_cases = sum(cases)) %>%
  # Group by country explicitly: an ungrouped cumsum() would carry China's
  # total over into the "Not China" series.
  group_by(country) %>%
  mutate(cumu_cases = cumsum(cumu_cases)) %>%
  ungroup() %>%
  ggplot() +
  geom_line(aes(x = date, y = cumu_cases, group = country, color = country)) +
  ylab("Cumulative confirmed cases")
-output
NOTE: It is the scale that shows the China numbers to be small.
As @Edward mentioned, a log scale makes the plot easier to read:
# Same per-country cumulative totals, drawn on a (natural) log y axis so the
# much smaller China numbers remain visible.
china_vs_world %>%
  group_by(country, date) %>%
  summarise(cumu_cases = sum(cases)) %>%
  # Group by country explicitly: an ungrouped cumsum() would carry China's
  # total over into the "Not China" series.
  group_by(country) %>%
  mutate(cumu_cases = cumsum(cumu_cases)) %>%
  ungroup() %>%
  ggplot() +
  geom_line(aes(x = date, y = cumu_cases, group = country, color = country)) +
  ylab("Cumulative confirmed cases") +
  scale_y_continuous(trans = "log")
Or with a facet_wrap
# Same per-country cumulative totals, one panel per country with its own
# y axis range.
china_vs_world %>%
  group_by(country, date) %>%
  summarise(cumu_cases = sum(cases)) %>%
  # Group by country explicitly: an ungrouped cumsum() would carry China's
  # total over into the "Not China" series.
  group_by(country) %>%
  mutate(cumu_cases = cumsum(cumu_cases)) %>%
  ungroup() %>%
  ggplot() +
  geom_line(aes(x = date, y = cumu_cases, group = country, color = country)) +
  ylab("Cumulative confirmed cases") +
  facet_wrap(~ country, scales = "free_y")
data
# Read the combined table and turn its date column into Date values.
china_vs_world <- read.csv(
  "https://raw.githubusercontent.com/king-sules/Covid/master/china_vs_world.csv",
  stringsAsFactors = FALSE
)
china_vs_world[["date"]] <- as.Date(china_vs_world[["date"]])
I got coronavirus df and I need to compare Israel and UK data from the time both countries had more than 10 confirmed patients, this is my code :
library(ggplot2)
library(dplyr)
# Data frame
df.raw <- read.csv(url("https://raw.githubusercontent.com/datasets/covid-19/master/data/countries-aggregated.csv"))
# Inspect the raw import. `df` is not assigned yet at this point, so str(df)
# would describe the stats::df function instead of the data.
str(df.raw)
df <- df.raw
df$Date <- as.Date(df$Date)
str(df)

# Keep, per country, only the rows with more than 10 confirmed cases.
df.israel <- df %>% filter(Country == "Israel", Confirmed > 10)
df.uk <- df %>% filter(Country == "United Kingdom", Confirmed > 10)

# Align both series on the later of the two first dates.
if (df.israel$Date[1] > df.uk$Date[1]) {
  df.uk <- df.uk %>% filter(Date >= df.israel$Date[1])
} else {
  df.israel <- df.israel %>% filter(Date >= df.uk$Date[1])
}

# Israel in blue, United Kingdom in red.
ggplot() +
  geom_point(data = df.israel, aes(Date, Confirmed), color = "blue") +
  geom_point(data = df.uk, aes(Date, Confirmed), color = "red")
Now I need my X axis to be numeric (1, 2, 3, etc.) instead of dates, but I don't know how to do that (I tried xlim and scale_x_continuous). Does anyone know how to do this?
My graph
You can use match to get numbers instead of Date. Also it is better to get data in long format instead of creating two separate dataframes.
library(dplyr)
library(ggplot2)
# Keep both countries in one table and number the distinct dates 1, 2, 3, ...
# `Country` is already a single column, so it can be mapped to colour directly
# without reshaping.
df %>%
  filter(Country %in% c("Israel", "United Kingdom"), Confirmed > 10) %>%
  arrange(Date) %>%
  # Position of each date among the sorted distinct dates = day number.
  mutate(day = match(Date, unique(Date))) %>%
  ggplot(aes(day, Confirmed, color = Country)) +
  geom_point() +
  # Named values pin each colour to its country regardless of level order.
  scale_color_manual(values = c("Israel" = "blue", "United Kingdom" = "red"))
I'm animating a map with percentage of deaths in Africa caused by HIV/AIDS. For some years the animation works well, but for other years the countries are sort of jumping around. The data can be found here. My code is shown below
library(sf)
library(rworldmap)
library(transformr)
library(gganimate)
library(tidyverse)
# Load the mortality data ("path_to_file" is a placeholder for the CSV path).
# The assignment must use the ASCII arrow `<-`; a typographic dash is a syntax
# error.
mortality <- read_csv("path_to_file")

# Low-resolution world map converted to sf and restricted to Africa.
africa_map <- getMap(resolution = "low") %>%
  st_as_sf() %>%
  filter(continent == "Africa")

# Join the HIV/AIDS rows for Africa to the map and animate the fill over the
# years.
mortality %>%
  filter(region == "Africa", disease == "HIV/AIDS") %>%
  mutate(year = as.integer(year(year))) %>%
  drop_na() %>%
  left_join(africa_map, by = c("country_code" = "SOV_A3")) %>%
  ggplot() +
  geom_sf(aes(fill = percent)) +
  transition_time(year) +
  labs(title = "Year: {frame_time}")
Any idea how to fix this?
My dataset looks like below,
# Example data: one row per record with its ID, year and acceptance flag.
# The acceptance values are quoted strings; bare no/yes would be looked up as
# (undefined) variables.
dat <- data.frame(
  ID = c(150, 151, 155, 155, 155, 155, 150),
  year = c(1995, 2011, 2012, 2012, 2013, 2012, 2013),
  Acceptance = c("no", "yes", "yes", "yes", "yes", "no", "no")
)
I want to plot a bar chart for ID 155, with the year on the X-axis, counting only the rows where the third variable (Acceptance) is "yes".
I have tried the below code
# Attempt: count the rows per year for ID 155 and draw the counts as bars.
cl_d <- dat %>%
filter(ID==155)%>%
# NOTE(review): `dat` as defined above has a `year` column but no `Date`
# column, so `year(Date)` has nothing to work on; confirm the intended column.
filter(year(Date)>2000)%>%
group_by(ID, year)%>%
# n() counts every remaining row; this pipeline applies no filter on Acceptance.
summarise(count=n())
# Bar height is taken directly from the precomputed `count` column.
ggplot(cl_d, aes(year, count))+
geom_bar(stat='identity')
The bar plot should show the count of Acceptance for "Yes" over the Date greater than 2000 for the particular ID 155
Hey, this code should work. I always try to avoid extra packages where possible; if you have any questions left, just ask!
# Build the example table with its column names given up front.
dat <- data.frame(
  ID = c(150, 151, 155, 155, 155, 155, 150),
  Date = c(1995, 2011, 2012, 2012, 2013, 2012, 2013),
  claim_count1 = c("no", "yes", "yes", "yes", "yes", "no", "no")
)

# Rows for ID 155 after 2000 whose flag is "yes".
NewData <- dat[
  dat$ID == 155 & dat$Date > 2000 & dat$claim_count1 == "yes",
]

# geom_bar() counts the rows per Date value.
ggplot(data = NewData, aes(x = Date)) +
  geom_bar(stat = "count")
This?
# Accepted rows of ID 155 after 2000, counted per year and drawn as columns.
dat %>%
  filter(ID == 155, Acceptance == "yes", year > 2000) %>%
  group_by(year) %>%
  count() %>%
  ggplot(mapping = aes(x = year, y = n)) +
  geom_col()
It appears you want year to be in date format and the graph to also be in the date format. If this is the case see the code below:
dat <- data.frame(
  ID = c(150, 151, 155, 155, 155, 155, 150),
  year = c(1995, 2011, 2012, 2012, 2013, 2012, 2013),
  Acceptance = c("no", "yes", "yes", "yes", "yes", "no", "no")
)

# Represent each year as the Date of its 1 January so a date axis can be used.
dat$year <- as.Date(ISOdate(dat$year, 1, 1))

# Count only the accepted ("yes") rows of ID 155 after 2000; without the
# Acceptance filter the "no" rows would be counted as well.
cl_d <- dat %>%
  filter(ID == 155, Acceptance == "yes", year > as.Date("2000-01-01")) %>%
  group_by(ID, year) %>%
  summarise(count = n()) %>%
  ungroup()

# One bar per year, labelled with the four-digit year.
ggplot(cl_d, aes(year, count)) +
  geom_bar(stat = "identity") +
  scale_x_date(date_labels = "%Y", date_breaks = "1 year")
Is this what you're after?
library(tidyverse);
# Count the accepted ("yes") rows of ID 155 per year and plot them with the
# year as a discrete axis. The requirement is "greater than 2000", so the
# comparison is strict.
dat %>%
  filter(ID == 155, year > 2000, Acceptance == "yes") %>%
  count(ID, year) %>%
  ggplot(aes(as.factor(year), n)) +
  geom_col() +
  labs(x = "Year", y = "Count")
Sample data
# Sample table: seven records with ID, year and acceptance flag.
dat <- data.frame(
  ID = c(150, 151, rep(155, 4), 150),
  year = c(1995, 2011, 2012, 2012, 2013, 2012, 2013),
  Acceptance = c("no", rep("yes", 4), "no", "no")
)