how to plot lines matching data using ggplot2 - r

plot increase_rate contains abs(increase_rate) > 0.05.
but under the code, discard the data between -0.05 to 0.05.
I also plot data including from -0.05 to 0.05 range.
library(tidyverse)
data(population, package="tidyr")
population %>%
arrange(country, year) %>%
group_by(country) %>%
mutate(population_nextY = lead(population)) %>%
mutate(increase_rate = (population_nextY - population)/population_nextY) %>%
filter(abs(increase_rate) > 0.05) %>%
ungroup %>%
ggplot()+
geom_line(aes(x = year, y = increase_rate, color = country))
I want to get final plot like this.
d <-
population %>%
arrange(country, year) %>%
group_by(country) %>%
mutate(population_nextY = lead(population)) %>%
mutate(increase_rate = (population_nextY - population)/population_nextY) %>%
ungroup
select_country <-
d %>% filter(!between(increase_rate, -0.05, 0.05)) %>%
select(country) %>% distinct %>% unlist
d %>%
filter(country %in% select_country) %>%
ggplot()+
geom_line(aes(x = year, y = increase_rate, color = country))

use between:
filter(!between(increase_rate, -0.05, 0.05))

add column using mutate() function
population %>%
arrange(country, year) %>%
group_by(country) %>%
mutate(population_nextY = lead(population)) %>%
mutate(increase_rate = (population_nextY - population)/population_nextY) %>%
mutate(judge = max(abs(increase_rate), na.rm=T)) %>%
filter(judge > 0.05) %>%
ungroup %>%
ggplot() +
geom_line(aes(x = year, y = increase_rate, color = country))

Related

echarts4r: fix axis range across groups

I am creating a grouped bar chart like so:
library(tidyverse)
library(echarts4r)
data("starwars")
starwars %>%
group_by(sex, eye_color) %>%
summarise(height = mean(height, na.rm=TRUE)) %>%
group_by(sex) %>%
e_charts(x = eye_color, timeline = TRUE) %>%
e_bar(height, legend = FALSE)
How do I set the range of the y axis (height) to be the same across groups (sex)?
You could set maximum value for the y axis using e_y_axis(max = XXX), e.g. in the code below I set the max value based on the maximum of height.
library(tidyverse)
library(echarts4r)
data("starwars")
ymax <- max()
dat <- starwars %>%
group_by(sex, eye_color) %>%
summarise(height = mean(height, na.rm=TRUE), .groups = "drop")
ymax <- 50 * ceiling(max(dat$height, na.rm = TRUE) / 50)
dat %>%
group_by(sex) %>%
e_charts(x = eye_color, timeline = TRUE) %>%
e_bar(height, legend = FALSE) %>%
e_y_axis(max = ymax)

Why do I get different frequencies depending of the time I apply group_by() and distinct() in R?

I am quite new to R and the tidyverse, and I can't wrap my head around the following:
Why do I get a different frequencies depending on when I group_by() and distinct() my data?
output_df_1 <- input_df %>%
mutate(created_at = lubridate::floor_date(created_at, unit = "hours")) %>%
select(created_at, author_id) %>%
arrange(created_at) %>%
distinct(author_id, .keep_all = T) %>%
group_by(created_at) %>%
count(created_at)
output_df_2 <- input_df %>%
mutate(created_at = lubridate::floor_date(created_at, unit = "hours")) %>%
select(created_at, author_id) %>%
arrange(created_at) %>%
group_by(created_at) %>%
distinct(author_id, .keep_all = T) %>%
count(created_at)
full_join(output_df_1 , output_df_2 , by = "created_at") %>%
rename(output_df_1 = n.x,
output_df_2 = n.y) %>%
melt(id = "created_at") %>%
ggplot()+
geom_line(aes(x=created_at, y=value, colour=variable),
linetype = "solid",
size = 0.75) +
scale_colour_manual(values=c("#005293","#E37222"))
Context
input_df is a dataframe containing observations of tweets with timestamps and author_ids. I would like to produce a plot with variable1 being tweets per hour (this poses no problem) and variable2 being distict users per hour. I am not sure which of the two lines in the above plot correcly visualizes the distinct users per hour.
It is because in the first code, you use distinct before group_by and count.
Morover it is the use of group_by. count automatically also groups:
count is same as group_by(cyl) %>% summarise(freq=n()).
Here is an example:
mtcars %>%
distinct(am, .keep_all=TRUE) %>%
count(cyl)
mtcars %>%
distinct(am, .keep_all=TRUE) %>%
count(cyl)
gives:
> mtcars %>%
+ distinct(am, .keep_all=TRUE) %>%
+ count(cyl)
cyl n
1 6 2
> mtcars %>%
+ distinct(am, .keep_all=TRUE) %>%
+ count(cyl)
cyl n
1 6 2
If you change the order of distinct:
mtcars %>%
distinct(am, .keep_all=TRUE) %>%
count(cyl)
mtcars %>%
count(cyl) %>%
distinct(am, .keep_all=TRUE)
you get:
mtcars %>%
+ distinct(am, .keep_all=TRUE) %>%
+ count(cyl)
cyl n
1 6 2
>
> mtcars %>%
+ count(cyl) %>%
+ distinct(am, .keep_all=TRUE)
Error: `distinct()` must use existing variables.
x `am` not found in `.data`.
In your example, this code should give the same result for df1 and df2:
output_df_1 <- input_df %>%
mutate(created_at = lubridate::floor_date(created_at, unit = "hours")) %>%
select(created_at, author_id) %>%
arrange(created_at) %>%
distinct(author_id, .keep_all = T) %>%
count(created_at)
output_df_2 <- input_df %>%
mutate(created_at = lubridate::floor_date(created_at, unit = "hours")) %>%
select(created_at, author_id) %>%
arrange(created_at) %>%
distinct(author_id, .keep_all = T) %>%
count(created_at)

How to loop through columns in R to create plots?

I have three columns in a dataframe: age, gender and income.
I want to loop through these columns and create plots based on the data in them.
I know in stata you can loop through variables and then run commands with those variables. However the code below does not seem to work, is there an equivalent way to do what I want to do in R?
groups <- c(df$age, df$gender, df$income)
for (i in groups){
df %>% group_by(i) %>%
summarise(n = n()) %>%
mutate(prop = n/sum(n)) %>%
ggplot(aes(y = prop, x = i)) +
geom_col()
}
you can also use the tidyverse. Loop through a vector of grouping variable names with map. On every iteration, you can evaluate !!sym(variable) the variable name to group_by. Alternatively, we can use across(all_of()), wihch can take strings directly as column names. The rest of the code is pretty much the same you used.
library(dplyr)
library(purrr)
groups <- c('age', 'gender', 'income')
## with !!(sym(.x))
map(groups, ~
df %>% group_by(!!sym(.x)) %>%
summarise(n = n()) %>%
mutate(prop = n/sum(n)) %>%
ggplot(aes(y = prop, x = i)) +
geom_col()
)
## with across(all_of())
map(groups, ~
df %>% group_by(across(all_of(.x))) %>%
summarise(n = n()) %>%
mutate(prop = n/sum(n)) %>%
ggplot(aes(y = prop, x = i)) +
geom_col()
)
If you want to use a for loop:
groups <- c('age', 'gender', 'income')
for (i in groups){
df %>% group_by(!!sym(i)) %>%
summarise(n = n()) %>%
mutate(prop = n/sum(n)) %>%
ggplot(aes(y = prop, x = i)) +
geom_col()
}
You can use lapply
df <- data.frame(age = sample(c("26-30", "31-35", "36-40", "41-45"), 20, replace = T),
gender = sample(c("M", "F"), 20, replace = T),
income = sample(c("High", "Medium", "Low"), 20, replace = T),
prop = runif(20))
lapply(df[,c(1:3)], function(x) ggplot(data = df, aes(y = df$prop, x = x))+ geom_col())

Plotting the average of multiple time series objects and illustrating the error from that plot

Consider dat created here:
set.seed(123)
ID = factor(letters[seq(6)])
time = c(100, 102, 120, 105, 109, 130)
dat <- data.frame(ID = rep(ID,time), Time = sequence(time))
dat$group <- rep(c("GroupA","GroupB"), c(322,344))
dat$values <- sample(100, nrow(dat), TRUE)
dat contains time series data for 6 individuals (6 IDs), which belong to 2 groups (GroupA and GroupB). Assume that we expect the time series within each group to have similar properties. Also note that the time series for each individual is of different length. We essentially want to create an "average" time series plot of each group, which I have done like this:
library(dplyr)
library(ggplot2)
dat %>%
group_by(ID) %>%
mutate(maxtime = max(Time)) %>%
group_by(group) %>%
mutate(maxtime = min(maxtime)) %>%
group_by(group, Time) %>%
summarize(values = mean(values)) %>%
ggplot(aes(Time, values, colour = group))+
geom_line()+
facet_wrap(.~group)
How can we do this same thing, but add the original plots for each individual behind the "average" plots to illustrate the error associated with each "average"? Note that The way I created the "average plot" was by using the length of the ID with the shortest time series from each group, but when the originals are added, I would like to see the whole plots from the originals if possible (so some will be longer than others)
Using a second geom_line you can plot the "raw" data in the background as e.g. grey lines.
set.seed(123)
ID = factor(letters[seq(6)])
time = c(100, 102, 120, 105, 109, 130)
dat <- data.frame(ID = rep(ID,time), Time = sequence(time))
dat$group <- rep(c("GroupA","GroupB"), c(322,344))
dat$values <- sample(100, nrow(dat), TRUE)
library(dplyr)
library(ggplot2)
d <- dat %>%
group_by(ID) %>%
mutate(maxtime = max(Time)) %>%
group_by(group) %>%
mutate(maxtime = min(maxtime)) %>%
group_by(group, Time) %>%
summarize(values = mean(values))
#> `summarise()` regrouping output by 'group' (override with `.groups` argument)
ggplot()+
geom_line(data = dat, aes(Time, values, group = ID), color = "grey80", alpha = .7) +
geom_line(data = d, aes(Time, values, colour = group)) +
facet_wrap(.~group)
Maybe you are looking for a composed plot like this:
library(dplyr)
library(ggplot2)
library(patchwork)
G1 <- dat %>%
group_by(ID) %>%
mutate(maxtime = max(Time)) %>%
group_by(group) %>%
mutate(maxtime = min(maxtime)) %>%
group_by(group, Time) %>%
summarize(values = mean(values)) %>%
ggplot(aes(Time, values, colour = group))+
geom_line()+
facet_wrap(.~group)+
ylab('Mean')
G2 <- dat %>%
group_by(ID) %>%
mutate(maxtime = max(Time)) %>%
group_by(group) %>%
mutate(maxtime = min(maxtime)) %>%
ggplot(aes(Time, values, colour = group))+
geom_line()+
facet_wrap(.~group)+
ylab('Real Values')
#Compose plots
G3 <- G2/G1+plot_layout(guides = "collect")
Output:

Second Y Axis In Facet Wrap with Line and Histogram (Tidyverse)

Trying to plot total cases of covid19 at the country level with a histogram of daily new cases to show a sustained drop in new cases leads to a 'flattening of the curve' (assuming that is the case).
library(tidyverse)
#clean raw data source
c19 = read_csv("https://raw.githubusercontent.com/datasets/covid-19/master/data/time-series-19-covid-combined.csv") %>%
mutate(Cases = Confirmed) %>%
mutate(Country = `Country/Region`) %>%
select(Date, Country, Cases, Deaths) %>%
group_by(Date, Country) %>%
summarise(Cases = sum(Cases),
Deaths = sum(Deaths)) %>%
ungroup() %>%
group_by(Country) %>%
mutate(Lagged_Cases = ifelse(is.na(lag(Cases)), 0, lag(Cases))) %>%
mutate(NewCases = Cases - Lagged_Cases) %>%
mutate(IndexDate = ifelse(Lagged_Cases == 0 & Cases > 0, 1, ifelse(Lagged_Cases > 0, 2, 0))) %>%
filter(IndexDate > 0) %>%
mutate(Index = row_number()) %>%
ungroup() %>%
select(-IndexDate) %>%
filter(Country %in% c("US","Korea, South","Sweden")) %>%
inner_join(data.frame(Country = c("US","Korea, South","Sweden"),
Pop = c(328000000,51245707,10230000)))
c19 %>%
ggplot() +
geom_line(aes(x=Index, y=Cases/1000, color=Country), size=2) +
geom_histogram(aes(x=Index, y=NewCases/75, group=Country), stat="identity", alpha=.4) +
#scale_y_continuous(sec.axis = sec_axis(~./data$Cases)) +
facet_wrap(vars(Country), scales="free_y") +
ggtitle("Flattening The Curve?") +
xlab("Days Since First Case") +
ylab("Total Cases (thousands) - Daily New Cases (not to scale)")

Resources