Here is one of the datasets I have been practicing with on R-cloud, and this is my approach:
# Print the rows whose Indicator is "Emissions" (the result is not stored),
# then inspect the structure of the full, unfiltered data_long.
filter(data_long, Indicator == "Emissions")
glimpse(data_long)
## Expected result: one row per Year with two columns, Year and Emissions.
## If the pipeline instead collapses to a single grand total, e.g.
##   Emissions
##   1417688795
## the group_by() is being ignored by summarize(). The usual cause is another
## package's summarize() (typically plyr, when attached after dplyr) masking
## dplyr's. Calling the dplyr verbs by namespace makes the result independent
## of the order in which packages were attached.
data_long %>%
  dplyr::filter(Indicator == "Emissions") %>%
  dplyr::group_by(Year) %>%
  dplyr::summarise(Emissions = sum(Value))
## The error
##   Error in `check_aesthetics()`:
##   ! Aesthetics must be either length 1 or the same as the data (1): x
## says the data reaching ggplot() has a single row, i.e. the per-Year grouping
## was lost in summarize() (most likely a non-dplyr summarize() masking
## dplyr's). Calling the dplyr verbs by namespace keeps one row per Year, so
## both Year and Emissions exist as columns for the aesthetics.
data_long %>%
  dplyr::filter(Indicator == "Emissions") %>%
  dplyr::group_by(Year) %>%
  dplyr::summarise(Emissions = sum(Value)) %>%
  ggplot(aes(x = Year, y = Emissions)) +
  geom_line(size = 1.5) # on ggplot2 >= 3.4.0, prefer linewidth = 1.5 for lines
I am assuming there is an issue with one of the packages (gradethis or learnr), as I am using R-cloud. Are there any known limitations with it? Any suggestions?
Thanks,
Vikram
Related
I am quite new to R and the tidyverse, and I can't wrap my head around the following:
Why do I get different frequencies depending on whether I call group_by() before or after distinct() on my data?
# First-time authors per hour: distinct() runs on the ungrouped data, so each
# author_id is kept only once overall -- in the earliest hour it appears,
# because rows are sorted by created_at first -- and only then counted per hour.
output_df_1 <- input_df %>%
  mutate(created_at = lubridate::floor_date(created_at, unit = "hours")) %>%
  select(created_at, author_id) %>%
  arrange(created_at) %>%
  distinct(author_id, .keep_all = TRUE) %>% # TRUE, not T: T can be reassigned
  group_by(created_at) %>%
  count(created_at)
# Distinct authors per hour: distinct() runs inside each created_at group, so
# an author_id is kept once for every hour in which it appears, and the count
# is the number of different authors active in that hour.
output_df_2 <- input_df %>%
  mutate(created_at = lubridate::floor_date(created_at, unit = "hours")) %>%
  select(created_at, author_id) %>%
  arrange(created_at) %>%
  group_by(created_at) %>%
  distinct(author_id, .keep_all = TRUE) %>% # TRUE, not T: T can be reassigned
  count(created_at)
# Join the two hourly count tables on created_at, reshape them to long form
# (one row per hour and series), and draw one coloured line per series.
output_df_1 %>%
  full_join(output_df_2, by = "created_at") %>%
  rename(output_df_1 = n.x, output_df_2 = n.y) %>%
  melt(id = "created_at") %>%
  ggplot(aes(x = created_at, y = value, colour = variable)) +
  geom_line(linetype = "solid", size = 0.75) +
  scale_colour_manual(values = c("#005293", "#E37222"))
Context
input_df is a data frame containing observations of tweets with timestamps and author_ids. I would like to produce a plot with variable1 being tweets per hour (this poses no problem) and variable2 being distinct users per hour. I am not sure which of the two lines in the above plot correctly visualizes the distinct users per hour.
It is because in the first code, you use distinct before group_by and count.
Moreover, it comes down to the use of group_by. count() also groups automatically:
count(cyl) is the same as group_by(cyl) %>% summarise(freq = n()).
Here is an example:
# The two pipelines below are identical: keep the first row for each value of
# `am` (retaining all columns), then count the remaining rows per `cyl`.
mtcars %>%
distinct(am, .keep_all=TRUE) %>%
count(cyl)
# Same pipeline repeated.
mtcars %>%
distinct(am, .keep_all=TRUE) %>%
count(cyl)
gives:
> mtcars %>%
+ distinct(am, .keep_all=TRUE) %>%
+ count(cyl)
cyl n
1 6 2
> mtcars %>%
+ distinct(am, .keep_all=TRUE) %>%
+ count(cyl)
cyl n
1 6 2
If you change the order of distinct:
# distinct() first, then count(): `am` still exists when distinct() runs.
mtcars %>%
distinct(am, .keep_all=TRUE) %>%
count(cyl)
# count() first, then distinct(): distinct() is asked for `am` on the output
# of count(cyl); the transcript that follows shows this fails with
# "`am` not found in `.data`".
mtcars %>%
count(cyl) %>%
distinct(am, .keep_all=TRUE)
you get:
mtcars %>%
+ distinct(am, .keep_all=TRUE) %>%
+ count(cyl)
cyl n
1 6 2
>
> mtcars %>%
+ count(cyl) %>%
+ distinct(am, .keep_all=TRUE)
Error: `distinct()` must use existing variables.
x `am` not found in `.data`.
In your example, this code should give the same result for df1 and df2:
# Both assignments run the same pipeline, so output_df_1 and output_df_2 are
# equal. distinct() is applied to the ungrouped data, so each author_id is kept
# once overall (in the earliest hour it appears, since rows are sorted by
# created_at) before counting rows per hour. To count the different authors
# active in each hour instead, de-duplicate on both columns:
# distinct(created_at, author_id) %>% count(created_at).
output_df_1 <- input_df %>%
  mutate(created_at = lubridate::floor_date(created_at, unit = "hours")) %>%
  select(created_at, author_id) %>%
  arrange(created_at) %>%
  distinct(author_id, .keep_all = TRUE) %>% # TRUE, not T: T can be reassigned
  count(created_at)
output_df_2 <- input_df %>%
  mutate(created_at = lubridate::floor_date(created_at, unit = "hours")) %>%
  select(created_at, author_id) %>%
  arrange(created_at) %>%
  distinct(author_id, .keep_all = TRUE) %>%
  count(created_at)
I am trying to reorder the bars of the geom_col plot below by one level, pct_female_vacc, of the variable pct_vacc_GenderType that is used as the fill in the plot.
df
library(tidyverse)
library(lubridate)
library(scales)
library(gganimate)
# Download the state-wise vaccination CSV and convert Updated.On with as.Date().
file_url1 <- "https://raw.githubusercontent.com/johnsnow09/covid19-df_stack-code/main/cowin_vaccine_data_statewise.csv"
df_vaccination <- read.csv(url(file_url1)) %>%
  mutate(Updated.On = as.Date(Updated.On))
plot
# Stacked horizontal bars of the female and male share of vaccinated
# individuals per state, using only the rows from the latest Updated.On date.
df_vaccination %>%
  filter(State != "India", Updated.On == max(Updated.On)) %>%
  mutate(
    pct_female_vacc = Female.Individuals.Vaccinated. / Total.Individuals.Vaccinated,
    pct_male_vacc = Male.Individuals.Vaccinated. / Total.Individuals.Vaccinated,
    State = as.factor(State)
  ) %>%
  # One row per state and gender share.
  pivot_longer(
    cols = pct_female_vacc:pct_male_vacc,
    names_to = "pct_vacc_GenderType",
    values_to = "pct_vacc"
  ) %>%
  mutate(pct_vacc_GenderType = as.factor(pct_vacc_GenderType)) %>%
  na.omit() %>%
  ggplot(aes(x = pct_vacc, y = State, fill = pct_vacc_GenderType)) +
  geom_col()
I would like the plot above to be ordered by the red bars, i.e. by the pct_female_vacc values.
I cannot use reorder_within because I am not using facet_wrap here. I also tried fct_reorder, but either I am not using it correctly or it does not work in this case.
What you want to do is simple with forcats::fct_reorder. The only thing you have to be cautious about is that you need to set the factor before pivot_longer. Here you go:
df_vaccination %>%
  # The newest date contains only NAs, so use the day before it.
  filter(State != "India", Updated.On == max(Updated.On) - 1) %>%
  mutate(
    pct_female_vacc = Female.Individuals.Vaccinated. / Total.Individuals.Vaccinated,
    pct_male_vacc = Male.Individuals.Vaccinated. / Total.Individuals.Vaccinated,
    # Order the State levels by the female share here: after pivot_longer()
    # the female and male shares live in the same column.
    State = forcats::fct_reorder(as.factor(State), pct_female_vacc)
  ) %>%
  pivot_longer(
    cols = pct_female_vacc:pct_male_vacc,
    names_to = "pct_vacc_GenderType",
    values_to = "pct_vacc"
  ) %>%
  mutate(pct_vacc_GenderType = as.factor(pct_vacc_GenderType)) %>%
  # Drop only rows whose pct_vacc is NA (used instead of na.omit()).
  filter(!is.na(pct_vacc)) %>%
  ggplot(aes(x = pct_vacc, y = State, fill = pct_vacc_GenderType)) +
  geom_col() +
  # Legend moved to the bottom so the plot reads better when posted.
  theme(legend.position = "bottom")
Created on 2021-05-16 by the reprex package (v2.0.0)
Arrange the data by pct_female_vacc and convert State to a factor whose levels follow the order of appearance.
library(tidyverse)
df_vaccination %>%
  filter(State != "India", Updated.On == max(Updated.On)) %>%
  mutate(
    pct_female_vacc = Female.Individuals.Vaccinated. / Total.Individuals.Vaccinated,
    pct_male_vacc = Male.Individuals.Vaccinated. / Total.Individuals.Vaccinated
  ) %>%
  # Sort rows by the female share, then fix the State levels in that row order
  # so the bars are drawn in the same order.
  arrange(pct_female_vacc) %>%
  mutate(State = factor(State, levels = unique(State))) %>%
  pivot_longer(
    cols = pct_female_vacc:pct_male_vacc,
    names_to = "pct_vacc_GenderType",
    values_to = "pct_vacc"
  ) %>%
  na.omit() %>%
  ggplot(aes(x = pct_vacc, y = State, fill = pct_vacc_GenderType)) +
  geom_col()
I tried to convert the month variable (which is an integer) into a categorical variable using factor(month), but it failed with an error. How can I solve it?
This is my code:
library(tidyverse)
library(dplyr)
install.packages("nycflights13")
library(nycflights13)
# Copy the month column of flights into a standalone vector and print it.
month_new <- flights$month
month_new
# NOTE(review): this pipeline is the code the question is about and does not
# produce the intended plot:
#  - the pipe passes the whole filtered data frame to factor() as its first
#    argument (with month_new as the second), instead of converting a column;
#  - ggplot() is given x = month_new directly, with no aes() mapping.
flights %>%
filter(dest == "HNL", air_time > 10) %>%
factor(month_new) %>%
ggplot(x = month_new) + geom_bar()
Your call to factor(month_new) inside the pipe does not work. I suggest mutate(month = as.factor(month)) instead; in addition, your ggplot() call has no aes() mapping.
library(tidyverse)
#install.packages("nycflights13")
library(nycflights13)
# Bar chart of the number of flights to HNL (air_time > 10) per month, with
# month converted to a factor so it is treated as categorical.
flights %>%
  filter(dest == "HNL" & air_time > 10) %>%
  mutate(month = factor(month)) %>%
  ggplot(mapping = aes(x = month)) +
  geom_bar()
or:
library(tidyverse)
#install.packages("nycflights13")
library(nycflights13)
# Bar chart of flights to HNL (air_time > 10) per month.
# The filtered rows are piped straight into ggplot(); previously the filter()
# result was discarded and ggplot(flights, ...) plotted every flight.
flights %>%
  filter(dest == "HNL", air_time > 10) %>%
  ggplot(aes(x = factor(month))) +
  geom_bar(fill = "steelblue") +
  theme_minimal()
I'm trying to add labels and percentages to each layer within a sunburst chart using R - so it looks like this Sunburst.
I can create a sunburst chart (using this guide) but I can't figure out how to add the labels or percentages. I also want to be able to print the chart with all labels and percentages.
Here's my code so far.
# libraries
library(dplyr)
library(treemap)
library(sunburstR)
library(readxl)
library(vcd)
## Load Arthritis as example
# Build the two-column (Path, Count) table passed to sunburst(): one row per
# Treatment-Sex-Improved-Age combination, with Age binned at 50.
Data <- data.frame(Arthritis) %>%
  select(-ID) %>%
  mutate(Age = ifelse(Age < 50, "Young", "Old")) %>%
  group_by(Treatment, Sex, Improved, Age) %>%
  summarise(Count = n()) %>%
  ungroup() %>%
  transmute(Path = paste(Treatment, Sex, Improved, Age, sep = "-"), Count)
sunburst(Data)
Any help would be great.
Thanks.
I suggest the ggsunburst package https://github.com/didacs/ggsunburst
library(ggsunburst)
library(dplyr)
library(vcd) # just for the Arthritis dataset
Data <- data.frame(Arthritis)
# Compute the percentage of each Treatment/Sex/Improved/Age combination with
# tally(), then build the `leaf` column in the form "name->attribute:value".
# ggsunburst treats everything after "->" as attributes; the attribute "size"
# is used as the size of the arc.
df <- Data %>%
  mutate(Age = ifelse(Age < 50, "Young", "Old")) %>%
  group_by(Treatment, Sex, Improved, Age) %>%
  tally() %>%
  mutate(
    percentage = n / nrow(Data) * 100,
    size = paste0("->size:", round(percentage, 2)),
    leaf = paste0(Improved, size)
  ) %>%
  ungroup() %>%
  select(Treatment, Sex, Age, leaf)
# sunburst_data() reads from a file, so write the table out first
# (no header and no row names; FALSE spelled out because F can be reassigned).
write.table(df, file = 'data.csv', row.names = FALSE, col.names = FALSE, sep = ",")
# node_attributes = "size" adds labels with percentages to the terminal nodes.
sb <- sunburst_data('data.csv', type = "lineage", sep = ',', node_attributes = "size")
# compute percentages for internal nodes
# Percentage of all rows in Data for each internal node, one table per level.
# Each table ends up with exactly two columns: name and percent.

# Level 1: Treatment
tre <- Data %>%
  group_by(Treatment) %>%
  tally() %>%
  ungroup() %>%
  transmute(name = Treatment, percent = n / nrow(Data) * 100)

# Level 2: Sex within Treatment
sex <- Data %>%
  group_by(Treatment, Sex) %>%
  tally() %>%
  ungroup() %>%
  transmute(name = Sex, percent = n / nrow(Data) * 100)

# Level 3: Age bin (split at 50) within Treatment and Sex
age <- Data %>%
  mutate(Age = ifelse(Age < 50, "Young", "Old")) %>%
  group_by(Treatment, Sex, Age) %>%
  tally() %>%
  ungroup() %>%
  transmute(name = Age, percent = n / nrow(Data) * 100)
# Stack the three per-level percentage tables in level order.
x <- rbind(tre, sex, age)
# the rows in x are in the same order as sb$node_labels, cbind works here only because of that
# (the match is purely positional -- nothing is joined by name).
x <- cbind(sb$node_labels, round(x[,"percent"],2))
# Text shown on internal nodes: "<label> <percent> %".
percent <- x %>% mutate(name_percent = paste(label,percent,"%"))
# Draw the chart, then add two text layers: one built from sb$leaf_labels
# (the size attribute followed by "%") and one built from `percent`
# (the name_percent text).
sunburst(sb, node_labels.min = 0) +
geom_text(data = sb$leaf_labels, aes(x=x, y=0.1, label=paste(size,"%"), angle=angle, hjust=hjust), size = 2) +
geom_text(data = percent, aes(x=x, y=y, label=name_percent, angle=pangle), size=2)
I want to plot increase_rate for the countries where abs(increase_rate) > 0.05.
However, the code below discards the data points between -0.05 and 0.05.
I also want to plot the data in the -0.05 to 0.05 range for those countries.
library(tidyverse)
data(population, package="tidyr")
# Year-over-year population change per country, plotted as one line per
# country. The filter() is applied row by row, so every year whose
# |increase_rate| is at most 0.05 is removed from the plotted data.
population %>%
  arrange(country, year) %>%
  group_by(country) %>%
  mutate(
    population_nextY = lead(population),
    increase_rate = (population_nextY - population) / population_nextY
  ) %>%
  filter(abs(increase_rate) > 0.05) %>%
  ungroup() %>%
  ggplot(aes(x = year, y = increase_rate, color = country)) +
  geom_line()
I want to get final plot like this.
# Step 1: compute the year-over-year change for every country and year.
d <- population %>%
  arrange(country, year) %>%
  group_by(country) %>%
  mutate(
    population_nextY = lead(population),
    increase_rate = (population_nextY - population) / population_nextY
  ) %>%
  ungroup()

# Step 2: countries with at least one increase_rate outside [-0.05, 0.05].
select_country <- d %>%
  filter(!between(increase_rate, -0.05, 0.05)) %>%
  distinct(country) %>%
  unlist()

# Step 3: plot the complete series of those countries, including the years
# that fall inside the band.
d %>%
  filter(country %in% select_country) %>%
  ggplot(aes(x = year, y = increase_rate, color = country)) +
  geom_line()
use between:
filter(!between(increase_rate, -0.05, 0.05))
Add a per-country helper column using the mutate() function and filter on it:
# Keep the whole series of every country whose largest absolute year-over-year
# change exceeds 0.05, then plot one line per country.
population %>%
  arrange(country, year) %>%
  group_by(country) %>%
  mutate(
    population_nextY = lead(population),
    increase_rate = (population_nextY - population) / population_nextY,
    # Per-country maximum, repeated on every row of that country.
    # TRUE is spelled out because T is an ordinary, reassignable variable.
    judge = max(abs(increase_rate), na.rm = TRUE)
  ) %>%
  filter(judge > 0.05) %>%
  ungroup() %>%
  ggplot() +
  geom_line(aes(x = year, y = increase_rate, color = country))