Getting Cumulative Sum Over Time - r

I have data with goals scored for each player each season:
playerID <- c(1,2,3,1,2,3,1,2,3,1,2,3)
year <- c(2002,2000,2000,2003,2001,2001,2000,2002,2002,2001,2003,2003)
goals <- c(25,21,27,31,39,34,42,44,46,59,55,53)
my_data <- data.frame(playerID, year, goals)
I would like to plot each player's cumulative number of goals over time:
ggplot(my_data, aes(x=year, y=cumsum_goals, group=playerID)) + geom_line()
I have tried using summarize from dplyr, but this only works if the data is already sorted by year (see player 1):
new_data <- my_data %>%
group_by(playerID) %>%
mutate(cumsum_goals=cumsum(goals))
Is there a way to make this code robust to data where years are not in chronological order?

We can arrange by playerID and year, take cumsum and then plot
library(dplyr)
library(ggplot2)
my_data %>%
arrange(playerID, year) %>%
group_by(playerID) %>%
mutate(cumsum_goals=cumsum(goals)) %>%
ggplot() + aes(x=year, y= cumsum_goals, color = factor(playerID)) + geom_line()

Related

Making the X_axis more visible?

This is the code I used, the goal is to visualize the evolution of covid in north africa
library(readr)
library(ggplot2)
library(dplyr)
covid <- read.csv("owid-covid-data.csv")
covid
covid %>%
filter(location %in% c("Tunisia", "Morocco", "Libya")) %>%
ggplot(aes(x = date, y= new_cases,color = location, group = location)) +
geom_line()
This is the dataset I used
as you can see the X_axis is day-to-day therefore it's a bit condensed dataset
And this is the plot
you can't see anything in the X_axis, I want to be able to discern the dates maybe use weeks or months to scale instead of days plot.
r
I converted string columns to date type as the comments suggested and it all worked out
library(readr)
library(ggplot2)
library(dplyr)
covid <- read.csv("owid-covid-data.csv")
covid
covid %>%
filter(location %in% c("Tunisia", "Morocco", "Libya")) %>%
mutate(date = as.Date(date))%>%
ggplot(aes(x = date, y= new_cases,color = location, group = location)) +
geom_line()
this is the plot after modification.

Ggplot - How to present the mean of a third varience?

Let's say I have this data frame:
The data frame
I want to make a graph which presents for each SES (Social Economy Status) what is the mean income for females and what is the mean income for males.
I have so far this code:
ggplot(incomeSorted, aes(GENDER)) +
scale_y_continuous("Mean")+
geom_bar(position = "dodge")+
facet_wrap("SES")
and this is the output:
How do I make the graph to present the mean of income instead of counting the number of females and males at each category?
Thanks ahead!
If you want to display mean income, you have to compute it. You can use dplyr and group_by() with summarise() to obtain the key variable and then plot. Here a code for the task:
library(ggplot2)
library(dplyr)
#Data
df <- data.frame(id=1:8,Gender=c(rep('Female',4),rep(c('Male','Female'),2)),
income=c(73,150,220.18,234,314.16,983.1,1001,1012),
SES=c('Bottom','Bottom','Middle','Middle','Middle',
'Upper','Upper','Upper'),
stringsAsFactors = F)
#Compute and plot
df %>% group_by(SES,Gender) %>%
summarise(MeanIncome=mean(income,na.rm=T)) %>%
ggplot(aes(x=Gender,y=MeanIncome)) +
scale_y_continuous("Mean")+
geom_bar(stat = 'identity')+
facet_wrap(.~SES)
Output:
Or you can avoid facets and displaying the plot with a fill variable like this:
#Code 2
df %>% group_by(SES,Gender) %>%
summarise(MeanIncome=mean(income,na.rm=T)) %>%
ggplot(aes(x=Gender,y=MeanIncome,fill=SES)) +
scale_y_continuous("Mean")+
geom_bar(stat = 'identity',position = position_dodge2(0.9,preserve = 'single'))
Output:

Plotting different series with different lengths quantmod

I want to plot cumsum line for each year of a series, but the thing is each year have different observations. I have tried using Chart_series but it doesn't work.
My goal is to plotting lines as years in terms of cumulative evolution.
In the example I put only two years.
library(PerformanceAnalytics)
library(quantmod)
library(tidyverse)
library(tidyquant)
library(xts)
a<-dailyReturn(TSLA,subset='2020')
a2019<-dailyReturn(TSLA,subset='2019')
b<-cumsum(a)*100
b2019<-cumsum(a2019)*100
plot(b2019)
lines(b)
We could get a single dataset and then do a group by cumsum before plotting
library(dplyr)
library(tibble)
library(lubridate)
library(PerformanceAnalytics)
library(quantmod)
library(ggplot2)
getSymbols('TSLA')
dailyReturn(TSLA, subset = c('2019', '2020')) %>%
as.data.frame %>%
rownames_to_column('Date') %>%
mutate(Date = as.Date(Date)) %>%
group_by(Year = year(Date)) %>%
mutate(CumDaily.returns = cumsum(daily.returns) * 100) %>%
ggplot(aes(x = Date, y = CumDaily.returns, color = Year)) +
geom_line() +
theme_bw()
-output

Using geom_smooth for fitting a glm to fractions

This post is somewhat related to this post.
Here I have xy grouped data where y are fractions:
library(dplyr)
library(ggplot2)
library(ggpmisc)
set.seed(1)
df1 <- data.frame(value = c(0.8,0.5,0.4,0.2,0.5,0.6,0.5,0.48,0.52),
age = rep(c("d2","d4","d45"),3),
group = c("A","A","A","B","B","B","C","C","C")) %>%
dplyr::mutate(time = as.integer(age)) %>%
dplyr::arrange(group,time) %>%
dplyr::mutate(group_age=paste0(group,"_",age))
df1$group_age <- factor(df1$group_age,levels=unique(df1$group_age))
What I'm trying to achieve is to plot df1 as a bar plot, like this:
ggplot(df1,aes(x=group_age,y=value,fill=age)) +
geom_bar(stat='identity')
But I want to fit to each group a binomial glm with a logit link function, which estimates how these fractions are affected by time.
Let's say I have 100 observations per each age (time) in each group:
df2 <- do.call(rbind,lapply(1:nrow(df1),function(i){
data.frame(age=df1$age[i],group=df1$group[i],time=df1$time[i],group_age=df1$group_age[i],value=c(rep(T,100*df1$value[i]),rep(F,100*(1-df1$value[i]))))
}))
Then the glm for each group (e.g., group A) is:
glm(value ~ time, dplyr::filter(df2, group == "A"), family = binomial(link='logit'))
So I would like to add to the plot above the estimated regression slopes for each group along with their corresponding p-values (similar to what I'm doing for the continuous df$value in this post).
I thought that using:
ggplot(df1,aes(x=group_age,y=value,fill=age)) +
geom_bar(stat='identity') +
geom_smooth(data=df2,mapping=aes(x=group_age,y=value,group=group),color="black",method='glm',method.args=list(family=binomial(link='logit')),size=1,se=T) +
stat_poly_eq(aes(label=stat(p.value.label)),formula=my_formula,parse=T,npcx="center",npcy="bottom") +
scale_x_log10(name="Age",labels=levels(df$age),breaks=1:length(levels(df$age))) +
facet_wrap(~group) + theme_minimal()
Would work but I get the error:
Error in Math.factor(x, base) : ‘log’ not meaningful for factors
Any idea how to get it right?
I believe this could help:
library(tidyverse)
library(broom)
df2$value <- as.numeric(df2$value)
#Estimate coefs
dfmodel <- df2 %>% group_by(group) %>%
do(fitmodel = glm(value ~ time, data = .,family = binomial(link='logit')))
#Extract coeffs
dfCoef = tidy(dfmodel, fitmodel)
#Create labels
dfCoef %>% filter(term=='(Intercept)') %>% mutate(Label=paste0(round(estimate,3),'(p=',round(p.value,3),')'),
group_age=paste0(group,'_','d4')) %>%
select(c(group,Label,group_age)) -> Labels
#Values
df2 %>% group_by(group,group_age) %>% summarise(value=sum(value)) %>% ungroup() %>%
group_by(group) %>% filter(value==max(value)) %>% select(-group_age) -> values
#Combine
Labels %>% left_join(values) -> Labels
Labels %>% mutate(age=NA) -> Labels
#Plot
ggplot(df2,aes(x=group_age,y=value,fill=age)) +
geom_text(data=Labels,aes(x=group_age,y=value,label=Label),fontface='bold')+
geom_bar(stat='identity')+
facet_wrap(.~group,scales='free')
Thanks to Pedro Aphalo this is nearly a complete solution:
Generate the data.frame with the fractions (here use time as an integer by deleting "d" in age rather than using time as the levels of age):
library(dplyr)
library(ggplot2)
library(ggpmisc)
set.seed(1)
df1 <- data.frame(value = c(0.8,0.5,0.4,0.2,0.5,0.6,0.5,0.48,0.52),
age = rep(c("d2","d4","d45"),3),
group = c("A","A","A","B","B","B","C","C","C")) %>%
dplyr::mutate(time = as.integer(gsub("d","",age))) %>%
dplyr::arrange(group,time) %>%
dplyr::mutate(group_age=paste0(group,"_",age))
df1$group_age <- factor(df1$group_age,levels=unique(df1$group_age))
Inflate df1 to 100 observations per each age in each group but specify value as an integer rather than a binary:
df2 <- do.call(rbind,lapply(1:nrow(df1),function(i){
data.frame(age=df1$age[i],group=df1$group[i],time=df1$time[i],group_age=df1$group_age[i],value=c(rep(1,100*df1$value[i]),rep(0,100*(1-df1$value[i]))))
}))
And now plot it using geom_smooth and stat_fit_tidy:
ggplot(df1,aes(x=time,y=value,group=group,fill=age)) +
geom_bar(stat='identity') +
geom_smooth(data=df2,mapping=aes(x=time,y=value,group=group),color="black",method='glm',method.args=list(family=binomial(link='logit'))) +
stat_fit_tidy(data=df2,mapping=aes(x=time,y=value,group=group,label=sprintf("P = %.3g",stat(x_p.value))),method='glm',method.args=list(formula=y~x,family=binomial(link='logit')),parse=T,label.x="center",label.y="top") +
scale_x_log10(name="Age",labels=levels(df2$age),breaks=unique(df2$time)) +
facet_wrap(~group) + theme_minimal()
Which gives (note that the scale_x_log10 is mainly a cosmetic approach to presenting the x-axis as time rather than levels of age):
The only imperfection is that the p-values seem to appear messed up.

ggplot2 overlayed line chart by year?

Starting with the following dataset:
$ Orders,Year,Date
1608052.2,2019,2019-08-02
1385858.4,2018,2018-07-27
1223593.3,2019,2019-07-25
1200356.5,2018,2018-01-20
1198226.3,2019,2019-07-15
837866.1,2019,2019-07-02
Trying to make a similar format as:
with the criteria: X-axis will be days or months, y-axis will be sum of Orders, grouping / colors will be by year.
Attempts:
1) No overlay
dataset %>%
ggplot( aes(x=`Merge Date`, y=`$ Orders`, group=`Merge Date (Year)`, color=`Merge Date (Year)`)) +
geom_line()
2) ggplot month grouping
dataset %>%
mutate(Date = as.Date(`Date`) %>%
mutate(Year = format(Date,'%Y')) %>%
mutate(Month = format(Date,'%b')) -> dataset2
ggplot(data=dataset2, aes(x=Month, y=`$ Orders`, group=Year, color=factor(Year))) +
geom_line(size=.75) +
ylab("Volume")
The lubridate package is your answer. Extract month from the Date field and turn it into a variable. This code worked for me:
library(tidyverse)
library(lubridate)
dataset <- read_delim("OrderValue,Year,Date\n1608052.2,2019,2019-08-02\n1385858.4,2018,2018-07-27\n1223593.3,2019,2019-07-25\n1200356.5,2018,2018-01-20\n1198226.3,2019,2019-07-15\n837866.1,2019,2019-07-02", delim = ",")
dataset <- dataset %>%
mutate(theMonth = month(Date))
ggplot(dataset, aes(x = as.factor(theMonth), y = OrderValue, group = as.factor(Year), color = as.factor(Year))) +
geom_line()

Resources