Setting the order level when using barplots - r

I'm trying to plot a series of demographics factors. Each plot show the frequency distributions of demographic variables by gender. It runs nicely, but some of the labels are ordered in alphabetic order and not in meaningful order eg. Education, Marital Status and SIC2007.
Data structure
structure(list(DMSex = c("Male", "Female", "Male", "Male"), Income = c(980,
-8, 3000, 120), IncCat = c("-1", "-8", "-1", "-1"), HrWkAc = c(-1,
-1, -1, -1), ShiftWk = c(-1, -1, -1, -1), ShiftPat = c(-1, -1,
-1, -1), SOC2010C = c("-1", "9.2.3.3", "-1", "-1"), XSOC2010 = c(-1,
9233, -1, -1), IndexNo = c(-1, 1398, -1, -1), ES2010 = c(-1,
7, -1, -1), nssec = c(-1, 13.4, -1, -1), SECFlag = c(-1, 0, -1,
-1), LSOC2000 = c("-1", "9.2.3.3", "-1", "-1"), XSOC2000 = c(-1,
9233, -1, -1), seg = c(-1, 11, -1, -1), sc = c(-1, 5, -1, -1),
SIC2007 = c(-1, 87, -1, -1), Educ = c(1, 1, -1, 2), EducCur = c(10,
1, -1, -1), FinFTEd = c(-1, -1, -1, 1), FinFTEdY = c(-1,
-1, -1, 21), HiQual = c(22, 10, -1, 1), sic20070 = c(-1,
87, -1, -1), dhhtype = c(6, 8, 7, 3), dagegrp = c(2, 3, 3,
3), dmarsta = c("Single, never married", "Single, never married",
"Interview not achieved", "Married/cohabitating"), dhiqual = c(" Secondary",
" A level or equivalent", "Item not applicable", "Degree or higher"
), dnssec8 = c(-1, 8, -1, -1), duresmc = c(14, 15, 11, 16
), dgorpaf = c(7, 8, 5, 10), dukcntr = c(1, 1, 1, 1), dnrkid04 = c(0,
0, 0, 0), dilodefr = c(3, 3, -1, 3), deconact = c(8, 8, -1,
11), dtenure = c(2, 3, 2, 3), dtotac = c(-1, -1, -1, -1),
dtotus = c(-1, -1, -1, -1), dsic = c("Item not applicable",
"Public admin, education and health", "Item not applicable",
"Item not applicable"), dsoc = c(-1, 9, -1, -1), DVAge_category = c("15 to 30",
"15 to 30", "15 to 30", "15 to 30"), Income_category = c("Less than 1000",
"Less than 1000", "1001 to 3000", "Less than 1000"), HoursWorked_category = c("Less than 20 hours",
"Less than 20 hours", "Less than 20 hours", "Less than 20 hours"
)), row.names = c(NA, -4L), class = c("tbl_df", "tbl", "data.frame"
))
#Age variable
demographics$dagegrp_category<-ifelse(demographics$dagegrp_01 > 2 & demographics$dagegrp < 6, age<-"15 to 30",
ifelse(demographics$dagegrp> 6 & demographics$dagegrp < 9, age<-"31 to 45",
ifelse(demographics$dagegrp > 9 & demographics$dagegrp < 12 , age<-"46 to 60",
ifelse(demographics$dagegrp > 12 & demographics$dagegrp < 15 , age<-"61 to 75",
ifelse(demographics$dagegrp > 15 & demographics$dagegrp < 18 , age<-"76+",
age<- "zombie")))))
demographics$DVAge_category<-c("15 to 30","31 to 45", "46 to 60","61 to 75", "76+", "zombie")[findInterval(demographics$dagegrp , c(-Inf, 6, 10, 12, 15,18, Inf))]
Age<-as.vector(demographics$DVAge_category)
#Gender variable
demographics$DMSex[demographics$DMSex==1]<-"Male"
demographics$DMSex[demographics$DMSex==2]<-"Female"
Gender<-as.vector(demographics$DMSex)
#Income variable
demographics$Income_category<-ifelse(demographics$Income < 1001, income<-"Less than 1000",
ifelse(demographics$Income > 999 & demographics$Income < 3001, income<-"1001 to 3000",
ifelse(demographics$Income > 3001 & demographics$Income < 6001, income <-"3001 to 6000",
ifelse(demographics$Income > 6001 & demographics$Income < 10001 , income<-"6001 to 10000",
income<- "zombie"))))
demographics$Income_category<-c("Less than 1000","1001 to 3000", "3001 to 6000", "6001 to 10000","zombie")[findInterval(demographics$Income , c(-Inf, 1001, 3001, 6001,10001, Inf) ) ]
Income<-as.vector(demographics$Income_category)
#Marital status variable
demographics$dmarsta[demographics$dmarsta==-1]<-"Interview not achieved"
demographics$dmarsta[demographics$dmarsta==1]<-"Single, never married"
demographics$dmarsta[demographics$dmarsta==2]<-"Married/cohabitating"
demographics$dmarsta[demographics$dmarsta==3]<-"Divorced/widowed"
MaritalStatus<-as.vector(demographics$dmarsta)
#Education
demographics$dhiqual[demographics$dhiqual==-8]<-"Don't know"
demographics$dhiqual[demographics$dhiqual==-1]<-"Item not applicable"
demographics$dhiqual[demographics$dhiqual==1]<-"Degree or higher"
demographics$dhiqual[demographics$dhiqual==2]<-"Higher education"
demographics$dhiqual[demographics$dhiqual==3]<-" A level or equivalent"
demographics$dhiqual[demographics$dhiqual==4]<-" Secondary"
demographics$dhiqual[demographics$dhiqual==5]<-" Other"
Education<-as.vector(demographics$dhiqual)
#Hours worked per week in main job variable
demographics$HoursWorked_category<-ifelse(demographics$dtotac < 21, workhours<-"Less than 20 hours",
ifelse(demographics$dtotac > 20 & demographics$dtotac< 41, workhours <-"Between 21 to 40 hours",
ifelse(demographics$dtotac > 40 & demographics$dtotac < 61, workhours <-"Between 41 to 60 hours",
ifelse(demographics$dtotac > 62, workhours<-"More than 61 hours",
workhours<- "Not Applicable"))))
demographics$HoursWorked_category<-c("Less than 20 hours", "Between 21 to 40 hours", "Between 41 to 60 hours","More than 61 hours","Not Applicable")[findInterval(demographics$dtotac, c(-Inf, 21, 41, 61, 62, Inf) ) ]
WorkHours<-as.vector(demographics$HoursWorked_category)
#DV: SIC 2007 industry divisions (grouped)
demographics$dsic[demographics$dsic==-8]<-"Don't know"
demographics$dsic[demographics$dsic==-1]<-"Item not applicable"
demographics$dsic[demographics$dsic==1]<-"Agriculture, forestry and fishing"
demographics$dsic[demographics$dsic==2]<-"Manufacturing"
demographics$dsic[demographics$dsic==3]<-"Energy and water supply"
demographics$dsic[demographics$dsic==4]<-"Construction"
demographics$dsic[demographics$dsic==5]<-"Distribution, hotels and restaurants"
demographics$dsic[demographics$dsic==6]<-"Transport and communication"
demographics$dsic[demographics$dsic==7]<-"Banking and finances"
demographics$dsic[demographics$dsic==8]<-"Public admin, education and health"
demographics$dsic[demographics$dsic==9]<-"Other services"
demographics$industry_category<-c("Don't know", "Item not applicable", "Agriculture, forestry and fishing","Manufacturing","Energy and water supply",
"Construction", "Distribution, hotels and restaurants", "Transport and communication", "Banking and finances",
"Public admin, education and health", "Other service")
SIC2007<-as.vector(demographics$dsic)
# creating df
df<-data.frame(Gender, Age, Education, MaritalStatus, Income, WorkHours, SIC2007)
df %>%
#tidy, not gender
gather(variable, value, -c(Gender))%>%
#group by value, variable, then gender
group_by(value, variable, Gender) %>%
#summarise to obtain table cell frequencies
summarise(freq=n()) %>%
#Plot
ggplot(aes(x=value, y=freq, group=Gender))+geom_bar(aes(fill=Gender), stat='identity', position='dodge')+ facet_wrap(~variable, scales='free_x') + theme(legend.position="right", axis.text.x = element_text(angle = 60, hjust = 1)) + labs(x="Characteristics", y="Frequencies")

In ggplot2, the data is ordered according to the factor levels of the data.frame column.
To (re)set the order in your plot, just set the order of the factor by:
df$variable <- factor(df$variable, levels = c(...))
You could do this by first storing the data.frame, before piping to the ggplot function, then manually setting the levels of the variables you want to change. It is maybe a bit inefficient, but this should do the trick:
## Make your plotting data.frame
df2 <- df %>%
gather(variable, value, -c(Gender))%>%
group_by(value, variable, Gender) %>%
summarise(freq=n())
## Apply custom order to MaritalStatus variable:
custom <- c(sort(unique(MaritalStatus))[c(4,3,1,2)],
....)
df2$variable <- factor(df2$variable, levels = c(levels(df2$variable)[!levels(df2$variable) %in% custom],
custom))

Related

Get row columns by group for geom_col in ggplot

I am trying to calculate row percentages by demographics of various score levels--in my data, that would be what % of white people (or % of black people, or % male, or % who have education level 2, and so on) have a score of 0 (or 1, 2, or 3)--and then use that to create a big plot.
So in my example data below, 8.33% of race == 1 (which is white) have a score of 0, 25% have a score of 1, 25% have a score of 2, and 41.67% have a score of 3.
Then the ultimate end goal would be to get some type of bar plot where the 4 levels of 'score' are across the x axis, and the various comparisons of demographics run down the y axis. Something that looks visually sort of like this, but with the levels of 'score' across the top instead of education levels: .
I already have some code to make the actual figure, which I've done in other instances but with externally/already-calculated percentages:
ggplot(data, aes(x = percent, y = category, fill = group)) +
geom_col(orientation = "y", width = .9) +
facet_grid(group~score_var,
scales = "free_y", space = "free_y") +
labs(title = "Demographic breakdown of 'Score'") +
theme_bw()
I am just struggling to figure out the best way to calculate these row percentages, presumably using group_by() and summarize and then storing or configuring them in a way that they can be plotted. Thank you.
d <- structure(list(race = c(1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3, 1,
1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3, 1, 1, 2, 2,
3, 3), gender = c(0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1,
0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1
), education = c(1, 3, 3, 2, 1, 3, 2, 3, 4, 4, 2, 3, 3, 2, 3,
4, 1, 3, 1, 3, 3, 2, 1, 3, 2, 3, 4, 4, 2, 3, 3, 2, 3, 4, 1, 3
), score = c(1, 2, 2, 1, 2, 3, 3, 2, 0, 0, 1, 2, 1, 3, 0, 0,
3, 3, 3, 3, 3, 3, 3, 3, 2, 1, 2, 3, 1, 3, 3, 0, 1, 2, 2, 0)), row.names = c(NA,
-36L), spec = structure(list(cols = list(race = structure(list(), class = c("collector_double",
"collector")), gender = structure(list(), class = c("collector_double",
"collector")), education = structure(list(), class = c("collector_double",
"collector")), score = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), delim = ","), class = "col_spec"), problems = <pointer: 0x000001bd978b0df0>, class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"))
This may get you started:
library(dplyr)
library(ggplot2)
prop <- data %>%
mutate(race = factor(race, levels = c(1, 2, 3), labels = c("White", "Black", "Others"))) %>%
group_by(race) %>%
mutate(race_n = n()) %>%
group_by(race, score) %>%
summarise(percent = round(100*n()/race_n[1], 1))
prop %>%
ggplot(aes(x = percent, y = score, fill = race)) +
geom_col(orientation = "y", width = .9) +
geom_text(aes(label = percent), hjust = 1)+
facet_grid(~race) +
labs(title = "Demographic breakdown of 'Score'") +
theme_bw()
Edit
To put all characters together, you can get a bigger graph:
df <- data %>% mutate(
gender = factor(2-gender),
race = factor(race),
education = factor(education)) %>%
pivot_longer(!score, names_to = "character", values_to = "levels")
df %>% group_by(character, levels) %>%
mutate(group_n = n()) %>%
group_by(character, levels, score) %>%
summarise(percent = round(100*n()/group_n[1], 1)) %>%
ggplot(aes(x = percent, y = score, fill = character)) +
geom_col(orientation = "y", width = .9) +
geom_text(aes(label = percent), hjust = 1)+
facet_grid(character ~ levels) +
labs(title = "Demographic breakdown of 'Score'") +
theme_bw()
please note: I have changed the code for gender.
Taking inspiration from #Zhiqiang Wang's excellent first pass at this, I finally figured out a solution. I still need to change the order of the labels (to put the education levels in order, and move the race variables to the top of the figure) but this is basically what I was envisioning.
d_test <- d %>% mutate(
gender = factor(2-gender),
race = factor(race),
education = factor(education)) %>%
pivot_longer(!score, names_to = "group", values_to = "levels")
d_test <- d_test %>% group_by(group, levels) %>%
mutate(group_n = n()) %>%
group_by(group, levels, score) %>%
summarise(percent = round(100*n()/group_n[1], 1))
d_test <- d_test %>%
mutate(var = case_when(group == "gender" & levels == 1 ~ "female",
group == "gender" & levels == 2 ~ "male",
group == "race" & levels == 1 ~ "white",
group == "race" & levels == 2 ~ "black",
group == "race" & levels == 3 ~ "hispanic",
group == "education" & levels == 1 ~ "dropout HS",
group == "education" & levels == 2 ~ "grad HS",
group == "education" & levels == 3 ~ "some coll",
group == "education" & levels == 4 ~ "grad coll"))
ggplot(d_test, aes(x = percent, y = var, fill = group)) +
geom_col(orientation = "y", width = .9) +
facet_grid(group ~ score,
scales = "free_y", space = "free_y") +
labs(title = "Demographic breakdown of 'Score'",
y = "",
x = "Percent") +
theme_minimal() +
theme(legend.position = "none",
strip.text.y = element_blank())

How to use ggplot with Haven dataset

New to coding and R but have a STATA dataset, I want to use ggplot for visulations of my data however, I get multiple errors such as
no applicable method for 'rescale' applied to an object of class "c('haven_labelled', 'vctrs_vctr', 'double')"
I dont know how to convert them so I can plot them for visualisations,
the lines of code are as followed:
Data <- read_dta("longitudinal_td.dta")
Data <- Data %>%
select(pidp,wave,age_dv,sex_dv,ethn_dv,sf1_dv,bmi_dv,sf12pcs_dv,fihhmnnet1_dv,sf12mcs_dv) %>%
filter(wave == "1", age_dv<=50)%>%
mutate(pipd = row_number(),age=age_dv, sex=sex_dv, ethnicity = ethn_dv, general_health=sf1_dv,
bmi=bmi_dv, physical_component_score=sf12pcs_dv, mental_component_score=sf12mcs_dv, household_income=fihhmnnet1_dv)%>%
select(-pipd,-age_dv,-sex_dv,-ethn_dv,-sf1_dv,-bmi_dv,-sf12pcs_dv,-sf12mcs_dv,-fihhmnnet1_dv)
I hope this is correct, here is the dput:
Essentially im just trying to explore BMI but i dont know if I can just plot these or have to assign the numbers to a label like it already is done in haven labels
dput(head(Data))
structure(list(pidp = structure(c(68001367, 68006127, 68008167,
68009527, 68010207, 68010887), label = "cross-wave person identifier (public release)", format.stata = "%12.0g"),
wave = structure(c(1, 1, 1, 1, 1, 1), label = "interview wave", format.stata = "%8.0g"),
age = structure(c(39, 39, 38, 31, 24, 45), label = "Age, derived from dob_dv and intdat_dv", format.stata = "%8.0g"),
sex = structure(c(1, 2, 2, 1, 2, 2), label = "Sex, derived", format.stata = "%8.0g", labels = c(Male = 1,
Female = 2), class = c("haven_labelled", "vctrs_vctr", "double"
)), ethnicity = structure(c(1, 1, 1, 1, 1, 1), label = "Ethnic group (derived from multiple sources)", format.stata = "%8.0g", labels = c(`white uk` = 1,
irish = 2, `gypsy or irish traveller` = 3, `any other white background` = 4,
`white and black caribbean` = 5, `white and black african` = 6,
`white and asian` = 7, `any other mixed background` = 8,
indian = 9, pakistani = 10, bangladeshi = 11, chinese = 12,
`any other asian background` = 13, caribbean = 14, african = 15,
`any other black background` = 16, arab = 17, `any other ethnic group` = 97
), class = c("haven_labelled", "vctrs_vctr", "double")),
general_health = structure(c(2, 4, 5, 3, 1, 1), label = "General health", format.stata = "%8.0g", labels = c(excellent = 1,
`very good` = 2, good = 3, fair = 4, `or Poor?` = 5), class = c("haven_labelled",
"vctrs_vctr", "double")), bmi = structure(c(29.6, 38.8, 21.5,
24.2, 25, 25.5), label = "Body Mass Index", format.stata = "%12.0g")
Thanks for posting an example of your data with dput(). The format of the data you have posted suggests that it has somehow become a list rather than a data frame. You need to convert it to a data frame - as you're using haven I would stick with the tidyverse and do it with as_tibble().
Similarly, you want the labels rather than the underlying integers. You can simply apply as_factor to the whole data frame to do this.
Your data is then ready to be piped to ggplot2. For example:
library(dplyr)
library(ggplot2)
library(haven)
Data |>
as_tibble() |>
as_factor() |>
ggplot() +
geom_boxplot(aes(x=sex, y=bmi))

How do I write a function to plot a line graph for each factor in a dataframe?

I have a dataframe, the head of which looks like this:
|trackName | week| sum|
|:--------------------|----:|---:|
|New Slang | 1| 493|
|You're Somebody Else | 1| 300|
|Mushaboom | 1| 297|
|San Luis | 1| 296|
I am interested in plotting a line graph for each of the 346 unique trackNames in the dataframe, with week on the x-axis and sum on the y-axis. To automate this process, I wrote the following function:
charts <- function(df) {
songs <- df
lim <- nrow(songs)
x <- 1
song_names <- as_tibble(unique(songs$trackName))
while (x <= lim) {
song <- song_names[x, 1]
plot.name <- paste(paste(song), "plot.png", sep = "_")
songs %>% filter(trackName == paste(song[x, 1])) %>%
ggplot(., aes(x = week, y = sum), group = 1) +
geom_line() +
labs(
x = "Week",
y = "Sum of Listens",
title = paste("Week by Week Listening Interest for", song, sep = " "),
subtitle = "Calculated by plotting the sum of percentages of the song listened per week, starting from first listen"
) +
ggsave(plot.name,
width = 20,
height = 15,
units = "cm")
x <- x + 1
}
}
However when I run charts(df), only the following error shows up and then it quits:
> charts(mini)
geom_path: Each group consists of only one observation. Do you need to
adjust the group aesthetic?
>
What am I doing wrong here and what does this error mean?
A sample of the dataframe in DPUT format:
structure(list(trackName = c("New Slang", "You're Somebody Else",
"Mushaboom", "San Luis", "The Trapeze Swinger", "Flightless Bird, American Mouth",
"tere bina - Acoustic", "Only for a Moment", "Upward Over the Mountain",
"Virginia May", "Never to Be Forgotten Kinda Year", "Little Talks",
"Jhak Maar Ke", "Big Rock Candy Mountain", "Sofia", "Aaoge Tum Kabhi",
"Deathcab", "Dil Mere", "Choke", "Phir Le Aya Dil", "Lucille",
"tere bina - Acoustic", "Dil Mere", "Only for a Moment", "This Is The Life",
"San Luis", "Main Bola Hey!", "Choo Lo", "Yeh Zindagi Hai", "Aaftaab",
"Never to Be Forgotten Kinda Year", "Khudi", "Flightless Bird, American Mouth",
"Mere Bina", "Simple Song", "Dil Haare", "Dil Hi Toh Hai", "You're Somebody Else",
"Sofia", "Who's Laughing Now", "Main Bola Hey!", "Lucille", "Eenie Meenie",
"tere bina - Acoustic", "New Slang", "Aaftaab", "Mamma Mia",
"July", "Yeh Zindagi Hai", "Someone You Loved"), week = c(1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3), sum = c(493, 300, 297, 296, 292, 234, 214,
200, 200, 197, 192, 187, 185, 181, 175, 172, 141, 119, 106, 103,
579, 574, 501, 462, 428, 378, 320, 307, 306, 301, 301, 300, 300,
300, 300, 300, 296, 294, 251, 242, 3534, 724, 696, 512, 479,
400, 302, 300, 300, 300)), row.names = c(NA, -50L), class = c("tbl_df",
"tbl", "data.frame"))
How about using purrr::walk instead?
library(tidyverse)
library(hrbrthemes)
walk(unique(songs$trackName),
~{ggsave(plot = ggplot(filter(songs, trackName == .x), aes(x = week, y = sum), group = 1) +
geom_line(color = ft_cols$yellow) +
labs(x = "Week", y = "Sum of Listens", title = paste("Week by Week Listening Interest for", .x, sep = " "),
subtitle = "Calculated by plotting the sum of percentages of the song listened per week, starting from first listen") +
theme_ft_rc(),
file = paste0(.x,"_plot.png"), width = 20, height = 15, units = "cm")})
Note: the question was subsequently edited to remove the hrbrthemes package requirement.
You can split the dataset for each trackName and create a png file for it.
library(tidyverse)
charts <- function(df) {
df %>%
group_split(trackName) %>%
map(~{
track <- first(.x$trackName)
ggplot(.x, aes(x = factor(week), y = sum, group = 1)) +
geom_line() +
labs(
x = "Week",
y = "Sum of Listens",
title = paste("Week by Week Listening Interest for", track),
subtitle = "Calculated by plotting the sum of percentages of the song listened per week, starting from first listen"
) -> plt
ggsave(paste0(track,'.png'), plt, width = 20, height = 15, units = "cm")
})
}
charts(songs)

Struggling to find the total number of rows that meet a certain variable grouped by another variable

I'm performing some light analysis on an NFL kickers' dataset, and am trying to find the total number of kicks made from 18-29yds grouped by each kicker. The dataset's rows contain every made or missed field goal for each kicker, along with the distance and some other variables irrelevant to this issue. I'm using groupby() and then the sum function within the summarise function, but it is returning 1 for every kicker. I have tried different combinations, trying to use filter() as well, but the results keep returning 1 for each kicker. Pics of my code are attached. Any help is appreciated :)
Some code I have tried:
kicks20to29 <- nfl_kicks1%>%
group_by(Kicker)%>%
count(filter(nfl_kicks1$`FG Length`>=18 & nfl_kicks1$`FG Length`<=29))
kicks20to29 <- nfl_kicks1%>%
group_by(Kicker)%>%
filter(`FG Length`>=18 & `FG Length`<=29)
dput output:
structure(list(Quarter = c(1, 2, 1, 2, 2, 4), `Possession Team` = c("NE",
"NE", "NE", "NE", "NE", "NE"), `Wind Speed` = c("6", "6", "12",
"12", "12", "12"), Down = c(4, 4, 4, 4, 4, 4), Distance = c(13,
7, 2, 6, 9, 12), YardLine = c(22, 20, 2, 6, 35, 25), `FG Length` = c(39,
37, 19, 23, 52, 42), `4Q to tie or take lead` = c(0, 0, 0, 0,
0, 0), Result = c("Miss", "Miss", "Good", "Good", "Good", "Miss"
), `Success Rate` = c(0, 0, 1, 1, 1, 0), Kicker = c("A.Vinatieri",
"A.Vinatieri", "A.Vinatieri", "A.Vinatieri", "A.Vinatieri", "A.Vinatieri"
), `# career kicks in study` = c(766, 766, 766, 766, 766, 766
)), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"
))
One approach is to use the tally function, which counts the number of rows per group.
library(tidyverse)
nfl_kicks1 %>%
group_by(Kicker) %>%
dplyr::filter(`FG Length` >= 18 & `FG Length` <= 29) %>%
tally(name = "Number of Kicks")
## A tibble: 1 x 2
# Kicker `Number of Kicks`
# <chr> <int>
#1 A.Vinatieri 2
You can use group_by + summarise :
library(dplyr)
nfl_kicks1 %>%
group_by(Kicker) %>%
summarise(n_kicks = sum(`FG Length` >= 18 & `FG Length` <= 29))

Selectively apply custom function based on criteria

I am working with this dataframe:
structure(list(year = c("2012", "2016", "2012", "2016"), month = c("12",
"12", "12", "12"), company = c("ALSN", "ALSN", "DAN", "DAN"),
Revenue = c(2141.8, 1840.2, 7224, 5826), `Cost of Goods Sold` = c(1187.5,
976, 6250, 4982), `Gross Profit` = c(954.3, 864.2, 974, 844
), `Gross Margin %` = c(44.56, 46.96, 13.48, 14.49), `Selling, General, & Admin. Expense` = c(419,
323.9, 424, 406), `Impairment Of Capital Assets` = c(0, 0,
2, 0), Advertising = c(1, 1, 1, 1), `Research & Development` = c(115.1,
88.8, 0, 0), `Restructuring And Mergern Acquisition` = c(0,
0, 47, 0), `Other Operating Expense` = c(-5.68434188608e-14,
1.13686837722e-13, 121, 8), `Operating Income` = c(420.2,
451.5, 429, 430), `Operating Margin %` = c(19.62, 24.54,
5.94, 7.38), `Interest Income` = c(0.9, 0.7, 24, 13), `Interest Expense` = c(-152.1,
-101.6, -84, -113), `Net Interest Income` = c(-151.2, -100.9,
-60, -100), `Other Income (Expense)` = c(-52.8, -9.3, -5,
-115), `Non Operating Income` = c(-52.8, -9.3, -5, -115),
`Other Income (Minority Interest)` = c(0, 0, -15, -13), `Gain on Sale of Security` = c(-1.3,
-0.8, 0, 7), `Write Off` = c(1, 1, 1, 1), `Pre-Tax Income` = c(216.2,
341.3, 364, 215), `Tax Provision` = c(298, -126.4, -51, 424
), `Tax Rate %` = c(-137.84, 37.03, 14.01, -197.21), `Net Income (Continuing Operations)` = c(514.2,
214.9, 315, 653), `Net Income (Discontinued Operations)` = c(0,
0, 0, 0), `Net Income` = c(514.2, 214.9, 300, 640), `Net Margin %` = c(24.01,
11.68, 4.15, 10.99), `Preferred Dividends` = c(0, 0, 31,
0), `EPS (Basic)` = c(2.83, 1.28, 1.82, 4.38), `EPS (Diluted)` = c(2.76,
1.27, 1.4, 4.36), `Shares Outstanding (Diluted Average)` = c(186.2,
168.8, 214.7, 146.8), `Depreciation, Depletion and Amortization` = c(252.5,
175.9, 277, 182), EBITDA = c(620.8, 618.8, 725, 510)), .Names = c("year",
"month", "company", "Revenue", "Cost of Goods Sold", "Gross Profit",
"Gross Margin %", "Selling, General, & Admin. Expense", "Impairment Of Capital Assets",
"Advertising", "Research & Development", "Restructuring And Mergern Acquisition",
"Other Operating Expense", "Operating Income", "Operating Margin %",
"Interest Income", "Interest Expense", "Net Interest Income",
"Other Income (Expense)", "Non Operating Income", "Other Income (Minority Interest)",
"Gain on Sale of Security", "Write Off", "Pre-Tax Income", "Tax Provision",
"Tax Rate %", "Net Income (Continuing Operations)", "Net Income (Discontinued Operations)",
"Net Income", "Net Margin %", "Preferred Dividends", "EPS (Basic)",
"EPS (Diluted)", "Shares Outstanding (Diluted Average)", "Depreciation, Depletion and Amortization",
"EBITDA"), row.names = c(NA, 4L), class = "data.frame")
Constants:
startDate <- "2012-01-01"
endDate <- "2016-12-31"
What I want: to create a function that applies a custom function to all numeric columns. I am trying to calculate CAGRs. The CAGR formula is as such:
((End Value / Beginning Value)^(1/number of years)-1)
So as you can see, I need for each column to be able to find the correct end value and beginning value.
My function right now is this:
cagr <- function(startval,endval,x,y,years){
return(((endval[x == year(endDate)]/startval[y == year(startDate)])^(1/(years-1)))-1)
}
cagrNew <- function(df,colum,x,y,years){
colum <- quo(colum)
x <- quo(x)
y <- quo(y)
out <- df %>%
group_by(!!company) %>%
summarise(xxxx = cagr(!!colum[!!x == year(endDate)],!!colum[!!y == year(startDate)],!!x,!!y,numYears))
return(out)
}
When I run the above function (cagrNEW), I get this error:
Error in `[.formula`(colum, !(!x == year(endDate))) :
attempt to set an attribute on NULL
My desired output:
Company RevenueCagr Cost of Goods Sold CAGR ....
ALSN .5% .3%
DAN .3% .2%
I haven't repeated the data above, to conserve space. Convert to tibble and assign.
# df <- as_tibble(...)
library(tidyverse)
library(scales) #< For percentage formatting
start_year <- 2012
end_year <- 2016
df %>%
filter(year %in% c(start_year, end_year)) %>%
group_by(company) %>%
arrange(desc(year), .by_group = TRUE) %>%
summarise_if(is.double, funs(CAGR = percent( (.[[1]]/.[[2]])^ (1/(end_year - start_year)) - 1) ) )
# CAGR = ((End Value / Beginning Value)^(1/number of years)-1)
#Checksum: ALSN Company, Revenue
# (End Value / Beginning Value)^((1/number of years))-1
percent(( (1840 / 2142) ^ (1/(2016-2012)) - 1))
#> [1] "-3.73%"

Resources