Case_when adding on a function - r

I'm trying to write a function in R that generates three values by looping over each row of a df: it checks the value of one column and, if the row meets the condition, adds the value of another column for that row to a running total.
I thought that using the case_when construction would work best for this, but should I be using an lapply construction instead?
# Intended to build one row per team holding goals scored, goals conceded and
# games played, accumulated over the matches in `matches_df`.
# NOTE(review): as written this function does not parse; see the notes below.
get_home_away_goals_for_team <- function(matches_df, team_list){
# Empty result frame that one row per team is appended to.
complete_df <- data.frame(team = character(), goals_scored= double(),goals_conceded = double(), games_played = double())
# NOTE(review): this iterates over `teams`, which is not a parameter of the
# function; the `team_list` argument is never used.
for (team in teams){
print(team)
goals_scored <- 0
goals_conceded <- 0
games_played <- 0
# NOTE(review): `condition ~ name = value` is not valid inside a call. Within
# an argument list `=` names an argument and needs a plain name on its left,
# so the parser rejects these lines. case_when() also only returns a vector of
# values; it cannot run assignments, and for each element it uses only the
# first condition that is TRUE, so the repeated conditions below would never
# all apply.
case_when(
matches_df$home == team ~ goals_scored = goals_scored + matches_df$hg,
matches_df$home == team ~ goals_conceded = goals_conceded + matches_df$ag,
matches_df$away == team ~ goals_scored = goals_scored + matches_df$ag,
matches_df$away == team ~ goals_conceded = goals_conceded + matches_df$hg,
matches_df$home == team ~ games_played = games_played + 1,
matches_df$away == team ~ games_played = games_played + 1)
# Append this team's totals to the running result.
temp_get_goals_df = data.frame(team,goals_scored,goals_conceded,games_played)
complete_df <- rbind(complete_df,temp_get_goals_df)
}
complete_df
}
The function takes a value of team, checks for each row whether this team was playing in a game home or away, then adds to the values of goals scored accordingly.
When I try to use the function though, I get the error Error: unexpected '}' in " }" which makes me think I'm using case_when incorrectly.
Is this the case ?
Data:
matches_df_example :
structure(list(home = c("Colorado Rapids", "Vancouver Whitecaps",
"DC United", "Los Angeles Galaxy", "San Jose Earthquakes", "FC Dallas"
), away = c("Columbus Crew", "Club de Foot Montreal", "Sporting Kansas City",
"Real Salt Lake", "New England Revolution", "New York Red Bulls"
), res = c("H", "H", "A", "A", "H", "H"), season = c(2012, 2012,
2012, 2012, 2012, 2012), hg = c(2, 2, 0, 1, 1, 2), ag = c(0,
0, 1, 3, 0, 1), date_time = structure(c(1331420400, 1331420400,
1331425800, 1331436600, 1331436600, 1331492400), tzone = "UTC", class = c("POSIXct",
"POSIXt")), home_conference = c("West", "West", "East", "West",
"West", "West"), away_conference = c("East", "East", "East",
"West", "East", "East")), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
teams
c("Colorado Rapids", "Vancouver Whitecaps", "DC United", "Los Angeles Galaxy",
"San Jose Earthquakes", "FC Dallas", "Chivas USA", "Portland Timbers",
"Club de Foot Montreal", "Sporting Kansas City", "Real Salt Lake",
"Seattle Sounders", "Philadelphia Union", "Toronto FC", "Columbus Crew",
"New England Revolution", "Chicago Fire", "New York Red Bulls",
"Houston Dynamo", "Orlando City", "New York City", "Atlanta United",
"Minnesota United", "Los Angeles FC", "FC Cincinnati", "Atlanta Utd",
"Nashville SC", "Inter Miami", "Austin FC")

Perhaps this?
# Per-team totals built from two views of every match: the home side's and
# the away side's.

# Home view: the home team scored `hg` and conceded `ag`.
homes <- matches_df %>%
  group_by(team = home) %>%
  summarize(
    goals_scored = sum(hg),
    goals_conceded = sum(ag),
    games_played = n(),
    .groups = "drop"
  )

# Away view: the columns swap roles, so the away team scored `ag` and
# conceded `hg`.
aways <- matches_df %>%
  group_by(team = away) %>%
  summarize(
    goals_scored = sum(ag),
    goals_conceded = sum(hg),
    games_played = n(),
    .groups = "drop"
  )

# Combine both views and add every team from `teams` that has no match at all.
# The home and away figures are ADDED (each missing side counted as 0), so a
# team with both home and away games gets its full totals rather than only
# its home numbers.
full_join(homes, aways, by = "team", suffix = c("", ".y")) %>%
  full_join(tibble(team = teams), by = "team") %>%
  transmute(
    team,
    goals_scored = coalesce(goals_scored, 0) + coalesce(goals_scored.y, 0),
    goals_conceded = coalesce(goals_conceded, 0) + coalesce(goals_conceded.y, 0),
    games_played = coalesce(games_played, 0) + coalesce(games_played.y, 0)
  )
# # A tibble: 29 x 4
# team goals_scored goals_conceded games_played
# <chr> <dbl> <dbl> <dbl>
# 1 Colorado Rapids 2 0 1
# 2 DC United 0 1 1
# 3 FC Dallas 2 1 1
# 4 Los Angeles Galaxy 1 3 1
# 5 San Jose Earthquakes 1 0 1
# 6 Vancouver Whitecaps 2 0 1
# 7 Club de Foot Montreal 0 2 1
# 8 Columbus Crew 0 2 1
# 9 New England Revolution 0 1 1
# 10 New York Red Bulls 1 2 1
# # ... with 19 more rows

Related

Trying to calculate the Expected Value of an observation

I have a tibble and am trying to use values from two specific rows (Pinnacle book) to perform a calculation. The values of the calculation will be written to a new column. Here is the output of dput
structure(list(id = c("5d8f6b2536fbdc4ab6a3e9759ebc6c51", "5d8f6b2536fbdc4ab6a3e9759ebc6c51",
"5d8f6b2536fbdc4ab6a3e9759ebc6c51", "5d8f6b2536fbdc4ab6a3e9759ebc6c51",
"5d8f6b2536fbdc4ab6a3e9759ebc6c51", "5d8f6b2536fbdc4ab6a3e9759ebc6c51"
), start = structure(c(1676691000, 1676691000, 1676691000, 1676691000,
1676691000, 1676691000), tzone = "UTC", class = c("POSIXct",
"POSIXt")), book = c("BetUS", "BetUS", "Bovada", "Bovada", "Pinnacle",
"Pinnacle"), home = c("San José St Spartans", "San José St Spartans",
"San José St Spartans", "San José St Spartans", "San José St Spartans",
"San José St Spartans"), away = c("New Mexico Lobos", "New Mexico Lobos",
"New Mexico Lobos", "New Mexico Lobos", "New Mexico Lobos", "New Mexico Lobos"
), team = c("San José St Spartans", "New Mexico Lobos", "San José St Spartans",
"New Mexico Lobos", "San José St Spartans", "New Mexico Lobos"
), price = c(-140, 120, -140, 120, -138, 117), update = c("2023-02-18T00:24:43Z",
"2023-02-18T00:24:43Z", "2023-02-18T00:25:10Z", "2023-02-18T00:25:10Z",
"2023-02-18T00:25:04Z", "2023-02-18T00:25:04Z"), bep = c(0.58333,
0.45455, 0.58333, 0.45455, 0.57983, 0.46083), no_vig = c(-128.33333,
128.33333, -128.33333, 128.33333, -125.82353, 125.82353), no_vig_bep = c(0.56204,
0.43796, 0.56204, 0.43796, 0.55718, 0.44282), win = c(71.43,
120, 71.43, 120, 72.46, 117)), class = c("grouped_df", "tbl_df",
"tbl", "data.frame"), row.names = c(NA, -6L), groups = structure(list(
id = c("5d8f6b2536fbdc4ab6a3e9759ebc6c51", "5d8f6b2536fbdc4ab6a3e9759ebc6c51",
"5d8f6b2536fbdc4ab6a3e9759ebc6c51", "5d8f6b2536fbdc4ab6a3e9759ebc6c51",
"5d8f6b2536fbdc4ab6a3e9759ebc6c51", "5d8f6b2536fbdc4ab6a3e9759ebc6c51"
), book = c("BetUS", "BetUS", "Bovada", "Bovada", "Pinnacle",
"Pinnacle"), team = c("New Mexico Lobos", "San José St Spartans",
"New Mexico Lobos", "San José St Spartans", "New Mexico Lobos",
"San José St Spartans"), .rows = structure(list(2L, 1L,
4L, 3L, 6L, 5L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE))
The following is the calculation
-4.482633 = (.55718 * 71.43) - (.44282 * 100)
The values in the calculation above correspond with the following variables
-4.482633 = Expected Value I am trying to derive
.55718 = "no_vig_bep" of Pinnacle
71.43 = "win" of observation 1
.44282 = 1 - "no_vig_bep" of Pinnacle or the last row
100 = a set amount
I would then like to calculate the other side of the odds as follows:
-2.5796 = (.44282 * 120.00) - (.55718 * 100)
The ultimate goal is to use the values of the Pinnacle book to perform the above calculation against all other books. The EV will be written to a new column.
Included additional id for further clarification
structure(list(id = c("073c154f3c8586868a3ba21522161a70",
"073c154f3c8586868a3ba21522161a70",
"073c154f3c8586868a3ba21522161a70", "073c154f3c8586868a3ba21522161a70",
"073c154f3c8586868a3ba21522161a70", "073c154f3c8586868a3ba21522161a70"
), book = c("Bovada", "Pinnacle", "MyBookie.ag", "MyBookie.ag",
"Pinnacle", "Bovada"), home = c("Western Michigan Broncos", "Western
Michigan Broncos",
"Western Michigan Broncos", "Western Michigan Broncos", "Western
Michigan Broncos",
"Western Michigan Broncos"), away = c("Ball State Cardinals",
"Ball State Cardinals", "Ball State Cardinals", "Ball State Cardinals",
"Ball State Cardinals", "Ball State Cardinals"), team = c("Western
Michigan Broncos",
"Ball State Cardinals", "Western Michigan Broncos", "Ball State
Cardinals",
"Western Michigan Broncos", "Ball State Cardinals"), price = c(-185,
-143, -142, 100, 108, 140), bep = c(0.64912, 0.58848, 0.58678,
0.5, 0.48077, 0.41667), no_vig = c(-155.78947, -122.40329, -117.35537,
117.35537, 122.40329, 155.78947), no_vig_bep = c(0.60905, 0.55037,
0.53992, 0.46008, 0.44963, 0.39095), win = c(54.05, 69.93, 70.42,
100, 108, 140), EV_1 = c(-15.2155015, -6.47562589999999,
-6.20594459999999,
-10.074, -6.47696000000001, 7.91119999999999)), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -6L), groups =
structure(list(
book = c("Bovada", "MyBookie.ag", "Pinnacle"), .rows = structure(list(
c(1L, 6L), 3:4, c(2L, 5L)), ptype = integer(0), class =
c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -3L), .drop = TRUE))
Edit1: Maybe purrr can help you here:
library(purrr)
# One list element per game id, each holding the `no_vig_bep` values of that
# game's Pinnacle rows, in the order those rows appear in df.
no_vig_pin_list <- df |>
ungroup() |>
split(df$id) |>
map(~.x |> filter(book == "Pinnacle") |> pull(no_vig_bep))
# Split df by id and pair every per-id piece (.x) with that id's Pinnacle
# probabilities (.y). Within each book, the first row is priced with .y[1] and
# every other row with .y[2]:
#   EV_1 = p * win - (1 - p) * 100
# NOTE(review): this is positional. It assumes each book lists its two sides
# in the same order as the Pinnacle rows of the same game, and that split()
# and group_split() return the ids in the same order -- confirm both against
# the real data.
df |>
ungroup() |>
group_split(id) |>
purrr::map2_dfr(no_vig_pin_list, ~ .x |>
group_by(book) |>
mutate(EV_1 = ifelse(row_number() == 1,
(.y[1] * win)- ((1-.y[1])*100),
(.y[2] * win)- ((1-.y[2])*100)))) |>
select(EV_1)
A tibble: 8 × 2
# Groups: book [2]
book EV_1
<chr> <dbl>
1 BetUS -5.03
2 BetUS -1.10
3 Pinnacle -4.07
4 Pinnacle -4.07
5 BetUS -4.48
6 BetUS -2.58
7 Pinnacle -3.91
8 Pinnacle -3.91
Maybe this helps, I am not quite sure that I understood what you are trying to achieve.
library(dplyr)
# `no_vig_bep` of the Pinnacle rows, in the order they appear in df.
no_vig_pin <- df |>
filter(book == "Pinnacle") |>
pull(no_vig_bep)
# Expected value per row, EV_1 = p * win - (1 - p) * 100, where p is a
# Pinnacle probability: the first row of every book uses no_vig_pin[1], all
# other rows use no_vig_pin[2].
# NOTE(review): this is positional. It assumes df holds a single game and that
# each book lists its two sides in the same order as Pinnacle does -- confirm
# before using it on data with several ids or a different row order.
df |>
group_by(book) |>
mutate(EV_1 = ifelse(row_number() == 1,
(no_vig_pin[1] * win)- ((1-no_vig_pin[1])*100),
(no_vig_pin[2] * win)- ((1-no_vig_pin[2])*100))) |>
select(EV_1)
Output:
# A tibble: 6 × 2
# Groups: book [3]
book EV_1
<chr> <dbl>
1 BetUS -4.48
2 BetUS -2.58
3 Bovada -4.48
4 Bovada -2.58
5 Pinnacle -3.91
6 Pinnacle -3.91

retain only rows and columns that match with a string vector

I have a large DF with certain columns that have a vector of character values as below. The number of columns varies from dataset to dataset as well as the number of character vectors it holds also varies.
ID Country1 Country2 Country3
1 1 Argentina, Japan,USA,Poland, Argentina,USA Pakistan
2 2 Colombia, Mexico,Uruguay,Dutch Mexico,Uruguay Afganisthan
3 3 Argentina, Japan,USA,NA Japan Khazagistan
4 4 Colombia, Mexico,Uruguay,Dutch Colombia, Dutch North Korea
5 5 India, China China Iran
Would like to match them one-to-one with another string vector as below
vals_to_find <-c("Argentina","USA","Mexico")
If a column/row matches any one of the strings passed, I would like to retain that column and row, remove duplicates, and finally remove the values that do not match.
the desired output is as follows
ID Countries.found
1 1 Argentina, USA
2 2 Mexico
3 3 Argentina, USA
4 4 Mexico
data
dput(df)
structure(list(ID = 1:5, Country1 = c("Argentina, Japan,USA,Poland,",
"Colombia, Mexico,Uruguay,Dutch", "Argentina, Japan,USA,NA",
"Colombia, Mexico,Uruguay,Dutch", "India, China"), Country2 = c("Argentina,USA",
"Mexico,Uruguay", "Japan", "Colombia, Dutch", "China"), Country3 = c("Pakistan",
"Afganisthan", "Khazagistan", "North Korea", "Iran")), class = "data.frame", row.names = c(NA,
-5L))
dput(df_out)
structure(list(ID = 1:4, Countries.found = c("Argentina, USA",
"Mexico", "Argentina, USA", "Mexico")), class = "data.frame", row.names = c(NA,
-4L))
If, instead of each column holding a vector of values, the file is read with one value per column, then I was able to do it as below:
dput(df_out)
structure(list(ID = 1:5, X1 = c("Argentina", "Colombia", "Argentina",
"Colombia", "India"), X2 = c("Japan", "Mexico", "Japan", "Mexico",
"China"), X3 = c("USA", "Uruguay", "USA", "Uruguay", NA), X4 = c("Poland",
"Dutch", NA, "Dutch", NA), X5 = c("Argentina", "Mexico", "Japan",
"Colombia", "China"), X6 = c("USA", "Uruguay", NA, "Dutch", NA
), X7 = c("Pakistan", "Afganisthan", "Khazagistan", "North Korea",
"Iran")), class = "data.frame", row.names = c(NA, -5L))
df_out %>%
dplyr::select(
where(~ !all(is.na(.x)))
) %>%
dplyr::select(c(1, where(~ any(.x %in% vals_to_find)))) %>%
dplyr::mutate(dplyr::across(
tidyselect::starts_with("X"),
~ vals_to_find[match(., vals_to_find)]
)) %>%
tidyr::unite("countries_found", tidyselect::starts_with("X"),
sep = " | ", remove = TRUE, na.rm = TRUE
)
Output
ID countries_found
1 1 Argentina | USA | Argentina | USA
2 2 Mexico | Mexico
3 3 Argentina | USA
4 4 Mexico
unite the "Country" columns, then create a long vector by separating the values into rows, get all distinct values per ID, filter only those who are in vals_to_find, and summarise each countries.found toString.
library(tidyr)
library(dplyr)
# For every ID, list the distinct countries from `vals_to_find` that occur in
# any of the Country* columns.
df %>%
  # Glue all Country* columns into one comma-separated string per row.
  unite("Country", starts_with("Country"), sep = ",") %>%
  # Split on commas only (plus surrounding blanks). The default separator of
  # separate_rows() also splits on spaces, which would break multi-word names
  # such as "North Korea" into pieces that can never match `vals_to_find`.
  separate_rows(Country, sep = "\\s*,\\s*") %>%
  distinct(ID, Country) %>%
  filter(Country %in% vals_to_find) %>%
  group_by(ID) %>%
  summarise(Countries.found = toString(Country))
output
# A tibble: 4 × 2
ID Countries.found
<int> <chr>
1 1 Argentina, USA
2 2 Mexico
3 3 Argentina, USA
4 4 Mexico
We may use
library(dplyr)
library(tidyr)
library(stringr)
df %>%
mutate(across(starts_with("Country"),
~ str_extract_all(.x, str_c(vals_to_find, collapse = "|")))) %>%
pivot_longer(cols = -ID, names_to = NULL,
values_to = 'Countries.found') %>%
unnest(Countries.found) %>%
distinct %>%
group_by(ID) %>%
summarise(Countries.found = toString(Countries.found))
-output
# A tibble: 4 × 2
ID Countries.found
<int> <chr>
1 1 Argentina, USA
2 2 Mexico
3 3 Argentina, USA
4 4 Mexico

Mutate a new column based on multiple conditions in R

Assuming the following dataset:
df <- structure(list(id = 1:9, city = structure(c(1L, 7L, 2L, 6L, 4L,
9L, 3L, 8L, 5L), .Label = c("bj", "gz", "lz", "nj", "sh", "sz",
"tj", "wh", "xa"), class = "factor")), class = "data.frame", row.names = c(NA,
-9L))
How could I create a new column direction based on the following conditions:
if city is in list ['bj', 'tj'], then returns north for direction, if in ['sz', 'nj', 'sh'] returns east, if in ['xa', 'lz'] returns west, if in ['wh'] returns center, if in ['gz', 'sz'] returns south.
The expected result will like this:
My code:
df %>%
filter(city %in% c('bj', 'tj')) %>%
mutate(direction = 'north')
Out:
Use case_when :
library(dplyr)
# Add a `direction` column derived from `city`.
# case_when() uses the first condition that is TRUE for each row, so 'sz'
# (listed under both east and south) becomes "east"; the 'sz' entry in the
# south line never takes effect. Rows matching no condition get NA.
df %>%
mutate(direction = case_when(city %in% c('bj', 'tj') ~ 'north',
city %in% c('sz', 'nj', 'sh') ~ 'east',
city %in% c('xa', 'lz') ~ 'west',
city %in% c('wh') ~ 'center',
city %in% c('gz', 'sz') ~ 'south',
))
# id city direction
#1 1 bj north
#2 2 tj north
#3 3 gz south
#4 4 sz east
#5 5 nj east
#6 6 xa west
#7 7 lz west
#8 8 wh center
#9 9 sh east
You can do it in an easy way using basic R data.frame manipulation:
# Start with an empty label, then overwrite it group by group.
# The assignments run in order, so a city listed in more than one group keeps
# the LAST label assigned: 'sz' ends up "south" here (it is first set to
# "east" and then overwritten). Cities in none of the groups keep "".
df$direction <- ""
df[df$city %in% c('bj', 'tj'), "direction"] <- "north"
df[df$city %in% c('sz', 'nj', 'sh'),"direction"] <- "east"
df[df$city %in% c('xa', 'lz'), "direction"] <- "west"
df[df$city %in% c('wh'), "direction"] <- "center"
df[df$city %in% c('gz', 'sz'), "direction"] <- "south"
df
id city direction
1 1 bj north
2 2 tj north
3 3 gz south
4 4 sz south
5 5 nj east
6 6 xa west
7 7 lz west
8 8 wh center
9 9 sh east
Using nested ifelse statements can do the job as well.
# Nested ifelse(): conditions are tested from the outside in, so the first
# match wins ('sz' becomes "east"). "center" is the final fallback, so it is
# given to 'wh' and also to any city that matches none of the lists.
df$direction=ifelse(df$city %in% c("bj","tj"), yes = "north",
ifelse(df$city %in% c('sz', 'nj', 'sh'), yes = "east",
ifelse(df$city %in% c("xa", "lz"), yes = "west",
ifelse(df$city %in% c("gz", "sz"), yes = "south", no = "center"))))
You can try stack to create a dictionary first and then match the cities, e.g.,
d <- stack(
list(
north = c("bj", "tj"),
east = c("sz", "nj", "sh"),
west = c("xa", "lz"),
center = "wh",
south = c("gz", "sz")
)
)
df <- transform(
df,
direction = d$ind[match(city,d$values)]
)
which gives
id city direction
1 1 bj north
2 2 tj north
3 3 gz south
4 4 sz east
5 5 nj east
6 6 xa west
7 7 lz west
8 8 wh center
9 9 sh east
I have also tried another solution with mutate, but the error message is the same, about the wrong symbol.
Here I loaded the dplyr library first.
library(dplyr)
new_soccer_referee %>%
mutate(postion_new = case_when (position %in% c("Right Fullback", "Left Fullback", "Center Back", "Defensive Midfielder") ~ "Defense",
position %in% c("Right Midfielder", "Left Midfielder", "Center Midfielder") ~ "Midfield",
position %in% c("Attacking Midfielder", "Right Winger", "Left Winger", "Center Forward") ~ "Offense",
))

Grouping by Multiple variables and summarizing character frequencies

I am trying to group my dataset by multiple variables and build a frequency table of the number of times a character variable appears. Here is an example data set:
Location State County Job Pet
Ohio Miami Data Dog
Urban Ohio Miami Business Dog, Cat
Urban Ohio Miami Data Cat
Rural Kentucky Clark Data Cat, Fish
City Indiana Shelby Business Dog
Rural Kentucky Clark Data Dog, Fish
Ohio Miami Data Dog, Cat
Urban Ohio Miami Business Dog, Cat
Rural Kentucky Clark Data Fish
City Indiana Shelby Business Cat
I want my output to look like this:
Location State County Job Frequency Pet:Cat Pet:Dog Pet:Fish
Ohio Miami Data 2 1 2 0
Urban Ohio Miami Business 2 2 2 0
Urban Ohio Miami Data 1 1 0 0
Rural Kentucky Clark Data 3 1 1 3
City Indiana Shelby Business 2 1 1 0
I have tried different iterations of the following code, and I get close, but not quite right:
Output<-df%>%group_by(Location, State, County, Job)%>%
dplyr::summarise(
Frequency= dplyr::n(),
Pet:Cat = count(str_match(Pet, "Cat")),
Pet:Dog = count(str_match(Pet, "Dog")),
Pet:Fish = count(str_match(Pet, "Fish")),
)
Any help would be appreciated! Thank you in advance
Try this:
library(dplyr)
library(tidyr)
#Code
# Frequency table per Location/State/County/Job: the number of source rows in
# the group (Freq) plus one count column per pet ("Pet:Cat", "Pet:Dog", ...).
# The result stays grouped by the four key columns.
new <- df %>%
  group_by(Location, State, County, Job) %>%
  # Count the rows of each group BEFORE Pet is split. Counting after the
  # per-pet summary would give the number of distinct pets in the group, which
  # only equals the row count by coincidence.
  mutate(Freq = n()) %>%
  # One row per individual pet; trim the blank left behind by ", ".
  separate_rows(Pet, sep = ",") %>%
  mutate(Pet = paste0("Pet:", trimws(Pet))) %>%
  # Occurrences of each pet within the group (Freq is constant per group, so
  # carrying it along does not split the groups any further).
  count(Freq, Pet, name = "N") %>%
  pivot_wider(names_from = Pet, values_from = N, values_fill = 0)
Output:
# A tibble: 5 x 8
# Groups: Location, State, County, Job [5]
Location State County Job Freq `Pet:Cat` `Pet:Dog` `Pet:Fish`
<chr> <chr> <chr> <chr> <int> <int> <int> <int>
1 "" Ohio Miami Data 2 1 2 0
2 "City" Indiana Shelby Business 2 1 1 0
3 "Rural" Kentucky Clark Data 3 1 1 3
4 "Urban" Ohio Miami Business 2 2 2 0
5 "Urban" Ohio Miami Data 1 1 0 0
Some data used:
#Data
df <- structure(list(Location = c("", "Urban", "Urban", "Rural", "City",
"Rural", "", "Urban", "Rural", "City"), State = c("Ohio", "Ohio",
"Ohio", "Kentucky", "Indiana", "Kentucky", "Ohio", "Ohio", "Kentucky",
"Indiana"), County = c("Miami", "Miami", "Miami", "Clark", "Shelby",
"Clark", "Miami", "Miami", "Clark", "Shelby"), Job = c("Data",
"Business", "Data", "Data", "Business", "Data", "Data", "Business",
"Data", "Business"), Pet = c("Dog", "Dog, Cat", "Cat", "Cat, Fish",
"Dog", "Dog, Fish", "Dog, Cat", "Dog, Cat", "Fish", "Cat")), row.names = c(NA,
-10L), class = "data.frame")

Combining data with Base R

I currently need to translate my dplyr code into base R code. My dplyr code gives me 3 columns, competitor sex, the olympic season and the number of different sports. The code looks like this:
olympics %>%
group_by(Sex, Season, Sport) %>%
summarise(n()) %>%
group_by(Sex, Season) %>%
summarise(n()) %>%
setNames(c("Competitor_Sex", "Olympic_Season", "Num_Sports"))
My data structure looks like this.
structure(list(Name = c("A Lamusi", "Juhamatti Tapio Aaltonen",
"Andreea Aanei", "Jamale (Djamel-) Aarrass (Ahrass-)", "Nstor Abad Sanjun",
"Nstor Abad Sanjun"), Sex = c("M", "M", "F", "M", "M", "M"),
Age = c(23L, 28L, 22L, 30L, 23L, 23L), Height = c(170L, 184L,
170L, 187L, 167L, 167L), Weight = c(60, 85, 125, 76, 64,
64), Team = c("China", "Finland", "Romania", "France", "Spain",
"Spain"), NOC = c("CHN", "FIN", "ROU", "FRA", "ESP", "ESP"
), Games = c("2012 Summer", "2014 Winter", "2016 Summer",
"2012 Summer", "2016 Summer", "2016 Summer"), Year = c(2012L,
2014L, 2016L, 2012L, 2016L, 2016L), Season = c("Summer",
"Winter", "Summer", "Summer", "Summer", "Summer"), City = c("London",
"Sochi", "Rio de Janeiro", "London", "Rio de Janeiro", "Rio de Janeiro"
), Sport = c("Judo", "Ice Hockey", "Weightlifting", "Athletics",
"Gymnastics", "Gymnastics"), Event = c("Judo Men's Extra-Lightweight",
"Ice Hockey Men's Ice Hockey", "Weightlifting Women's Super-Heavyweight",
"Athletics Men's 1,500 metres", "Gymnastics Men's Individual All-Around",
"Gymnastics Men's Floor Exercise"), Medal = c(NA, "Bronze",
NA, NA, NA, NA), BMI = c(20.7612456747405, 25.1063327032136,
43.2525951557093, 21.7335354170837, 22.9481157445588, 22.9481157445588
)), .Names = c("Name", "Sex", "Age", "Height", "Weight",
"Team", "NOC", "Games", "Year", "Season", "City", "Sport", "Event",
"Medal", "BMI"), row.names = c(NA, 6L), class = "data.frame")
Does anyone know how to translate this into base R?
Since you are grouping twice in dplyr you can use double aggregate in base R
# Inner aggregate(): one row per Sex/Season/Sport combination (the count in
# `Name` is not needed, only the row).
# Outer aggregate(): number of those rows per Sex/Season, i.e. the number of
# different sports. setNames() then applies the requested column names.
setNames(aggregate(Name~Sex + Season,
aggregate(Name~Sex + Season + Sport, olympics, length), length),
c("Competitor_Sex", "Olympic_Season", "Num_Sports"))
# Competitor_Sex Olympic_Season Num_Sports
#1 F Summer 1
#2 M Summer 3
#3 M Winter 1
This gives the same output as dplyr option
library(dplyr)
olympics %>%
group_by(Sex, Season, Sport) %>%
summarise(n()) %>%
group_by(Sex, Season) %>%
summarise(n()) %>%
setNames(c("Competitor_Sex", "Olympic_Season", "Num_Sports"))
# Competitor_Sex Olympic_Season Num_Sports
# <chr> <chr> <int>
#1 F Summer 1
#2 M Summer 3
#3 M Winter 1
A base R option would be using aggregate twice
# Number of different sports per Sex/Season.
# De-duplicate the Sex/Season/Sport combinations first, then count the
# remaining rows per Sex/Season. Counting `Sport` itself (rather than an
# unrelated column such as BMI) matters because the formula interface of
# aggregate() drops every row with an NA in a column named in the formula, so
# sports whose athletes all lack a BMI would silently disappear.
out <- aggregate(
  Sport ~ Sex + Season,
  data = unique(olympics[c("Sex", "Season", "Sport")]),
  FUN = length
)
names(out) <- c("Competitor_Sex", "Olympic_Season", "Num_Sports")
out
# Competitor_Sex Olympic_Season Num_Sports
#1 F Summer 1
#2 M Summer 3
#3 M Winter 1
It is similar to the OP's output
olympics %>%
group_by(Sex, Season, Sport) %>%
summarise(n()) %>%
group_by(Sex, Season) %>%
summarise(n()) %>%
setNames(c("Competitor_Sex", "Olympic_Season", "Num_Sports"))
# A tibble: 3 x 3
# Groups: Sex [2]
# Competitor_Sex Olympic_Season Num_Sports
# <chr> <chr> <int>
#1 F Summer 1
#2 M Summer 3
#3 M Winter 1
Or it can be done in a compact way with table from base R
table(sub(",[^,]+$", "", names(table(do.call(paste,
c(olympics[c("Sex", "Season", "Sport")], sep=","))))))
# F,Summer M,Summer M,Winter
# 1 3 1

Resources