Trying to calculate the Expected Value of an observation in R

I have a tibble and am trying to use values from two specific rows (the Pinnacle book) to perform a calculation. The result of the calculation will be written to a new column. Here is the output of dput():
structure(list(id = c("5d8f6b2536fbdc4ab6a3e9759ebc6c51", "5d8f6b2536fbdc4ab6a3e9759ebc6c51",
"5d8f6b2536fbdc4ab6a3e9759ebc6c51", "5d8f6b2536fbdc4ab6a3e9759ebc6c51",
"5d8f6b2536fbdc4ab6a3e9759ebc6c51", "5d8f6b2536fbdc4ab6a3e9759ebc6c51"
), start = structure(c(1676691000, 1676691000, 1676691000, 1676691000,
1676691000, 1676691000), tzone = "UTC", class = c("POSIXct",
"POSIXt")), book = c("BetUS", "BetUS", "Bovada", "Bovada", "Pinnacle",
"Pinnacle"), home = c("San José St Spartans", "San José St Spartans",
"San José St Spartans", "San José St Spartans", "San José St Spartans",
"San José St Spartans"), away = c("New Mexico Lobos", "New Mexico Lobos",
"New Mexico Lobos", "New Mexico Lobos", "New Mexico Lobos", "New Mexico Lobos"
), team = c("San José St Spartans", "New Mexico Lobos", "San José St Spartans",
"New Mexico Lobos", "San José St Spartans", "New Mexico Lobos"
), price = c(-140, 120, -140, 120, -138, 117), update = c("2023-02-18T00:24:43Z",
"2023-02-18T00:24:43Z", "2023-02-18T00:25:10Z", "2023-02-18T00:25:10Z",
"2023-02-18T00:25:04Z", "2023-02-18T00:25:04Z"), bep = c(0.58333,
0.45455, 0.58333, 0.45455, 0.57983, 0.46083), no_vig = c(-128.33333,
128.33333, -128.33333, 128.33333, -125.82353, 125.82353), no_vig_bep = c(0.56204,
0.43796, 0.56204, 0.43796, 0.55718, 0.44282), win = c(71.43,
120, 71.43, 120, 72.46, 117)), class = c("grouped_df", "tbl_df",
"tbl", "data.frame"), row.names = c(NA, -6L), groups = structure(list(
id = c("5d8f6b2536fbdc4ab6a3e9759ebc6c51", "5d8f6b2536fbdc4ab6a3e9759ebc6c51",
"5d8f6b2536fbdc4ab6a3e9759ebc6c51", "5d8f6b2536fbdc4ab6a3e9759ebc6c51",
"5d8f6b2536fbdc4ab6a3e9759ebc6c51", "5d8f6b2536fbdc4ab6a3e9759ebc6c51"
), book = c("BetUS", "BetUS", "Bovada", "Bovada", "Pinnacle",
"Pinnacle"), team = c("New Mexico Lobos", "San José St Spartans",
"New Mexico Lobos", "San José St Spartans", "New Mexico Lobos",
"San José St Spartans"), .rows = structure(list(2L, 1L,
4L, 3L, 6L, 5L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE))
The following is the calculation:
-4.482633 = (.55718 * 71.43) - (.44282 * 100)
The values in the calculation above correspond to the following variables:
-4.482633 = the Expected Value I am trying to derive
.55718 = "no_vig_bep" of Pinnacle
71.43 = "win" of observation 1
.44282 = 1 - "no_vig_bep" of Pinnacle, or the last row
100 = a set amount
I would then like to calculate the other side of the odds as follows:
-2.5796 = (.44282 * 120.00) - (.55718 * 100)
The ultimate goal is to use the values of the Pinnacle book to perform the above calculation against all other books, writing the EV to a new column.
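For what it's worth, here is a minimal sketch of one way this could be expressed with dplyr, assuming the tibble above is called df and that each row's EV uses Pinnacle's no_vig_bep for that row's team (the names pinnacle and pin_no_vig_bep are just illustrative, not from the original post):
library(dplyr)

# Pinnacle's no-vig break-even probability for each game/team combination
pinnacle <- df |>
  ungroup() |>
  filter(book == "Pinnacle") |>
  select(id, team, pin_no_vig_bep = no_vig_bep)

# EV = P(win) * profit - P(lose) * stake, with a flat stake of 100
df |>
  ungroup() |>
  left_join(pinnacle, by = c("id", "team")) |>
  mutate(EV = pin_no_vig_bep * win - (1 - pin_no_vig_bep) * 100)
On the data above this reproduces the stated values: -4.48 for the BetUS San José St row and -2.58 for the BetUS New Mexico row.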
I have included an additional id for further clarification:
structure(list(id = c("073c154f3c8586868a3ba21522161a70",
"073c154f3c8586868a3ba21522161a70",
"073c154f3c8586868a3ba21522161a70", "073c154f3c8586868a3ba21522161a70",
"073c154f3c8586868a3ba21522161a70", "073c154f3c8586868a3ba21522161a70"
), book = c("Bovada", "Pinnacle", "MyBookie.ag", "MyBookie.ag",
"Pinnacle", "Bovada"), home = c("Western Michigan Broncos", "Western
Michigan Broncos",
"Western Michigan Broncos", "Western Michigan Broncos", "Western
Michigan Broncos",
"Western Michigan Broncos"), away = c("Ball State Cardinals",
"Ball State Cardinals", "Ball State Cardinals", "Ball State Cardinals",
"Ball State Cardinals", "Ball State Cardinals"), team = c("Western
Michigan Broncos",
"Ball State Cardinals", "Western Michigan Broncos", "Ball State
Cardinals",
"Western Michigan Broncos", "Ball State Cardinals"), price = c(-185,
-143, -142, 100, 108, 140), bep = c(0.64912, 0.58848, 0.58678,
0.5, 0.48077, 0.41667), no_vig = c(-155.78947, -122.40329, -117.35537,
117.35537, 122.40329, 155.78947), no_vig_bep = c(0.60905, 0.55037,
0.53992, 0.46008, 0.44963, 0.39095), win = c(54.05, 69.93, 70.42,
100, 108, 140), EV_1 = c(-15.2155015, -6.47562589999999,
-6.20594459999999,
-10.074, -6.47696000000001, 7.91119999999999)), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -6L), groups =
structure(list(
book = c("Bovada", "MyBookie.ag", "Pinnacle"), .rows = structure(list(
c(1L, 6L), 3:4, c(2L, 5L)), ptype = integer(0), class =
c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -3L), .drop = TRUE))

Edit1: Maybe purrr can help you here:
library(dplyr)
library(purrr)

no_vig_pin_list <- df |>
  ungroup() |>
  split(df$id) |>
  map(~ .x |> filter(book == "Pinnacle") |> pull(no_vig_bep))

df |>
  ungroup() |>
  group_split(id) |>
  purrr::map2_dfr(no_vig_pin_list, ~ .x |>
    group_by(book) |>
    mutate(EV_1 = ifelse(row_number() == 1,
                         (.y[1] * win) - ((1 - .y[1]) * 100),
                         (.y[2] * win) - ((1 - .y[2]) * 100)))) |>
  select(EV_1)
# A tibble: 8 × 2
# Groups: book [2]
book EV_1
<chr> <dbl>
1 BetUS -5.03
2 BetUS -1.10
3 Pinnacle -4.07
4 Pinnacle -4.07
5 BetUS -4.48
6 BetUS -2.58
7 Pinnacle -3.91
8 Pinnacle -3.91
Maybe this helps; I am not quite sure that I understood what you are trying to achieve.
library(dplyr)

no_vig_pin <- df |>
  filter(book == "Pinnacle") |>
  pull(no_vig_bep)

df |>
  group_by(book) |>
  mutate(EV_1 = ifelse(row_number() == 1,
                       (no_vig_pin[1] * win) - ((1 - no_vig_pin[1]) * 100),
                       (no_vig_pin[2] * win) - ((1 - no_vig_pin[2]) * 100))) |>
  select(EV_1)
Output:
# A tibble: 6 × 2
# Groups: book [3]
book EV_1
<chr> <dbl>
1 BetUS -4.48
2 BetUS -2.58
3 Bovada -4.48
4 Bovada -2.58
5 Pinnacle -3.91
6 Pinnacle -3.91

Related

Extracting and evaluating words in a text string against another dataset

I have two sets of data that I will be evaluating against one another. A heavily reduced example looks like this:
library(dplyr)
library(tidyverse)
library(sqldf)
library(dbplyr)
library(httr)
library(purrr)
library(jsonlite)
library(magrittr)
library(tidyr)
library(tidytext)
people_records_ex <- structure(list(id = c(123L, 456L, 789L), name = c("Anna Wilson",
"Jeff Smith", "Craig Mills"), biography = c("Student at Ohio State University. Class of 2024.",
"Second year law student at Stanford. Undergrad at William & Mary",
"University of North Texas Volleyball!")), class = "data.frame", row.names = c(NA,
-3L))
college_records_ex <- structure(list(college_id = c(234L, 567L, 891L, 345L), college_name = c("Ohio State University",
"Stanford", "William & Mary", "University of North Texas"), college_city = c("Columbus",
"Stanford", "Williamsburg", "Denton"), college_state = c("OH",
"CA", "VA", "TX")), class = "data.frame", row.names = c(NA, -4L
))
I am trying to create a match against the contents of the biography text string in people_records_ex against college_name in college_records_ex so the final output will look like this:
final_records_ex <- structure(list(id = c(123L, 456L, 456L, 789L), name = c("Anna Wilson",
"Jeff Smith", "Jeff Smith", "Craig Mills"), college_name = c("Ohio State University",
"Stanford", "William & Mary", "University of North Texas"), college_city = c("Columbus",
"Stanford", "Williamsburg", "Denton"), college_state = c("OH",
"CA", "VA", "TX")), class = "data.frame", row.names = c(NA, -4L
))
But when I run the following code, it produces zero results, which is not correct:
college_extract <- people_records_ex %>%
  left_join(college_records_ex, by = c("biography" = "college_name")) %>%
  filter(!is.na(college_state)) %>%
  dplyr::select(id, name, college_name, college_city, college_state) %>%
  distinct()
What am I doing incorrectly and what would the correct version look like?
Here's a very tidy and straightforward solution with fuzzy_join:
library(fuzzyjoin)
library(stringr)
library(dplyr)
fuzzy_join(
  people_records_ex, college_records_ex,
  by = c("biography" = "college_name"),
  match_fun = str_detect,
  mode = "left"
) %>%
  select(-biography)
id name college_id college_name college_city college_state
1 123 Anna Wilson 234 Ohio State University Columbus OH
2 456 Jeff Smith 567 Stanford Stanford CA
3 456 Jeff Smith 891 William & Mary Williamsburg VA
4 789 Craig Mills 345 University of North Texas Denton TX
Assuming the college names in the biographies are spelled exactly as they appear in the colleges table and the datasets are relatively small, all matches can be generated with a regex of all the college names, as follows:
library(dplyr)
people_records_ex <- structure(list(id = c(123L, 456L, 789L), name = c(
"Anna Wilson",
"Jeff Smith", "Craig Mills"
), biography = c(
"Student at Ohio State University. Class of 2024.",
"Second year law student at Stanford. Undergrad at William & Mary",
"University of North Texas Volleyball!"
)), class = "data.frame", row.names = c(
NA,
-3L
)) %>% tibble::tibble()
college_records_ex <- structure(list(college_id = c(234L, 567L, 891L, 345L), college_name = c(
"Ohio State University",
"Stanford", "William & Mary", "University of North Texas"
), college_city = c(
"Columbus",
"Stanford", "Williamsburg", "Denton"
), college_state = c(
"OH",
"CA", "VA", "TX"
)), class = "data.frame", row.names = c(NA, -4L)) %>%
tibble::tibble()
# join college names in a regex pattern
colleges_regex <- paste0(college_records_ex$college_name, collapse = "|")
colleges_regex
#> [1] "Ohio State University|Stanford|William & Mary|University of North Texas"
# match all against bio, giving a list-column of matches
people_records_ex %>%
  mutate(matches = stringr::str_match_all(biography, colleges_regex))
#> # A tibble: 3 × 4
#> id name biography matches
#> <int> <chr> <chr> <list>
#> 1 123 Anna Wilson Student at Ohio State University. Class of 2024. <chr[…]>
#> 2 456 Jeff Smith Second year law student at Stanford. Undergrad at … <chr[…]>
#> 3 789 Craig Mills University of North Texas Volleyball! <chr[…]>
# unnest the list column wider to give 1 row per person per match
people_records_ex %>%
  mutate(matches = stringr::str_match_all(biography, colleges_regex)) %>%
  tidyr::unnest_longer(matches)
#> # A tibble: 4 × 4
#> id name biography match…¹
#> <int> <chr> <chr> <chr>
#> 1 123 Anna Wilson Student at Ohio State University. Class of 2024. Ohio S…
#> 2 456 Jeff Smith Second year law student at Stanford. Undergrad at W… Stanfo…
#> 3 456 Jeff Smith Second year law student at Stanford. Undergrad at W… Willia…
#> 4 789 Craig Mills University of North Texas Volleyball! Univer…
#> # … with abbreviated variable name ¹​matches[,1]
Created on 2022-10-26 with reprex v2.0.2
This may be joined back to the college table such that it is annotated with college info.
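A rough sketch of that join-back step, continuing the pipeline above (flattening the one-column match matrix into a college_name column is my own addition, not part of the original answer):
library(dplyr)

people_records_ex %>%
  mutate(matches = stringr::str_match_all(biography, colleges_regex)) %>%
  tidyr::unnest_longer(matches) %>%
  mutate(college_name = matches[, 1]) %>%   # str_match_all() returns one-column matrices
  left_join(college_records_ex, by = "college_name") %>%
  select(id, name, college_name, college_city, college_state)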
In base R you can do:
do.call(rbind, lapply(college_records_ex$college_name,
                      \(x) people_records_ex[grep(x, people_records_ex$biography), 1:2])) |>
  cbind(college_records_ex[-1])
This matches each college name against the biographies, keeps the first two columns (id and name) of the matching rows, and cbinds the result with the second data.frame after dropping its first column (college_id).
id name college_name college_city college_state
1 123 Anna Wilson Ohio State University Columbus OH
2 456 Jeff Smith Stanford Stanford CA
21 456 Jeff Smith William & Mary Williamsburg VA
3 789 Craig Mills University of North Texas Denton TX

Turning horizontal data into vertical data

I have data that is structured like this:
actor_data <- structure(list(id = c(123L, 456L, 789L, 912L, 235L), name = c("Tom Cruise",
"Will Smith", "Ryan Reynolds", "Chris Rock", "Emma Stone"), locationid1 = c(5459L,
NA, 6114L, NA, NA), location1 = c("Paris, France", "", "Brooklyn, NY",
"", ""), locationid2 = c(NA, 5778L, NA, NA, 4432L), location3 = c("",
"Dolby Theater", "", "", "Hollywood"), locationid3 = c(NA, 2526L,
3101L, NA, NA), location3.1 = c("", "London", "Boston", "", ""
), locationid4 = c(6667L, 2333L, 1118L, NA, NA), location4 = c("Virginia",
"Maryland", "Washington", "", "")), class = "data.frame", row.names = c(NA,
-5L))
I am trying to make the location data run vertically instead of horizontally, while also making sure it does not include blank fields.
So the final result will look like this:
actor_data_exp <- structure(list(id = c(123L, 123L, 456L, 456L, 456L, 789L, 789L,
789L, 235L), name = c("Tom Cruise", "Tom Cruise", "Will Smith",
"Will Smith", "Will Smith", "Ryan Reynolds", "Ryan Reynolds",
"Ryan Reynolds", "Emma Stone"), locationid = c(5459L, 6667L,
5778L, 2526L, 2333L, 6114L, 3101L, 1118L, 4432L), location = c("Paris, France",
"Virginia", "Dolby Theater", "London", "Maryland", "Brooklyn, NY",
"Boston", "Washington", "Hollywood")), class = "data.frame", row.names = c(NA,
-9L))
I would rename the columns that start with "location" so that there is an underscore before the number in each column name. Then use pivot_longer with the underscore as names_sep, and c(".value", "var") as the names_to argument, to ensure both location and locationid get their own columns. This will also create a redundant column var containing the numbers 1-4 that were appended to the original column names.
Finally, filter out missing values and remove the redundant var column.
library(tidyverse)
actor_data %>%
  rename_with(~ ifelse(grepl("location", .x),
                       sub("^(.*?)([0-9]\\.?\\d?)$", "\\1_\\2", .x), .x)) %>%
  pivot_longer(starts_with("location"),
               names_sep = "_", names_to = c(".value", "var")) %>%
  filter(!is.na(locationid) & !is.na(location) & nzchar(location)) %>%
  select(-var)
#> # A tibble: 6 x 4
#> id name locationid location
#> <int> <chr> <int> <chr>
#> 1 123 Tom Cruise 5459 Paris, France
#> 2 123 Tom Cruise 6667 Virginia
#> 3 456 Will Smith 2526 Dolby Theater
#> 4 456 Will Smith 2333 Maryland
#> 5 789 Ryan Reynolds 6114 Brooklyn, NY
#> 6 789 Ryan Reynolds 1118 Washington

Align max & min percent changes in a geom_bar plot

I am not sure if this is possible in ggplot or not, but I have been tinkering with this for quite some time and am unable to figure it out.
I am trying to create a plot that mimics this layout:
I am pretty sure I have my data formatted correctly. This is what I have done so far:
library(tidyverse)
library(reshape2)
library(lubridate)
rental.data.melted <- melt(rental_data)
rental.data.melted <- rental.data.melted %>%
  slice(217:10908)
rental.data.melted <- rental.data.melted %>%
  rename(date = variable)
rental.data.melted$date <- lubridate::ym(rental.data.melted$date)
rental.one.year <- rental.data.melted %>%
  filter(year(date) >= 2021 & month(date) >= 3)
rental.one.year <- rental.one.year %>%
  group_by(RegionName) %>%
  mutate(prev_rent = lag(value),
         pct.chg = (value / prev_rent - 1) * 100)
one.year.results <- rental.one.year %>%
  filter(year(date) == 2022)
one.year.results <- one.year.results %>%
  filter(RegionName %in% c("Daytona Beach, FL", "Miami-Fort Lauderdale, FL", "Lakeland, FL", "New York, NY",
                           "North Port-Sarasota-Bradenton, FL", "Syracuse, NY", "Tulsa, OK", "McAllen, TX"))
The resulting data frame looks like this:
> as.tibble(one.year.results)
# A tibble: 8 x 5
RegionName date value prev_rent pct.chg
<chr> <date> <dbl> <dbl> <dbl>
1 New York, NY 2022-03-01 2934 2804 4.64
2 Miami-Fort Lauderdale, FL 2022-03-01 2832 2699 4.93
3 Tulsa, OK 2022-03-01 1286 1294 -0.618
4 McAllen, TX 2022-03-01 1017 1020 -0.294
5 North Port-Sarasota-Bradenton, FL 2022-03-01 2402 2488 -3.46
6 Syracuse, NY 2022-03-01 1318 1334 -1.20
7 Lakeland, FL 2022-03-01 1808 1725 4.81
8 Daytona Beach, FL 2022-03-01 1766 1680 5.12
As for the plotting, this is what I am working with so far but I cannot figure out how to get the bars "aligned" like in the example above so that the metro area with the biggest decrease (North Port-Sarasota, FL) is aligned with the metro with the largest increase (Daytona Beach, FL):
ggplot(data = one.year.results, aes(pct.chg)) +
  geom_bar(data = subset(one.year.results, pct.chg > 0),
           aes(y = RegionName), stat = "identity") +
  geom_bar(data = subset(one.year.results, pct.chg < 0),
           aes(y = RegionName), stat = "identity")
As well, here is the data in the reproducible form:
structure(list(RegionName = c("New York, NY", "Miami-Fort Lauderdale, FL",
"Tulsa, OK", "McAllen, TX", "North Port-Sarasota-Bradenton, FL",
"Syracuse, NY", "Lakeland, FL", "Daytona Beach, FL"), date = structure(c(19052,
19052, 19052, 19052, 19052, 19052, 19052, 19052), class = "Date"),
value = c(2934, 2832, 1286, 1017, 2402, 1318, 1808, 1766),
prev_rent = c(2804, 2699, 1294, 1020, 2488, 1334, 1725, 1680
), pct.chg = c(4.63623395149786, 4.92775101889589, -0.618238021638329,
-0.294117647058822, -3.45659163987139, -1.19940029985007,
4.81159420289856, 5.11904761904762)), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -8L), groups = structure(list(
RegionName = c("Daytona Beach, FL", "Lakeland, FL", "McAllen, TX",
"Miami-Fort Lauderdale, FL", "New York, NY", "North Port-Sarasota-Bradenton, FL",
"Syracuse, NY", "Tulsa, OK"), .rows = structure(list(8L,
7L, 4L, 2L, 1L, 5L, 6L, 3L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -8L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE))
library(tidyverse)
data <- structure(list(
RegionName = c(
"New York, NY", "Miami-Fort Lauderdale, FL",
"Tulsa, OK", "McAllen, TX", "North Port-Sarasota-Bradenton, FL",
"Syracuse, NY", "Lakeland, FL", "Daytona Beach, FL"
), date = structure(c(
19052,
19052, 19052, 19052, 19052, 19052, 19052, 19052
), class = "Date"),
value = c(2934, 2832, 1286, 1017, 2402, 1318, 1808, 1766),
prev_rent = c(2804, 2699, 1294, 1020, 2488, 1334, 1725, 1680), pct.chg = c(
4.63623395149786, 4.92775101889589, -0.618238021638329,
-0.294117647058822, -3.45659163987139, -1.19940029985007,
4.81159420289856, 5.11904761904762
)
), class = c(
"grouped_df",
"tbl_df", "tbl", "data.frame"
), row.names = c(NA, -8L), groups = structure(list(
RegionName = c(
"Daytona Beach, FL", "Lakeland, FL", "McAllen, TX",
"Miami-Fort Lauderdale, FL", "New York, NY", "North Port-Sarasota-Bradenton, FL",
"Syracuse, NY", "Tulsa, OK"
), .rows = structure(list(
8L,
7L, 4L, 2L, 1L, 5L, 6L, 3L
), ptype = integer(0), class = c(
"vctrs_list_of",
"vctrs_vctr", "list"
))
), row.names = c(NA, -8L), class = c(
"tbl_df",
"tbl", "data.frame"
), .drop = TRUE))
data %>%
  group_by(sign(pct.chg)) %>%
  arrange(-abs(pct.chg)) %>%
  slice(1:3) %>%
  mutate(position = row_number()) %>%
  ggplot(aes(position, pct.chg)) +
  geom_col() +
  geom_label(aes(label = RegionName)) +
  geom_hline(yintercept = 0) +
  coord_flip()
Created on 2022-04-28 by the reprex package (v2.0.0)

How can I add columns to a data frame with a value determined by values in other columns?

I have a dataset on population in counties in the US. I want to add a column for the state the county is in and one for the state code. Both are already available in the dataset, but hidden inside other columns.
For instance, from the output we can see that the first observation says NAME = "Ada County, Idaho" and GEOID = "16001". I want one column with State = "Idaho" and one column with StateID = "16".
Thank you!
structure(list(NAME = c("Ada County, Idaho", "Ada County, Idaho",
"Ada County, Idaho", "Ada County, Idaho", "Ada County, Idaho",
"Ada County, Idaho"), GEOID = c("16001", "16001", "16001", "16001",
"16001", "16001"), year = c("2007", "2007", "2007", "2007", "2007",
"2007"), POP25 = c(205888, 205888, 205888, 205888, 205888, 205888
), EMPLOY25 = c(205888, 208506, 212770, 212272, 216058, 220856
)), class = c("grouped_df", "tbl_df", "tbl", "data.frame"), row.names = c(NA,
-6L), groups = structure(list(NAME = "Ada County, Idaho", GEOID = "16001",
.rows = structure(list(1:6), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -1L), .drop = TRUE))
Perhaps this helps - remove the substring in 'NAME' up to the comma followed by one or more spaces (,\\s+) to create 'State', and take the first two characters of the 'GEOID' column with substr to create 'StateID':
library(dplyr)
library(stringr)
df1 %>%
  ungroup() %>%
  mutate(State = str_remove(NAME, ".*,\\s+"),
         StateID = substr(GEOID, 1, 2))
Here is an alternative using str_extract and str_sub:
library(dplyr)
library(stringr)
pattern <- paste(state.name, collapse = "|")

df %>%
  mutate(State = str_extract(NAME, pattern),
         StateID = str_sub(GEOID, 1, 2))
NAME GEOID year POP25 EMPLOY25 State StateID
<chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
1 Ada County, ~ 16001 2007 205888 205888 Idaho 16
2 Ada County, ~ 16001 2007 205888 208506 Idaho 16
3 Ada County, ~ 16001 2007 205888 212770 Idaho 16
4 Ada County, ~ 16001 2007 205888 212272 Idaho 16
5 Ada County, ~ 16001 2007 205888 216058 Idaho 16
6 Ada County, ~ 16001 2007 205888 220856 Idaho 16

Case_when adding on a function

I'm trying to write a function in R that generates three values: it loops over each row of a df, checks the value of one column and, if a condition is met, adds the value of another column in that row to a running total.
I thought that using the case_when construction would work best for this, but should I be using an lapply construction instead?
get_home_away_goals_for_team <- function(matches_df, team_list){
  complete_df <- data.frame(team = character(), goals_scored = double(),
                            goals_conceded = double(), games_played = double())
  for (team in teams){
    print(team)
    goals_scored <- 0
    goals_conceded <- 0
    games_played <- 0
    case_when(
      matches_df$home == team ~ goals_scored = goals_scored + matches_df$hg,
      matches_df$home == team ~ goals_conceded = goals_conceded + matches_df$ag,
      matches_df$away == team ~ goals_scored = goals_scored + matches_df$ag,
      matches_df$away == team ~ goals_conceded = goals_conceded + matches_df$hg,
      matches_df$home == team ~ games_played = games_played + 1,
      matches_df$away == team ~ games_played = games_played + 1)
    temp_get_goals_df = data.frame(team, goals_scored, goals_conceded, games_played)
    complete_df <- rbind(complete_df, temp_get_goals_df)
  }
  complete_df
}
The function takes a value of team, checks for each row whether this team was playing in a game home or away, then adds to the values of goals scored accordingly.
When I try to use the function though, I get the error Error: unexpected '}' in " }" which makes me think I'm using case_when incorrectly.
Is this the case ?
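As a side note on the error itself: the assignments on the right-hand side of each ~ are not valid R syntax, so the call to case_when() fails to parse. case_when() expects each formula to return a value, not perform an assignment. A minimal valid use against the example data below might look like this (a sketch only, picking the goals scored by one team in each match):
library(dplyr)

team <- "Colorado Rapids"
matches_df_example %>%
  mutate(goals_for_team = case_when(
    home == team ~ hg,      # team played at home: it scored the home goals
    away == team ~ ag,      # team played away: it scored the away goals
    TRUE ~ NA_real_         # team not involved in this match
  ))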
Data:
matches_df_example :
structure(list(home = c("Colorado Rapids", "Vancouver Whitecaps",
"DC United", "Los Angeles Galaxy", "San Jose Earthquakes", "FC Dallas"
), away = c("Columbus Crew", "Club de Foot Montreal", "Sporting Kansas City",
"Real Salt Lake", "New England Revolution", "New York Red Bulls"
), res = c("H", "H", "A", "A", "H", "H"), season = c(2012, 2012,
2012, 2012, 2012, 2012), hg = c(2, 2, 0, 1, 1, 2), ag = c(0,
0, 1, 3, 0, 1), date_time = structure(c(1331420400, 1331420400,
1331425800, 1331436600, 1331436600, 1331492400), tzone = "UTC", class = c("POSIXct",
"POSIXt")), home_conference = c("West", "West", "East", "West",
"West", "West"), away_conference = c("East", "East", "East",
"West", "East", "East")), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
teams
c("Colorado Rapids", "Vancouver Whitecaps", "DC United", "Los Angeles Galaxy",
"San Jose Earthquakes", "FC Dallas", "Chivas USA", "Portland Timbers",
"Club de Foot Montreal", "Sporting Kansas City", "Real Salt Lake",
"Seattle Sounders", "Philadelphia Union", "Toronto FC", "Columbus Crew",
"New England Revolution", "Chicago Fire", "New York Red Bulls",
"Houston Dynamo", "Orlando City", "New York City", "Atlanta United",
"Minnesota United", "Los Angeles FC", "FC Cincinnati", "Atlanta Utd",
"Nashville SC", "Inter Miami", "Austin FC")
Perhaps this?
homes <- matches_df %>%
  group_by(team = home) %>%
  summarize(
    goals_scored   = sum(hg),
    goals_conceded = sum(ag),
    games_played   = n(),
    .groups = "drop"
  )

aways <- matches_df %>%
  group_by(team = away) %>%
  summarize(
    goals_scored   = sum(ag),   # away teams score the away goals ...
    goals_conceded = sum(hg),   # ... and concede the home goals
    games_played   = n(),
    .groups = "drop"
  )

full_join(homes, aways, by = "team", suffix = c("", ".y")) %>%
  full_join(tibble(team = teams), by = "team") %>%
  transmute(
    team,
    goals_scored   = coalesce(goals_scored, goals_scored.y, 0),
    goals_conceded = coalesce(goals_conceded, goals_conceded.y, 0),
    games_played   = coalesce(games_played, games_played.y, 0)
  )
# # A tibble: 29 x 4
#    team                   goals_scored goals_conceded games_played
#    <chr>                         <dbl>          <dbl>        <dbl>
#  1 Colorado Rapids                   2              0            1
#  2 DC United                         0              1            1
#  3 FC Dallas                         2              1            1
#  4 Los Angeles Galaxy                1              3            1
#  5 San Jose Earthquakes              1              0            1
#  6 Vancouver Whitecaps               2              0            1
#  7 Club de Foot Montreal             0              2            1
#  8 Columbus Crew                     0              2            1
#  9 New England Revolution            0              1            1
# 10 New York Red Bulls                1              2            1
# # ... with 19 more rows
