Calculation of the cumulative points before the event/game - r

I would like to accumulate the points of several football clubs, separately for each club, up to each match day.
I have created a sample dataset to explain the problem:
# Sample fixtures: 2 seasons x 4 rounds x 2 matches per round (16 rows).
# Pairings and rounds repeat in both seasons; only the results differ.
t <- data.frame(
  Heim = rep(c("A", "B", "B", "D", "C", "A", "C", "D"), times = 2),
  Auswärts = rep(c("C", "D", "A", "C", "B", "D", "A", "B"), times = 2),
  Ergebnis = c(
    "S", "U", "N", "N", "S", "S", "N", "U", # Saison 1
    "N", "S", "N", "U", "S", "S", "U", "U"  # Saison 2
  ),
  Round = rep(c(1, 1, 2, 2, 3, 3, 4, 4), times = 2),
  Saison = rep(c(1, 2), each = 8)
)
My idea was to create, for each club (the original data set has more than 4), a separate column with the points per match and a cumulative column next to it.
So something like this:
# Points club "A" takes from each match; NA where A did not play.
# Ergebnis is recorded from the home side's perspective:
# "S" = home side gets 3, "U" = both sides get 1, "N" = away side gets 3.
t$A_Points <- ifelse(t$Heim == "A" & t$Ergebnis == "S", 3, 0)
t$A_Points <- ifelse(t$Heim == "A" & t$Ergebnis == "U", 1, t$A_Points)
t$A_Points <- ifelse(t$Auswärts == "A" & t$Ergebnis == "U", 1, t$A_Points)
t$A_Points <- ifelse(t$Auswärts == "A" & t$Ergebnis == "N", 3, t$A_Points)
t$A_Points <- ifelse(t$Auswärts != "A" & t$Heim != "A", NA, t$A_Points)

# Flag the matches A took part in (1) versus the rest (0). Missing values must
# be detected with is.na(): comparing against the string "NA" yields NA for a
# missing value and FALSE otherwise, so it never produces the 0 branch.
t$A <- ifelse(is.na(t$A_Points), 0, 1)

# Running total of A's points within each season (rows without A form their
# own group and stay NA).
t <- t %>%
  arrange(Saison, Round, A) %>%
  group_by(Saison, A) %>%
  mutate(cumsum = cumsum(A_Points))
Unfortunately, it is very time and space-consuming even for 4 clubs...
Also, I would like to have the sum of the points of the matches without the result of the current matchday.
The optimal result for me would be following:
Heim Auswärts Ergebnis Round Saison Points_Heim Points_Auswärts
<chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 A C S 1 1 0 0
2 B D U 1 1 0 0
3 B A N 2 1 1 3
4 D C N 2 1 1 0
5 A D S 3 1 6 1
6 C B S 3 1 3 1
7 C A N 4 1 6 9
8 D B U 4 1 1 1
9 A C N 1 2 0 0
10 B D S 1 2 0 0
11 B A N 2 2 3 0
12 D C U 2 2 0 3
13 A D S 3 2 3 1
14 C B S 3 2 4 3
15 C A U 4 2 7 3
16 D B U 4 2 1 3
I would be very happy about an idea for an easier solution.

Probably not the shortest solution. But I would do
# Points each side takes from a match, derived from the result code
# (home/away: S -> 3/0, U -> 1/1, N -> 0/3). Grouping by season makes the
# running totals below restart every season.
t <- t %>%
  group_by(Saison) %>%
  mutate(
    Heim_Points_Veränderung = case_when(
      Ergebnis == "S" ~ 3,
      Ergebnis == "U" ~ 1,
      Ergebnis == "N" ~ 0
    ),
    Auswärts_Points_Veränderung = case_when(
      Ergebnis == "S" ~ 0,
      Ergebnis == "U" ~ 1,
      Ergebnis == "N" ~ 3
    ),
    Points_Heim = 0,
    Points_Auswärts = 0
  )

# One pass per club: build that club's per-match points and lagged running
# total, then copy the total into Points_Heim / Points_Auswärts on the rows
# where the club plays at home / away.
for (club in union(t$Heim, t$Auswärts)) {
  points_col <- sym(paste0(club, "_points"))
  running_col <- sym(paste0(club, "_cumsum"))

  t <- t %>%
    mutate(
      !!points_col := if_else(Heim == club, Heim_Points_Veränderung, 0),
      !!points_col := if_else(Auswärts == club, Auswärts_Points_Veränderung, !!points_col),
      # lag() shifts by one row so each row shows the total before that match.
      !!running_col := cumsum(lag(!!points_col, default = 0)),
      Points_Heim = if_else(Heim == club, !!running_col, Points_Heim),
      Points_Auswärts = if_else(Auswärts == club, !!running_col, Points_Auswärts)
    )
}

# Drop the per-club helper columns again.
t <- t %>%
  select(Heim, Auswärts, Ergebnis, Round, Saison, Points_Heim, Points_Auswärts)
Output
> t
# A tibble: 16 x 7
# Groups: Saison [2]
Heim Auswärts Ergebnis Round Saison Points_Heim Points_Auswärts
<chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 A C S 1 1 0 0
2 B D U 1 1 0 0
3 B A N 2 1 1 3
4 D C N 2 1 1 0
5 C B S 3 1 3 1
6 A D S 3 1 6 1
7 C A N 4 1 6 9
8 D B U 4 1 1 1
9 A C N 1 2 0 0
10 B D S 1 2 0 0
11 B A N 2 2 3 0
12 D C U 2 2 0 3
13 C B S 3 2 4 3
14 A D S 3 2 3 1
15 C A U 4 2 7 6
16 D B U 4 2 1 3
This solution should adapt to any number of clubs. Briefly, I store the possible change in points in Heim/Auswärts_Points_Veränderung using case_when (easier than a lot of ifelse) so I can create a column of points change for each club (running a for loop on all the clubs). This allows me to do a cumsum like you using lag to make sure the cumsum is updated 1 row later (to display the cumulative points before the match and not after), which I enter in the Points_Heim/Auswärts column only when the club is displayed in the Heim/Auswärts columns. The key to my solution is to use !!sym to feed dynamic variable names to mutate (note the assignment with :=) inside the for loop.

Related

Replace column based on column names

I have 65 columns, but a sample of data could be as follows:
# Sample data as posted: a Name column followed by four answer columns.
# NOTE(review): the header repeats "D". With read.table()'s default
# check.names = TRUE the second "D" is presumably renamed (a later output in
# this thread shows "D.1") - pass check.names = FALSE if the duplicated name
# must be kept; confirm against your data.
df<-read.table (text=" Name D A D E
Rose D D C B
Smith B A D D
Lora A A D D
Javid A D D B
Ahmed C A E A
Helen B A D D
Nadia A A D A
", header=TRUE)
I want to get the following table:
Name D A D E
Rose 2 1 1 1
Smith 1 2 2 1
Lora 1 2 2 1
Javid 1 1 2 1
Ahmed 1 2 1 1
Helen 1 2 2 1
Nadia 1 2 2 1
The numbers follow the first row (the header). For example, the second column is named D, so all Ds in it should read 2 and everything else should read 1. Likewise, in the third column, which is named A, all As should read 2 and everything else should read 1, and so on. Please bear in mind that I have 65 columns. I understand that the columns should have distinct names, but in this case I cannot change them.
With ifelse and sapply:
# Recode every column after Name: 2 where a cell equals the name of its own
# column, 1 otherwise. Columns are addressed by position because names repeat.
df[2:ncol(df)] <- sapply(
  2:ncol(df),
  function(j) ifelse(df[[j]] == names(df)[j], 2, 1)
)
output
#> df
Name D A D E
1 Rose 2 1 1 1
2 Smith 1 2 2 1
3 Lora 1 2 2 1
4 Javid 1 1 2 1
5 Ahmed 1 2 1 1
6 Helen 1 2 2 1
7 Nadia 1 2 2 1
data
# Reproducible copy of the question's data. check.names = FALSE keeps the
# duplicated column name "D" instead of letting data.frame() make it unique.
df <- data.frame(
  Name = c("Rose", "Smith", "Lora", "Javid", "Ahmed", "Helen", "Nadia"),
  D = c("D", "B", "A", "A", "C", "B", "A"),
  A = c("D", "A", "A", "D", "A", "A", "A"),
  D = c("C", "D", "D", "D", "E", "D", "D"),
  E = c("B", "D", "D", "B", "A", "D", "A"),
  check.names = FALSE,
  stringsAsFactors = FALSE
)
# Every column except Name is recoded by name lookup; comparing gives
# FALSE/TRUE, and adding 1L turns that into 1/2.
cols <- names(df)[-1]
df[cols] <- lapply(cols, function(nm) (df[[nm]] == nm) + 1L)
# Name D A
# 1 Rose 2 1
# 2 Smith 1 2
# 3 Lora 1 2
# 4 Javid 1 1
# 5 Ahmed 1 2
# 6 Helen 1 2
# 7 Nadia 1 2
Simplified data (without repeated column names)
# Reduced example: Name plus two uniquely named answer columns.
df <- data.frame(
  Name = c("Rose", "Smith", "Lora", "Javid", "Ahmed", "Helen", "Nadia"),
  D    = c("D", "B", "A", "A", "C", "B", "A"),
  A    = c("D", "A", "A", "D", "A", "A", "A")
)
Another approach, you can stack, replace and unstack, i.e
# Long format: one row per cell, with `ind` holding the originating column
# name, so the recode becomes a single vectorised comparison.
stack_df <- transform(
  stack(df[-1]),
  values = ifelse(values == ind, 2, 1)
)

# Back to wide format, with the Name column re-attached in front.
cbind.data.frame(Name = df$Name, unstack(stack_df))
# Name D A E
#1 Rose 2 1 1
#2 Smith 1 2 1
#3 Lora 1 2 1
#4 Javid 1 1 1
#5 Ahmed 1 2 1
#6 Helen 1 2 1
#7 Nadia 1 2 1
DATA
structure(list(Name = c("Rose", "Smith", "Lora", "Javid", "Ahmed",
"Helen", "Nadia"), D = c("D", "B", "A", "A", "C", "B", "A"),
A = c("D", "A", "A", "D", "A", "A", "A"), E = c("B", "D",
"D", "B", "A", "D", "A")), row.names = c(NA, -7L), class = "data.frame")
dplyr option with ifelse like this:
library(dplyr)

# Recode columns D through E: 2 where the value equals the name of the column
# currently being processed, 1 otherwise.
df %>%
  mutate(
    across(D:E, function(col) ifelse(col == cur_column(), 2, 1))
  )
#> Name D A D.1 E
#> 1 Rose 2 1 1 1
#> 2 Smith 1 2 1 1
#> 3 Lora 1 2 1 1
#> 4 Javid 1 1 1 1
#> 5 Ahmed 1 2 1 1
#> 6 Helen 1 2 1 1
#> 7 Nadia 1 2 1 1
Created on 2022-09-19 with reprex v2.0.2
Using dplyr:
The data:
# Same rows as in the question, but with distinct column names A-D.
df <- read.table(header = TRUE, text = " Name A B C D
Rose D D C B
Smith B A D D
Lora A A D D
Javid A D D B
Ahmed C A E A
Helen B A D D
Nadia A A D A
")
> df
Name A B C D
1 Rose D D C B
2 Smith B A D D
3 Lora A A D D
4 Javid A D D B
5 Ahmed C A E A
6 Helen B A D D
7 Nadia A A D A
Note that I changed the column names.
# Recode every column except Name: 2 where the value equals the name of the
# column being processed, 1 otherwise.
df %>%
  mutate(
    across(
      !c(Name),
      .fns = ~ ifelse(.x == cur_column(), 2, 1)
    )
  )
Name A B C D
1 Rose 1 1 2 1
2 Smith 1 1 1 2
3 Lora 2 1 1 2
4 Javid 2 1 1 1
5 Ahmed 1 1 1 1
6 Helen 1 1 1 2
7 Nadia 2 1 1 1
The mutate-command modifies columns in dataframes. Using the across()-function, we specify that the mutation should be applied to more than one column. Inside the across-call, we select every column but the name column using !c(Name). We then specify a function that compares the name of the column, cur_column(), with the values in the column, .x. If they are the same, the value is set to 2, otherwise to 1.
EDIT: used ifelse instead of case_when as there is only one condition to check
You can compare each row with the column names. Adding 1 to the logical values converts FALSE and TRUE into 1 and 2 respectively.
# After transposing, each matrix column is one original row, so `==` recycles
# the column names down every original row. Adding 1 maps FALSE/TRUE to 1/2,
# and the second transpose restores the original orientation.
is_own_name <- t(df[-1]) == names(df)[-1]
df[-1] <- t(is_own_name + 1)
df
# Name D A D E
# 1 Rose 2 1 1 1
# 2 Smith 1 2 2 1
# 3 Lora 1 2 2 1
# 4 Javid 1 1 2 1
# 5 Ahmed 1 2 1 1
# 6 Helen 1 2 2 1
# 7 Nadia 1 2 2 1

Splitting single column into four columns and count repeated pattern in R

The aim of this project is to understand how information is acquired while looking at an object. Imagine an object has elements like a, b, c, d, e and f. A person might look at a, move on to b, and so forth. Now, we wish to plot and understand how that person has navigated across the different elements of a given stimulus. I have data that captured this movement in a single column, but I need to split it into a few columns to get the navigation pattern. Please find the example given below.
I have a column extracted from a data frame. It now has to be split into four columns based on its characteristics.
# Observed sequence of looked-at elements, stored as a one-column data frame
# whose column is also called `a`.
a <- data.frame(
  a = c(
    "a", "b", "b", "b", "a", "c", "a", "b", "d",
    "d", "d", "e", "f", "f", "e", "e", "f"
  )
)
Expected output
from to countfrom countto
a b 1 3
b a 3 1
a c 1 1
c a 1 1
a b 1 1
b d 1 3
d e 3 1
e f 1 2
f e 2 2
e f 2 1
Note: I used dplyr to extract from the dataframe.
Use rle to get the relative runs of each letter, and then piece it together:
r <- rle(a$a)
## or maybe `r <- rle(as.character(a$a))` depending on your R version

## Pair every run with the run that follows it: dropping the last run gives
## the "from" side, dropping the first run gives the "to" side.
last_run <- length(r$lengths)
data.frame(
  countfrom = r$lengths[-last_run],
  from = r$values[-last_run],
  countto = r$lengths[-1],
  to = r$values[-1]
)
## countfrom from countto to
##1 1 a 3 b
##2 3 b 1 a
##3 1 a 1 c
##4 1 c 1 a
##5 1 a 1 b
##6 1 b 3 d
##7 3 d 1 e
##8 1 e 2 f
##9 2 f 2 e
##10 2 e 1 f
Or in the tidyverse
library(tidyverse)

a <- c(
  "a", "b", "b", "b", "a", "c", "a", "b", "d",
  "d", "d", "e", "f", "f", "e", "e", "f"
)

# Run-length encode the sequence, then put each run next to its successor.
foo <- rle(a)
answ <- tibble(
  from = foo$values,
  to = lead(foo$values),
  fromCount = foo$lengths,
  toCount = lead(foo$lengths)
) %>%
  # The final run has no successor, so lead() leaves NA there.
  filter(!is.na(to))
# A tibble: 10 x 4
from to fromCount toCount
<chr> <chr> <int> <int>
1 a b 1 3
2 b a 3 1
3 a c 1 1
4 c a 1 1
5 a b 1 1
6 b d 1 3
7 d e 3 1
8 e f 1 2
9 f e 2 2
10 e f 2 1

Enumerate a grouping variable in a tibble

I would like to know how to use row_number (or anything else) to turn a grouping variable into an integer.
# Ten rows split into groups A (3 rows), B (2), C (4) and D (1).
tibble_test <- tibble(
  A = letters[1:10],
  group = rep(c("A", "B", "C", "D"), times = c(3, 2, 4, 1))
)

# Number the rows within each level of `group`.
tibble_test %>%
  group_by(group) %>%
  mutate(G1 = row_number())
But I would like to have this output:
# A tibble: 10 x 4
A group G1 G2
<chr> <chr> <dbl> <dbl>
1 a A 1 1
2 b A 2 1
3 c A 3 1
4 d B 1 2
5 e B 2 2
6 f C 1 3
7 g C 2 3
8 h C 3 3
9 i C 4 3
10 j D 1 4
My question is: how can I get this column G2? I know I could transform the 'group' variable into a factor and then into an integer (after the tibble is arranged), but I would like to know if it can be done by counting.
You just need one more step and include the group indices with group_indices(). Be aware that how your data is arranged/sorted will affect the index.
library(dplyr)
tibble_test <- tibble(A = letters[1:10], group = c("A", "A", "A", "B", "B", "C", "C", "C", "C", "D"))
# G1: position of the row within its group.
# G2: integer id of the group itself. cur_group_id() is the current spelling of
# the argument-less group_indices(), which dplyr deprecated in 1.0.0; both give
# the same per-group index.
tibble_test %>%
  group_by(group) %>%
  mutate(
    G1 = row_number(),
    G2 = cur_group_id()
  )
# A tibble: 10 x 4
# Groups: group [4]
A group G1 G2
<chr> <chr> <int> <int>
1 a A 1 1
2 b A 2 1
3 c A 3 1
4 d B 1 2
5 e B 2 2
6 f C 1 3
7 g C 2 3
8 h C 3 3
9 i C 4 3
10 j D 1 4

How to add a boolean value to a column when 2 different dataframes match on 2 columns in R?

I have 2 different dataframes. I want to add a column to my second dataframe and have it assigned a value of 0 or 1. In the case where df1$code == df2$code & df1$date == df2$date I want a 1 for these rows, and a 0 otherwise. A visual and reproducible example may make it easier to understand.
# df1: reference pairs. df2: rows to flag. df3: the wanted result, i.e. df2
# plus a 0/1 `value` column.
df1 <- data.frame(
  code = c("A", "B", "C", "D"),
  date = c(1, 2, 3, 4)
)
df2 <- data.frame(
  code = c("A", "B", "E", "R", "V", "F"),
  date = c(1, 2, 3, 4, 5, 6)
)
df3 <- data.frame(df2, value = c(1, 1, 0, 0, 0, 0))
DF1
code date
1 A 1
2 B 2
3 C 3
4 D 4
DF2
code date
1 A 1
2 B 2
3 E 3
4 R 4
5 V 5
6 F 6
The resulting DF I want
code date value
1 A 1 1
2 B 2 1
3 E 3 0
4 R 4 0
5 V 5 0
6 F 6 0
We can use %in% to create a logical vector and then coerce it to binary with as.integer or +
# Flag the rows of df2 whose (code, date) pair also occurs in df1. Testing
# `code` alone would also flag rows that share a code but differ in date, so
# both columns are combined into one key before %in% is applied.
# The unary + coerces the logical result to 0/1.
pair_key <- function(d) paste(d$code, d$date, sep = "\t")
df2$value <- +(pair_key(df2) %in% pair_key(df1))
df2
# code date value
#1 A 1 1
#2 B 2 1
#3 E 3 0
#4 R 4 0
#5 V 5 0
#6 F 6 0
I would do it like this:
# Tag every df1 row with value = 1, join it onto df2 by the columns the two
# tables share, then turn the NA left on unmatched rows into 0.
df1_flagged <- mutate(df1, value = 1)
df2 %>%
  left_join(df1_flagged) %>%
  mutate(value = coalesce(value, 0))
# Joining, by = c("code", "date")
# code date value
# 1 A 1 1
# 2 B 2 1
# 3 E 3 0
# 4 R 4 0
# 5 V 5 0
# 6 F 6 0

Filtering Time to Event Data in Tidyverse

I have some time to event data that I'm working with. I'd like to filter the data from the first time the subject is in the study to the first observed event (not worried about the recurrent events that happened after the first event -- only want to explore time to first event).
I'm using a between within a filter function, which has always worked for me in the past but has issues here because there are some subjects that never have the event and thus I get an error that states Error: Expecting a single value: [extent=0].
I think what I want is a method of subsetting the data, for each subject, from entrance to the study up to the first event OR, if there is no event, keeping all data for the subject.
Here is an example of what my data looks like:
## data
# Six subjects with 4, 2, 3, 1, 5 and 5 observations; event is 1 on rows
# where the event was observed.
subject <- rep(c("A", "B", "C", "D", "E", "F"), times = c(4, 2, 3, 1, 5, 5))
event <- c(
  0, 0, 1, 0,    # A
  0, 0,          # B
  0, 0, 1,       # C
  0,             # D
  0, 1, 0, 1, 1, # E
  0, 0, 0, 0, 0  # F
)
df <- data.frame(subject, event)
## create index to count the days the subject is in the study
library(tidyverse)

# ID counts the observations 1, 2, 3, ... within each subject.
df <- df %>%
  group_by(subject) %>%
  mutate(ID = row_number())

df
# A tibble: 20 x 3
# Groups: subject [6]
subject event ID
<fct> <dbl> <int>
1 A 0 1
2 A 0 2
3 A 1 3
4 A 0 4
5 B 0 1
6 B 0 2
7 C 0 1
8 C 0 2
9 C 1 3
10 D 0 1
11 E 0 1
12 E 1 2
13 E 0 3
14 E 1 4
15 E 1 5
16 F 0 1
17 F 0 2
18 F 0 3
19 F 0 4
20 F 0 5
## filter event times between the start of the trial and when the subject has the event for the first time
# Attempt: keep, per subject, the rows from the first study day up to the event.
# NOTE(review): which(event == 1) is empty for a subject with no event and has
# several elements for a subject with repeated events, so `right` is not a
# single value - presumably the source of the reported
# "Expecting a single value: [extent=0]" error; confirm.
df %>%
group_by(subject) %>%
filter(., between(row_number(),
left = which(ID == 1),
right = which(event == 1)))
The last part is where my error is occurring.
Is this what you're after?
# Per subject, keep every row before the first event plus the row of the first
# event itself; subjects without any event keep all of their rows.
df2 <- df %>%
  group_by(subject) %>%
  mutate(events_so_far = cumsum(event)) %>%
  filter(events_so_far == 0 | (events_so_far == 1 & event == 1)) %>%
  select(-events_so_far)
Result:
# A tibble: 16 x 2
# Groups: subject [6]
subject event
<fct> <dbl>
1 A 0
2 A 0
3 A 1
4 B 0
5 B 0
6 C 0
7 C 0
8 C 1
9 D 0
10 E 0
11 E 1
12 F 0
13 F 0
14 F 0
15 F 0
16 F 0

Resources