Calculation of a new variable in R - r

this is the dataframe
name team stat1 stat2
a aa 1 4
b aa 2 3
c bb 3 2
d bb 4 1
I want to add a new variable, calculated for each player as
-> (( stat1 of player 'a' / sum of stat1 for that team ) + ( stat2 of player 'a' / sum of stat2 for that team ))
-> ((1/(1+2)) + (4/(4+3))) = 0.905 for player 'a'
any idea on how to do this?

We can group by 'team', and then do the calculation to create a new column
library(dplyr)
# Within each team, add every player's share of the team total for stat1 to
# the same share for stat2, then drop the grouping again.
df1 <- df1 %>%
  group_by(team) %>%
  mutate(new = stat1 / sum(stat1) + stat2 / sum(stat2)) %>%
  ungroup()
-output
df1
# A tibble: 4 × 5
name team stat1 stat2 new
<chr> <chr> <int> <int> <dbl>
1 a aa 1 4 0.905
2 b aa 2 3 1.10
3 c bb 3 2 1.10
4 d bb 4 1 0.905
data
# Reproducible input: four players split across two teams.
df1 <- data.frame(
  name = c("a", "b", "c", "d"),
  team = rep(c("aa", "bb"), each = 2),
  stat1 = 1:4,
  stat2 = 4:1,
  stringsAsFactors = FALSE
)

Related

How to combine two rows of a dataframe into one row

I have a dataframe which looks like this.
Name info.1 info.2
ab a 1
123 a 1
de c 4
456 c 4
fg d 5
789 d 5
The two rows that need to be combined are identical aside from the name column and are together in the dataframe. I want the new dataframe to look like this:
Name ID info.1 info.2
ab 123 a 1
de 456 c 4
fg 789 d 5
I have no clue how to do this and google search hasn't been helpful so far
In base R you could do:
# Rows alternate name, ID, name, ID, ...: odd rows hold the name and even rows
# hold the matching ID, while the info columns are duplicated on both rows.
# The info columns are taken from the even rows, hence the row names 2, 4, 6.
data.frame(Name = df[seq_len(nrow(df)) %% 2 == 1, 1],
           ID = df[seq_len(nrow(df)) %% 2 == 0, 1],
           df[seq_len(nrow(df)) %% 2 == 0, 2:3])
#> Name ID info.1 info.2
#> 2 ab 123 a 1
#> 4 de 456 c 4
#> 6 fg 789 d 5
Created on 2022-07-20 by the reprex package (v2.0.1)
A possible solution:
library(tidyverse)
# Glue the two Name values of each info.1 group into one "name_id" string,
# then split that string back into separate Name and ID columns.
df %>%
  group_by(info.1) %>%
  summarise(Name = str_c(Name, collapse = "_"), info.2 = first(info.2)) %>%
  # convert = TRUE re-types the split columns (ID prints as <int> below);
  # TRUE is spelled out because T is an ordinary, reassignable variable.
  separate(Name, into = c("Name", "ID"), convert = TRUE) %>%
  relocate(info.1, .before = info.2)
#> # A tibble: 3 × 4
#> Name ID info.1 info.2
#> <chr> <int> <chr> <int>
#> 1 ab 123 a 1
#> 2 de 456 c 4
#> 3 fg 789 d 5
Assuming the Name column is consistently ordered Name-ID-Name-ID then:
library(tidyverse)
# Example input; every Name entry is a string, the numeric-looking IDs included.
data <- tibble(
  Name = c("ab", "123", "de", "456", "fg", "789"),
  info.1 = c("a", "a", "c", "c", "d", "d"),
  info.2 = c(1, 1, 4, 4, 5, 5)
)
# Keep one row per distinct (info.1, info.2) pair, leaving out the mixed
# Name column.
data_2 <- distinct(select(data, info.1, info.2))
# Names sit at the odd positions of the original column and IDs at the even
# positions; a recycled logical mask picks every other entry.
data_2$Name <- data$Name[c(TRUE, FALSE)]
data_2$ID <- data$Name[c(FALSE, TRUE)]
We could also use summarise and extract first as name and last as id:
# Collapse each (info.1, info.2) group to a single row: the first Name in the
# group becomes `name` and the last one becomes `ID`.
data |>
group_by(info.1, info.2) |>
summarise(name = first(Name), ID = last(Name)) |>
ungroup() #|>
# Disabled step: presumably meant to move name/ID ahead of the info columns.
#relocate(3:4,1:2)
Output:
# A tibble: 3 × 4
info.1 info.2 name ID
<chr> <dbl> <chr> <chr>
1 a 1 ab 123
2 c 4 de 456
3 d 5 fg 789
We could also use
library(dplyr)
library(stringr)
data %>%
# Group on every column whose name starts with 'info'.
group_by(across(starts_with('info'))) %>%
# Within each group, ID is the Name entry made up only of digits; the new
# column is inserted before column 2.
mutate(ID = str_subset(Name, "^\\d+$"), .before = 2) %>%
ungroup %>%
# Drop the rows whose Name is all digits, keeping one row per pair.
filter(str_detect(Name, '^\\d+$', negate = TRUE))
-output
# A tibble: 3 × 4
Name ID info.1 info.2
<chr> <chr> <chr> <dbl>
1 ab 123 a 1
2 de 456 c 4
3 fg 789 d 5
data
# Reproducible input: name/ID rows alternate and share the same info values.
data <- data.frame(
  Name = c("ab", "123", "de", "456", "fg", "789"),
  info.1 = rep(c("a", "c", "d"), each = 2),
  info.2 = rep(c(1, 4, 5), each = 2),
  stringsAsFactors = FALSE
)

How to stack rows and create a new variable in R

Here is a small sample of my data
AB AN AQ AP AA
1 O1 N 12 13
2 K1 B 22 16
I want to generate this table
AB AN AQ New AP
1 O1 N 1 12
1 O1 N 2 13
2 K1 B 1 22
2 K1 B 2 16
The logic is to stack the same data in AB, AN and AQ, then generate a new column which gets 1, followed by the value of AP.
Under this row, the same data is repeated, but the New column gets 2 and is followed by the value of AA. So the new column takes the values 1 and 2.
An option with reshape from base R
# Rename columns 4 and 5 to AP1 and AP2 so the two measurement columns share
# the stem "AP" and differ only in a numeric suffix.
names(df)[4:5] <- paste0("AP", 1:2)
# Stack both columns into one; with sep = "" the names are split into the
# stem and the suffix, and the suffix is stored in the column named "New".
reshape(df, direction = "long", varying = 4:5, sep= "", timevar = "New")
# AB AN AQ New AP id
#1.1 1 O1 N 1 12 1
#2.1 2 K1 B 1 22 2
#1.2 1 O1 N 2 13 1
#2.2 2 K1 B 2 16 2
data
# Reproducible input: two rows with three key columns and two value columns.
df <- data.frame(
  AB = 1:2,
  AN = c("O1", "K1"),
  AQ = c("N", "B"),
  AP = c(12L, 22L),
  AA = c(13L, 16L),
  stringsAsFactors = FALSE
)
You can get the data in long format and then generate a new column based on unique column values.
library(dplyr)
library(tidyr)
# Stack AP and AA into a single value column called AP; the source column
# name lands in New and is then replaced by its order of first appearance
# (AP -> 1, AA -> 2).
df %>%
  pivot_longer(cols = c(AP, AA), names_to = "New", values_to = "AP") %>%
  mutate(New = match(New, unique(New)))
# AB AN AQ New AP
# <int> <chr> <chr> <int> <int>
#1 1 O1 N 1 12
#2 1 O1 N 2 13
#3 2 K1 B 1 22
#4 2 K1 B 2 16
data
# Reproducible input: two rows with three key columns and two value columns.
df <- data.frame(
  AB = 1:2,
  AN = c("O1", "K1"),
  AQ = c("N", "B"),
  AP = c(12L, 22L),
  AA = c(13L, 16L),
  stringsAsFactors = FALSE
)

Wide to long, combining columns in pairs but keeping ID column - R

I have a dataframe of the following type
ID case1 case2 case3 case4
1 A B C D
2 B A
3 E F
4 G C A
5 T
I need to change its format to a long shape, similar to the one below:
ID col1 col2
1 A B
1 A C
1 A D
1 B C
1 B D
1 C D
2 B A
3 E F
4 G C
4 G A
4 C A
5 T
As you can see, I need to maintain the ID and ignore empty columns. There are some cases like T that need to remain in the dataset, but without a col2.
I am honestly not sure how to approach this, so that is why there are no examples of what I have tried.
You can get the data in long format and create all pairwise combinations of values for each ID that has more than one row.
library(dplyr)
library(tidyr)
df %>%
# Long format: one row per non-missing case value of each ID.
pivot_longer(cols = -ID, values_drop_na = TRUE) %>%
group_by(ID) %>%
# With more than one value, combn(value, 2) lists every pair as a column, so
# the transpose has one pair per row, named col1/col2. A lone value is kept
# as col1 with a missing col2.
summarise(value = if(n() > 1) list(setNames(as.data.frame(t(combn(value, 2))),
c('col1', 'col2')))
else list(data.frame(col1 = value[1], col2 = NA_character_))) %>%
# Expand the per-ID list-column of data frames into ordinary rows.
unnest(value)
# A tibble: 12 x 3
# ID col1 col2
# <int> <chr> <chr>
# 1 1 A B
# 2 1 A C
# 3 1 A D
# 4 1 B C
# 5 1 B D
# 6 1 C D
# 7 2 B A
# 8 3 E F
# 9 4 G C
#10 4 G A
#11 4 C A
#12 5 T NA
data
# Reproducible input: up to four case values per ID, padded with NA.
df <- data.frame(
  ID = 1:5,
  case1 = c("A", "B", "E", "G", "T"),
  case2 = c("B", "A", "F", "C", NA),
  case3 = c("C", NA, NA, "A", NA),
  case4 = c("D", NA, NA, NA, NA),
  stringsAsFactors = FALSE
)

Left_join fill NA entries with data values from the second dataframe

I have two fairly complicated data.frames and managed to simplify the first step of my problem here. I have a reference table and another that contains my data as follows:
REFERENCE
# Reference table: one row per group (A, B, C), each assigned a position.
ref <- structure(list(group = c("A", "B", "C"), position = c("a", "a",
"b")), row.names = c(NA, -3L), class = c("tbl_df", "tbl", "data.frame"))
DATA
# Data table: one row per name, both at position "a" with value 1. Every
# column has exactly two entries, matching row.names = c(NA, -2L).
df <- structure(list(position = c("a", "a"), value = c(1, 1), name = c("foo",
"bar")), row.names = c(NA, -2L), class = c("tbl_df", "tbl", "data.frame"))
I used left_join(ref,df,by="position") %>% arrange(name) to obtain:
1 A a 1 foo
2 A a 1 bar
3 B a 1 foo
4 B a 1 bar
5 C b NA NA
The ideal output however is:
group position value name
<chr> <chr> <dbl> <chr>
1 A a 1 bar
2 B a 1 bar
3 C b 0 bar
4 A a 1 foo
5 B a 1 foo
6 C b 0 foo
I would like the name column to replace NA with the input from df and the value column's NA with 0. In the real df, I have more than foo in the name column
We could use crossing to get the combinations, then replace the 'value' column values to 0 where the 'position' columns are not equal
library(dplyr)
library(tidyr)
# Pair every reference row with every data row. The data's position is kept
# under another name so the two positions can be compared; where they differ
# the value is set to 0, and the helper column is then removed.
crossing(ref, rename(df, position2 = position)) %>%
  arrange(name) %>%
  mutate(value = replace(value, position != position2, 0)) %>%
  select(-position2)
# A tibble: 6 x 4
# group position value name
# <chr> <chr> <dbl> <chr>
#1 A a 1 bar
#2 B a 1 bar
#3 C b 0 bar
#4 A a 1 foo
#5 B a 1 foo
#6 C b 0 foo

Reorder, exclude a column and keep others in R?

Here is my toy dataframe:
# Toy two-row tibble with numeric columns a, b, c and d.
structure(list(a = c(1, 2), b = c(3, 4), c = c(5, 6), d = c(7,
8)), .Names = c("a", "b", "c", "d"), row.names = c(NA, -2L), class = c("tbl_df",
"tbl", "data.frame"))
Now I want to reorder the columns, exclude one of them, and keep the others:
df %>% select(-a, d, everything())
I want my df to be :
d b c
7 3 5
8 4 6
I get the following:
b c d a
<dbl> <dbl> <dbl> <dbl>
1 3 5 7 1
2 4 6 8 2
Put the -a last in the select. Even though a is removed at the beginning, the everything() at the end still checks the column names of the whole dataset, so it brings a back.
# Move d to the front, keep the remaining columns, and exclude a last so that
# everything() cannot select it again.
df%>%
select(d, everything(), -a)
# A tibble: 2 x 3
# d b c
# <dbl> <dbl> <dbl>
#1 7 3 5
#2 8 4 6

Resources