I have a dataframe 'df' where I want to summarize how many times each 'user' has a higher 'total' value for each head-to-head 'game'. My data frame looks like this:
game
user
total
1
L
55
1
J
60
2
L
64
2
J
77
3
L
90
3
J
67
4
L
98
4
J
88
5
L
71
5
J
92
The summary would state that L had a higer total in 2 games and J had a higher total in 3 games.
Thank you!
Same approach as Vinay, using data.table
library(data.table)
setDT(df)
df[order(total), tail(.SD, 1), game][, .N, user]
#> user N
#> <char> <int>
#> 1: J 3
#> 2: L 2
Created on 2022-01-19 by the reprex package (v2.0.1)
Data used:
df <- structure(list(game = c(1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L
), user = c("L", "J", "L", "J", "L", "J", "L", "J", "L", "J"),
total = c(55L, 60L, 64L, 77L, 90L, 67L, 98L, 88L, 71L, 92L
)), row.names = c(NA, -10L), class = "data.frame")
Assuming df is your dataframe the following should give you the long form summary.
df %>%
arrange(game,desc(total)) %>% #we sort descending to ensure winner row is first.
group_by(game) %>% # we group the rows per game, this allows for winner row to be first in each group
slice_head(n=1)%>% #get first row in each group i.e winner row
ungroup()
Output:
# A tibble: 5 × 3
game user total
<int> <chr> <int>
1 1 J 60
2 2 J 77
3 3 L 90
4 4 L 98
5 5 J 92
If you just want the user wise summary add count to the code as follows:
df %>%
arrange(game,desc(total)) %>% #we sort descending to ensure winner row is first.
group_by(game) %>% # we group the rows per game, this allows for winner row to be first in each group
slice_head(n=1) %>% #get first row in each group i.e winner row
ungroup() %>%
count(user)
Output:
# A tibble: 2 × 2
user n
<chr> <int>
1 J 3
2 L 2
We can group the data by game, slice_max and then count the resulting data.
library(tidyverse)
df %>% group_by(game) %>%
slice_max(total) %>%
ungroup() %>%
count(user)
#> # A tibble: 2 × 2
#> user n
#> <chr> <int>
#> 1 J 3
#> 2 L 2
Created on 2022-01-20 by the reprex package (v2.0.1)
Note that if there's a tie, it will add one to both teams:
library(tidyverse)
df <-
read_table('game user total
1 L 60
1 J 60
2 L 64
2 J 77
3 L 90
3 J 67
4 L 98
4 J 88
5 L 71
5 J 92')
df %>% group_by(game) %>%
slice_max(total) %>%
ungroup() %>%
count(user)
#> # A tibble: 2 × 2
#> user n
#> <chr> <int>
#> 1 J 3
#> 2 L 3
Created on 2022-01-20 by the reprex package (v2.0.1)
Is this the type of output you want?
library(tidyverse)
df <- structure(list(game = c(1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L),
user = c("L", "J", "L", "J", "L", "J", "L", "J", "L", "J"),
total = c(55L, 60L, 64L, 77L, 90L, 67L, 98L, 88L, 71L, 92L)
), class = "data.frame", row.names = c(NA, -10L))
df %>%
group_by(game) %>%
slice_max(order_by = total,
n = 1,
with_ties = TRUE) %>%
group_by(user) %>%
summarise(wins = n())
#> # A tibble: 2 × 2
#> user wins
#> <chr> <int>
#> 1 J 3
#> 2 L 2
Created on 2022-01-20 by the reprex package (v2.0.1)
Edit
If you have a draw, then the above method counts that as a 'win' for both users. To count a draw as 'no winner' for both users (e.g. shown in game 1, below), you could use:
library(tidyverse)
df <- structure(list(game = c(1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L),
user = c("L", "J", "L", "J", "L", "J", "L", "J", "L", "J"),
total = c(55L, 55L, 64L, 77L, 90L, 67L, 98L, 88L, 71L, 92L)
), class = "data.frame", row.names = c(NA, -10L))
df
#> game user total
#> 1 1 L 55
#> 2 1 J 55
#> 3 2 L 64
#> 4 2 J 77
#> 5 3 L 90
#> 6 3 J 67
#> 7 4 L 98
#> 8 4 J 88
#> 9 5 L 71
#> 10 5 J 92
df %>%
group_by(game) %>%
distinct(total, .keep_all = TRUE) %>%
filter(n() >= 2) %>%
slice_max(order_by = total,
n = 1,
with_ties = FALSE) %>%
group_by(user) %>%
summarise(win = n())
#> # A tibble: 2 × 2
#> user win
#> <chr> <int>
#> 1 J 2
#> 2 L 2
Created on 2022-01-20 by the reprex package (v2.0.1)
Related
I've been looking at the various answers for similar issues, but can't see anything that quite answers my problem.
I have a large data table
Number_X
Amount
1
100
2
100
1
100
3
100
1
100
2
100
I want to replace the amount with 50 for those rows where Number_X == 1.
I've tried
library(dplyr)
data <- data %>%
mutate(Amount = replace(Amount, Number_X == 1, 50))
but it doesn't change the value for Amount. How can I fix this?
# set as data.table
setDT(df)
# if then
df[ Number_X == 1, Amount := 50]
With large data, a data.table solution is most appropriate.
I don't see an issue with using replace() but you can also try to use if_else()
library(dplyr, warn.conflicts = FALSE)
data <- tibble(
Number_X = c(1L, 2L, 1L, 3L, 1L, 2L),
Amount = c(100L, 100L, 100L, 100L, 100L, 100L)
)
data %>%
mutate(Amount = replace(Amount, Number_X == 1, 50L))
#> # A tibble: 6 x 2
#> Number_X Amount
#> <int> <int>
#> 1 1 50
#> 2 2 100
#> 3 1 50
#> 4 3 100
#> 5 1 50
#> 6 2 100
data %>%
mutate(Amount = if_else(Number_X == 1, 50L, Amount))
#> # A tibble: 6 x 2
#> Number_X Amount
#> <int> <int>
#> 1 1 50
#> 2 2 100
#> 3 1 50
#> 4 3 100
#> 5 1 50
#> 6 2 100
Created on 2022-02-04 by the reprex package (v2.0.1)
Tip: Use dput() with your data to share it more easily:
dput(data)
#> structure(list(Number_X = c(1L, 2L, 1L, 3L, 1L, 2L), Amount = c(100L,
#> 100L, 100L, 100L, 100L, 100L)), class = c("tbl_df", "tbl", "data.frame"
#> ), row.names = c(NA, -6L))
If you want a tidyverse approach:
data %>%
mutate(Amount = ifelse(Number_X == 1, 50, Amount))
If you want almost the speed of data.table and the grammar of dplyr, you can consider dtplyr.
I have a dataframe df like so
height age dept
69 18 A
44 8 B
72 19 B
58 34 C
I want to one-hot encode only the factor variables (only dept is a factor). How can i do this?
Currently right now I'm selecting everything..
and getting this warning:
Warning message:
The following variables are not factor vectors and will be ignored: height, age
ohe <- df %>%
recipes::recipe(~ .) %>%
recipes::step_dummy(tidyselect::everything()) %>%
recipes::prep() %>%
recipes::bake(df)
Use the where with is.factor instead of everything
library(dplyr)
df %>%
recipes::recipe(~ .) %>%
recipes::step_dummy(tidyselect:::where(is.factor)) %>%
recipes::prep() %>%
recipes::bake(df)
-output
# A tibble: 4 × 4
height age dept_B dept_C
<int> <int> <dbl> <dbl>
1 69 18 0 0
2 44 8 1 0
3 72 19 1 0
4 58 34 0 1
data
df <- structure(list(height = c(69L, 44L, 72L, 58L), age = c(18L, 8L,
19L, 34L), dept = structure(c(1L, 2L, 2L, 3L), .Label = c("A",
"B", "C"), class = "factor")), row.names = c(NA, -4L), class = "data.frame")
I have a dataframe such as
Groups NAMES start end
G1 A 1 50
G1 A 25 45
G1 B 20 51
G1 A 51 49
G2 A 200 400
G2 B 1 1600
G2 A 2000 3000
G2 B 4000 5000
and the idea is within each Groups to look at NAMES where start & end coordinates of A are within coordinates of B
for instance here in the example :
Groups NAMES start end
G1 A 1 50 <- A is outside any B coordinate
G1 A 25 45 <- A is **inside** the B coord `20-51`,then I remove these B row.
G1 B 20 51
G1 A 51 49 <- A is outside any B coordinate
G2 A 200 400 <- A is **inside** the B coordinate 1-1600, then I romove this B row.
G2 B 1 1600
G2 A 2000 3000 <- A is outside any B coordinate
G2 B 4000 5000 <- this one does not have any A inside it, then it will be kept in the output.
Then I should get as output :
Groups NAMES start end
G1 A 1 50
G1 A 25 45
G1 A 51 49
G2 A 200 400
G2 A 2000 3000
G2 B 4000 5000
Does someone have an idea please ?
Here is the dataframe in dput format if it can help you ? :
structure(list(Groups = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L), .Label = c("G1", "G2"), class = "factor"), NAMES = structure(c(1L,
1L, 2L, 1L, 1L, 2L, 1L, 2L), .Label = c("A", "B"), class = "factor"),
start = c(1L, 25L, 20L, 51L, 200L, 1L, 2000L, 4000L), end = c(50L,
45L, 51L, 49L, 400L, 1600L, 3000L, 5000L)), class = "data.frame", row.names = c(NA,
-8L))
Here's a possible approach. We'll split the df by NAMES and join the two parts to each other by Groups to do within-group comparisons. Only B rows can get dropped, so those are the only ones whose row numbers we want to keep track of.
We can then just group by rowid to tag the B rows by whether or not they have any A inside them. Finally, filter to the B to keep and concatenate back to the A rows.
library(tidyverse)
df <- structure(list(Groups = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), .Label = c("G1", "G2"), class = "factor"), NAMES = structure(c(1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L), .Label = c("A", "B"), class = "factor"), start = c(1L, 25L, 20L, 51L, 200L, 1L, 2000L, 4000L), end = c(50L, 45L, 51L, 49L, 400L, 1600L, 3000L, 5000L)), class = "data.frame", row.names = c(NA, -8L))
A <- filter(df, NAMES == "A")
B <- df %>%
filter(NAMES == "B") %>%
rowid_to_column()
comparison <- inner_join(A, B, by = "Groups") %>%
mutate(A_in_B = start.x >= start.y & end.x <= end.y) %>%
group_by(rowid) %>%
summarise(keep_B = !any(A_in_B))
B %>%
inner_join(comparison, by = "rowid") %>%
filter(keep_B) %>%
select(-rowid, -keep_B) %>%
bind_rows(A) %>%
arrange(Groups, NAMES)
#> Groups NAMES start end
#> 1 G1 A 1 50
#> 2 G1 A 25 45
#> 3 G1 A 51 49
#> 4 G2 A 200 400
#> 5 G2 A 2000 3000
#> 6 G2 B 4000 5000
Created on 2021-07-27 by the reprex package (v1.0.0)
This will also do using purrr::map_dfr
library(tidyverse)
df %>%
group_split(Groups) %>%
map_dfr(~ .x %>% mutate(r = row_number()) %>%
full_join(.x %>%
filter(NAMES == 'B'),
by = 'Groups') %>%
group_by(r) %>%
filter(any(NAMES.x == 'B' | start.x > start.y & end.x < end.y)) %>%
ungroup %>%
select(Groups, ends_with('.x')) %>%
distinct %>%
rename_with(~ gsub('\\.x', '', .), everything())
)
#> # A tibble: 6 x 4
#> Groups NAMES start end
#> <fct> <fct> <int> <int>
#> 1 G1 A 25 45
#> 2 G1 B 20 51
#> 3 G1 A 51 49
#> 4 G2 A 200 400
#> 5 G2 B 1 1600
#> 6 G2 B 4000 5000
Created on 2021-07-27 by the reprex package (v2.0.0)
I have this dataframe in R
Party Pro2005 Anti2005 Pro2006 Anti2006 Pro2007 Anti2007
R 1 18 0 7 2 13
R 1 19 0 7 1 14
D 13 7 3 4 10 5
D 12 8 3 4 9 6
I want to aggregate it to where it will combined all the pros and anti based on party
for example
Party ProSum AntiSum
R. 234. 245
D. 234. 245
How would I do that in R?
You can use:
library(tidyverse)
df %>%
pivot_longer(-Party,
names_to = c(".value", NA),
names_pattern = "([a-zA-Z]*)([0-9]*)") %>%
group_by(Party) %>%
summarise(across(where(is.numeric), sum, na.rm = T))
# A tibble: 2 x 3
Party Pro Anti
<chr> <int> <int>
1 D 50 34
2 R 5 78
I would suggest a tidyverse approach reshaping the data and the computing the sum of values:
library(tidyverse)
#Data
df <- structure(list(Party = c("R", "R", "D", "D"), Pro2005 = c(1L,
1L, 13L, 12L), Anti2005 = c(18L, 19L, 7L, 8L), Pro2006 = c(0L,
0L, 3L, 3L), Anti2006 = c(7L, 7L, 4L, 4L), Pro2007 = c(2L, 1L,
10L, 9L), Anti2007 = c(13L, 14L, 5L, 6L)), class = "data.frame", row.names = c(NA,
-4L))
The code:
df %>% pivot_longer(cols = -1) %>%
#Format strings
mutate(name=gsub('\\d+','',name)) %>%
#Aggregate
group_by(Party,name) %>% summarise(value=sum(value,na.rm=T)) %>%
pivot_wider(names_from = name,values_from=value)
The output:
# A tibble: 2 x 3
# Groups: Party [2]
Party Anti Pro
<chr> <int> <int>
1 D 34 50
2 R 78 5
Splitting by parties and loop sum over the pro/anti using sapply, finally rbind.
res <- data.frame(Party=sort(unique(d$Party)), do.call(rbind, by(d, d$Party, function(x)
sapply(c("Pro", "Anti"), function(y) sum(x[grep(y, names(x))])))))
res
# Party Pro Anti
# D D 50 34
# R R 5 78
An outer solution is also suitable.
t(outer(c("Pro", "Anti"), c("R", "D"),
Vectorize(function(x, y) sum(d[d$Party %in% y, grep(x, names(d))]))))
# [,1] [,2]
# [1,] 5 78
# [2,] 50 34
Data:
d <- read.table(header=T, text="Party Pro2005 Anti2005 Pro2006 Anti2006 Pro2007 Anti2007
R 1 18 0 7 2 13
R 1 19 0 7 1 14
D 13 7 3 4 10 5
D 12 8 3 4 9 6 ")
I have an example dataset:
Road Start End Cat
1 0 50 a
1 50 60 b
1 60 90 b
1 70 75 a
2 0 20 a
2 20 25 a
2 25 40 b
Trying to output following:
Road Start End Cat
1 0 50 a
1 50 90 b
1 70 75 a
2 0 25 a
2 25 40 b
My code doesn't work:
df %>% group_by(Road, cat)
%>% summarise(
min(Start),
max(End)
)
How can I achieve the results I wanted?
We can use rleid from data.table to get the run-length-id-encoding for grouping and then do the summarise
library(dplyr)
library(data.table)
df %>%
group_by(Road, grp = rleid(Cat)) %>%
summarise(Cat = first(Cat), Start = min(Start), End = max(End)) %>%
select(-grp)
# A tibble: 5 x 4
# Groups: Road [2]
# Road Cat Start End
# <int> <chr> <int> <int>
#1 1 a 0 50
#2 1 b 50 90
#3 1 a 70 75
#4 2 a 0 25
#5 2 b 25 40
Or using data.table methods
library(data.table)
setDT(df)[, .(Start = min(Start), End = max(End)), .(Road, Cat, grp = rleid(Cat))]
data
df <- structure(list(Road = c(1L, 1L, 1L, 1L, 2L, 2L, 2L), Start = c(0L,
50L, 60L, 70L, 0L, 20L, 25L), End = c(50L, 60L, 90L, 75L, 20L,
25L, 40L), Cat = c("a", "b", "b", "a", "a", "a", "b")),
class = "data.frame", row.names = c(NA,
-7L))