aggregate and removes duplicates for some rows

aggregate and removes duplicates for some rows - r

I have a dataset like
df <- data.frame(id = c("a","a","b","b","c","d","e","f"),
val = c(1,2,3,4,5,6,7,8),
extracol = c("x",NA,"y","z","t","v","u","p"))
id val extracol
1 a 1 x
2 a 2 <NA>
3 b 3 y
4 b 4 z
5 c 5 t
6 d 6 v
7 e 7 u
8 f 8 p
and I want to sum (and aggregate) the values according to the column id but only for "a". So I want to get something like:
id val extracol
1 a 3 x
2 b 3 y
3 b 4 z
4 c 5 t
5 d 6 v
6 e 7 u
7 f 8 p
I really don't care if I get "x" or NA in the extracol. Any suggestion?

This would work:
library(dplyr)
df <- data.frame(id = c("a","a","b","b","c","d","e","f"),
val = c(1,2,3,4,5,6,7,8),
extracol = c("x",NA,"y","z","t","v","u","p"))
# keep only a
a = df%>% filter(id == "a")
# aggregate a
a_agg= a %>% group_by(id) %>% summarise(val = sum(val), extracol = first(extracol))
# drop a
df = df %>% filter(id != "a")
# append a
df = rbind(df, a_agg)
df
id val extracol
1 b 3 y
2 b 4 z
3 c 5 t
4 d 6 v
5 e 7 u
6 f 8 p
7 a 3 x

A base R option
with(
df,
rbind(
data.frame(
id = "a",
val = sum(val[id == "a"]),
extracol = na.omit(extracol[id == "a"])
),
df[id != "a", ]
)
)
gives
id val extracol
1 a 3 x
3 b 3 y
4 b 4 z
5 c 5 t
6 d 6 v
7 e 7 u
8 f 8 p

Related

R: expand grid of all possible combinations within groups and apply functions across all the pairs

data <- tibble(time = c(1,1,2,2), a = c(1,2,3,4), b =c(4,3,2,1), c = c(1,1,1,1))
The result will look like this
result <- tibble(
t = c(1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2),
firm1 = c("a","a","a","b","b","b","c","c","c","a","a","a","b","b","b","c","c","c"),
firm2 = c("a","b","c","a","b","c","a","b","c","a","b","c","a","b","c","a","b","c"),
value = c(6,10,5,10,14,9,5,9,4,14,10,9,10,6,5,9,5,4))
result
The function could be
function(x, y){sum(x, y)}
Basically I am looking for a tidy solution to expand.grid data at each point of time and apply functions across columns. Can anyone help?
I tried this, but I could not have time in front of the pairs.
expected_result<-expand.grid(names(data[-1]), names(data[-1])) %>%
mutate(value = map2(Var1, Var2, ~ fun1(data[.x], data[.y])))
expected_result

Use exand.grid you get all possible combination of columns, split the data by time and apply fun for each row of tmp.
library(dplyr)
library(purrr)
tmp <- expand.grid(firm1 = names(data[-1]), firm2 = names(data[-1]))
fun <- function(x, y) sum(x, y)
result <- data %>%
group_split(time) %>%
map_df(~cbind(time = .x$time[1], tmp,
value = apply(tmp, 1, function(x) fun(.x[[x[1]]], .x[[x[2]]]))))
result
# time firm1 firm2 value
#1 1 a a 6
#2 1 b a 10
#3 1 c a 5
#4 1 a b 10
#5 1 b b 14
#6 1 c b 9
#7 1 a c 5
#8 1 b c 9
#9 1 c c 4
#10 2 a a 14
#11 2 b a 10
#12 2 c a 9
#13 2 a b 10
#14 2 b b 6
#15 2 c b 5
#16 2 a c 9
#17 2 b c 5
#18 2 c c 4
You may also do this in base R -
result <- do.call(rbind, by(data, data$time, function(x) {
cbind(time = x$time[1], tmp,
value = apply(tmp, 1, function(y) fun(x[[y[1]]], x[[y[2]]])))
}))

We may use
library(dplyr)
library(tidyr)
library(purrr)
data1 <- data %>%
group_by(time) %>%
summarise(across(everything(), sum, na.rm = TRUE), .groups = 'drop') %>%
pivot_longer(cols = -time) %>%
group_split(time)
map_dfr(data1, ~ {dat <- .x
crossing(firm1 = dat$name, firm2 = dat$name) %>%
mutate(value = c(outer(dat$value, dat$value, FUN = `+`))) %>%
mutate(time = first(dat$time), .before = 1)})
-output
# A tibble: 18 × 4
time firm1 firm2 value
<dbl> <chr> <chr> <dbl>
1 1 a a 6
2 1 a b 10
3 1 a c 5
4 1 b a 10
5 1 b b 14
6 1 b c 9
7 1 c a 5
8 1 c b 9
9 1 c c 4
10 2 a a 14
11 2 a b 10
12 2 a c 9
13 2 b a 10
14 2 b b 6
15 2 b c 5
16 2 c a 9
17 2 c b 5
18 2 c c 4

Extract rows where value appears in any of multiple columns

Let' say I have two data.frames
name_df = read.table(text = "player_name
a
b
c
d
e
f
g", header = T)
game_df = read.table(text = "game_id winner_name loser_name
1 a b
2 b a
3 a c
4 a d
5 b c
6 c d
7 d e
8 e f
9 f a
10 g f
11 g a
12 f e
13 a d", header = T)
name_df contains a unique list of all the winner_name or loser_name values in game_df. I want to create a new data.frame that has, for each person in the name_df a row if a given name (e.g. a) appears in either the winner_name or loser_name column
So I essentially want to merge game_df with name_df, but the key column (name) can appear in either winner_name or loser_name.
So, for just a and b the final output would look something like:
final_df = read.table(text = "player_name game_id winner_name loser_name
a 1 a b
a 2 b a
a 3 a c
a 4 a d
a 9 f a
a 11 g a
a 13 a d
b 1 a b
b 2 b a
b 5 b c", header = T)

We can loop over the elements in 'name_df' for 'player_name', filter the rows from 'game_df' for either the 'winner_name' or 'loser_name'
library(dplyr)
library(purrr)
map_dfr(setNames(name_df$player_name, name_df$player_name),
~ game_df %>%
filter(winner_name %in% .x|loser_name %in% .x), .id = 'player_name')
Or if there are many columns, use if_any
map_dfr(setNames(name_df$player_name, name_df$player_name),
~ {
nm1 <- .x
game_df %>%
filter(if_any(c(winner_name, loser_name), ~ . %in% nm1))
}, .id = 'player_name')

Dedicated to our teacher and mentor dear #akrun
I think we can also make use of the add_row() function you first taught me the other day. Unbelievable!!!
library(dplyr)
library(purrr)
library(tibble)
game_df %>%
rowwise() %>%
mutate(player_name = winner_name) %>%
group_split(game_id) %>%
map_dfr(~ add_row(.x, game_id = .x$game_id, winner_name = .x$winner_name,
loser_name = .x$loser_name, player_name = .x$loser_name)) %>%
arrange(player_name) %>%
relocate(player_name)
# A tibble: 26 x 4
player_name game_id winner_name loser_name
<chr> <int> <chr> <chr>
1 a 1 a b
2 a 2 b a
3 a 3 a c
4 a 4 a d
5 a 9 f a
6 a 11 g a
7 a 13 a d
8 b 1 a b
9 b 2 b a
10 b 5 b c
# ... with 16 more rows

This can be directly expressed in SQL:
library(sqldf)
sqldf("select *
from name_df
left join game_df on winner_name = player_name or loser_name = player_name")

Without using purrr. I think this is appropriate use case of tidyr::unite with argument remove = F where we can first unite the winners' and losers' names and then use tidyr::separate_rows to split new column into rows.
library(tidyr)
library(dplyr)
game_df %>% unite(Player_name, winner_name, loser_name, remove = F, sep = ', ') %>%
separate_rows(Player_name) %>%
relocate(Player_name) %>%
arrange(Player_name)
# A tibble: 26 x 4
Player_name game_id winner_name loser_name
<chr> <int> <chr> <chr>
1 a 1 a b
2 a 2 b a
3 a 3 a c
4 a 4 a d
5 a 9 f a
6 a 11 g a
7 a 13 a d
8 b 1 a b
9 b 2 b a
10 b 5 b c
# ... with 16 more rows

A Base R approach :
result <- do.call(rbind, lapply(name_df$player_name, function(x)
cbind(plaername = x,
subset(game_df, winner_name == x | loser_name == x))))
rownames(result) <- NULL
result
# playername game_id winner_name loser_name
#1 a 1 a b
#2 a 2 b a
#3 a 3 a c
#4 a 4 a d
#5 a 9 f a
#6 a 11 g a
#7 a 13 a d
#8 b 1 a b
#...
#...

Data.frame transform data with criteria from another data.frame like Excel "criteria table"

I have data.frame df1 and a data.frame df2. How do I use df2 to mutate/transform df1 to merged data.frame where: column name will be filled with the value on df2$name if df1$id >= df2$start and <= df2$end.
df1 = data.frame(id = 1:10, c = letters[1:10])
df2 = data.frame(name = LETTERS[1:3], start = c(2, 5, 8), end = c(4,7, 9))
merged = data.frame(id = df1$id, c = df1$c, name = c(NA, "A", "A", "A", "B", "B", "B", "C", "C", NA) )
Visually:
> df1
id c
1 1 a
2 2 b
3 3 c
4 4 d
5 5 e
6 6 f
7 7 g
8 8 h
9 9 i
10 10 j
> df2
name start end
1 A 2 4
2 B 5 7
3 C 8 9
> merged
id c name
1 1 a <NA>
2 2 b A
3 3 c A
4 4 d A
5 5 e B
6 6 f B
7 7 g B
8 8 h C
9 9 i C
10 10 j <NA>

We can use non-equi join with data.table and assign a new column with the corresponding values of 'name' where the conditional join is met
library(data.table)
setDT(df1)[df2, cn := name, on = .(id > start, id <= end)]
df1
# id c cn
# 1: 1 a <NA>
# 2: 2 b <NA>
# 3: 3 c A
# 4: 4 d A
# 5: 5 e <NA>
# 6: 6 f B
# 7: 7 g B
# 8: 8 h <NA>
# 9: 9 i C
#10: 10 j <NA>
Or another option is fuzzyjoin
library(fuzzyjoin)
library(dplyr)
fuzzy_left_join(df1, df2, by = c('id' = 'start', 'id' = 'end'),
match_fun = list(`>`, `<=`)) %>%
select(id, c, cn = name)

filter one dataframe via conditions in another

I want to recursively filter a dataframe, d by an arbitrary number of conditions (represented as rows in another dataframe z).
I begin with a dataframe d:
d <- data.frame(x = 1:10, y = letters[1:10])
The second dataframe z, has columns x1 and x2, which are lower and upper limits to filter d$x. This dataframe z may grow to be an arbitrary number of rows long.
z <- data.frame(x1 = c(1,3,8), x2 = c(1,4,10))
I want to return all rows of d for which d$x <= z$x1[i] and d$x >= z$x2[i] for all i, where i = nrow(z).
So for this toy example, exclude everything from 1:1, 3:4, 8:10, inclusive.
x y
2 2 b
5 5 e
6 6 f
7 7 g

We can create a sequence between x1 and x2 values and use anti_join to select rows from d that are not present in z.
library(tidyverse)
remove <- z %>%
mutate(x = map2(x1, x2, seq)) %>%
unnest(x) %>%
select(x)
anti_join(d, remove)
# x y
#1 2 b
#2 5 e
#3 6 f
#4 7 g

We can use a non-equi join
library(data.table)
i1 <- setDT(d)[z, .I, on = .(x >=x1, x <= x2), by = .EACHI]$I
i1
#[1] 1 3 4 8 9 10
d[i1]
# x y
#1: 1 a
#2: 3 c
#3: 4 d
#4: 8 h
#5: 9 i
#6: 10 j
d[!i1]
# x y
#1: 2 b
#2: 5 e
#3: 6 f
#4: 7 g
Or using fuzzyjoin
library(fuzzyjoin)
library(dplyr)
fuzzy_inner_join(d, z, by = c('x' = 'x1', 'x' = 'x2'),
match_fun = list(`>=`, `<=`)) %>%
select(names(d))
# A tibble: 6 x 2
# x y
# <int> <fct>
#1 1 a
#2 3 c
#3 4 d
#4 8 h
#5 9 i
#6 10 j
Or to get the rows not in 'x' from 'd'
fuzzy_anti_join(d, z, by = c('x' = 'x1', 'x' = 'x2'),
match_fun = list(`>=`, `<=`)) %>%
select(names(d))
# A tibble: 4 x 2
# x y
# <int> <fct>
#1 2 b
#2 5 e
#3 6 f
#4 7 g

How to add a row to data frame based on a condition

I've a dataframe which I want to add a row on the basis of the following conditions. The conditions are when column a is equal to C and column b is equal to 3 or 5.
Here is my dataframe
df <- data.frame(a = c("A", "B", "C", "D", "C", "A", "C", "E"),
b = c(seq(8)), stringsAsFactors = TRUE)
Whenever the condition is TRUE I want to add a row below where the condition is met add 3. I have tried the following
rbind(df, data.frame(a="add", b = "3"))
# a b
# 1 A 1
# 2 B 2
# 3 C 3
# 4 D 4
# 5 C 5
# 6 A 6
# 7 C 7
# 8 E 8
# 9 add 3
This is not the output I want. The output I want is
# a b
# 1 A 1
# 2 B 2
# 3 C 3
# 4 add 3
# 5 D 4
# 6 C 5
# 7 add 3
# 8 A 6
# 9 C 7
# 10 E 8
How can I do that? I am new to R and thank you for your help.

lens = ifelse(df$b %in% c(3, 5) & df$a == "C", 2, 1)
ind = rep(1:NROW(df), lens)
df2 = df[ind,]
df2$a = as.character(df2$a)
df2$a[cumsum(lens)[which(lens == 2)]] = "add"
df2$b[cumsum(lens)[which(lens == 2)]] = 3
df2
# a b
#1 A 1
#2 B 2
#3 C 3
#3.1 add 3
#4 D 4
#5 C 5
#5.1 add 3
#6 A 6
#7 C 7
#8 E 8

A solution using the tidyverse package.
library(tidyverse)
df2 <- df %>%
mutate(Group = lag(cumsum(a == "C" & b %in% c(3, 5)), default = FALSE)) %>%
group_split(Group) %>%
map_dfr(~ .x %>% bind_rows(tibble(a = "add", b = 3))) %>%
slice(-n()) %>%
select(-Group)
df2
# # A tibble: 10 x 2
# a b
# <chr> <dbl>
# 1 A 1
# 2 B 2
# 3 C 3
# 4 add 3
# 5 D 4
# 6 C 5
# 7 add 3
# 8 A 6
# 9 C 7
# 10 E 8

In base R, we can find out position where a = "c" and b is 3 or 5. Repeat those rows in the dataframe and replace them with required values.
pos <- which(df$a == "C" & df$b %in% c(3, 5))
df <- df[sort(c(seq(nrow(df)), pos)), ]
df[seq_along(pos) + pos, ] <- list("add", 3)
row.names(df) <- NULL
df
# a b
#1 A 1
#2 B 2
#3 C 3
#4 add 3
#5 D 4
#6 C 5
#7 add 3
#8 A 6
#9 C 7
#10 E 8
data
df <- data.frame(a = c("A", "B", "C", "D", "C", "A", "C", "E"),
b = c(seq(8)), stringsAsFactors = FALSE)

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

aggregate and removes duplicates for some rows - r

A base R option with( df, rbind( data.frame( id = "a", val = sum(val[id == "a"]), extracol = na.omit(extracol[id == "a"]) ), df[id != "a", ] ) ) gives id val extracol 1 a 3 x 3 b 3 y 4 b 4 z 5 c 5 t 6 d 6 v 7 e 7 u 8 f 8 p

Related

R: expand grid of all possible combinations within groups and apply functions across all the pairs

Extract rows where value appears in any of multiple columns

Data.frame transform data with criteria from another data.frame like Excel "criteria table"

filter one dataframe via conditions in another

How to add a row to data frame based on a condition

Categories

Resources