summing up values in R dataframe without aggregation - r

How can I add up row values in a dataframe based on conditions without having to aggregate the whole table?
I have this df:
town party votes
1 a A 1
2 a B 2
3 a C 3
4 b A 4
5 b B 5
6 b C 6
7 c A 7
8 c B 8
9 c C 9
I would like to add the votes of one party to those of another by town, without touching the values of the third one.
Basically to run df$votes[df$party == A] = df$votes[df$party == A] + df$votes[df$party == B] for each category of df$town

I'm interpreting your pseudo-code as wanting to only update party "A" to the sum of both "A" and "B"'s votes.
base R
# Process each town's rows separately: party "A" receives the combined
# "A" + "B" vote total, every other row is returned untouched, and the
# per-town pieces are stacked back into one data frame.
do.call(
  rbind,
  by(df, df$town, function(town_rows) {
    ab_total <- sum(town_rows$votes[town_rows$party %in% c("A", "B")])
    town_rows$votes[town_rows$party == "A"] <- ab_total
    town_rows
  })
)
# town party votes
# a.1 a A 3
# a.2 a B 2
# a.3 a C 3
# b.4 b A 9
# b.5 b B 5
# b.6 b C 6
# c.7 c A 15
# c.8 c B 8
# c.9 c C 9
dplyr
library(dplyr)
# Within each town, party "A" gets the town's "A" + "B" total; other rows
# keep their own votes. The grouping is dropped again at the end.
df %>%
  group_by(town) %>%
  mutate(
    votes = if_else(
      party == "A",
      sum(votes[party %in% c("A", "B")]),
      votes
    )
  ) %>%
  ungroup()
# # A tibble: 9 x 3
# town party votes
# <chr> <chr> <int>
# 1 a A 3
# 2 a B 2
# 3 a C 3
# 4 b A 9
# 5 b B 5
# 6 b C 6
# 7 c A 15
# 8 c B 8
# 9 c C 9
data.table
library(data.table)
DT <- as.data.table(df) # normally setDT(df) is canonical
# Update votes by reference, town by town: party "A" takes the "A" + "B"
# total, all other rows keep their current value.
DT[, votes := fifelse(
  party == "A",
  sum(votes[party %in% c("A", "B")]),
  votes
), by = .(town)]
# town party votes
# <char> <char> <int>
# 1: a A 3
# 2: a B 2
# 3: a C 3
# 4: b A 9
# 5: b B 5
# 6: b C 6
# 7: c A 15
# 8: c B 8
# 9: c C 9

You can try mutate with dplyr if you want to keep the structure of the dataframe
library(dplyr)
df %>%
group_by(town) %>%
mutate(sum=ifelse(party!="C", sum(votes[party!="C"]), votes)) %>%
ungroup()
# A tibble: 9 × 4
town party votes sum
<chr> <chr> <int> <int>
1 a A 1 3
2 a B 2 3
3 a C 3 3
4 b A 4 9
5 b B 5 9
6 b C 6 6
7 c A 7 15
8 c B 8 15
9 c C 9 9
Another way using summarise
df %>%
filter(party!="C") %>%
group_by(town) %>%
summarise(sum=sum(votes))
# A tibble: 3 × 2
town sum
<chr> <int>
1 a 3
2 b 9
3 c 15

tidyverse
df <- data.frame(
stringsAsFactors = FALSE,
town = c("a", "a", "a", "b", "b", "b", "c", "c", "c"),
party = c("A", "B", "C", "A", "B", "C", "A", "B", "C"),
votes = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L)
)
library(tidyverse)
df %>%
group_by(town, grp_party = party %in% c("A", "B")) %>%
mutate(new_party = paste0(party, collapse = ""), new_votes = sum(votes)) %>%
ungroup() %>%
select(-grp_party)
#> # A tibble: 9 x 5
#> town party votes new_party new_votes
#> <chr> <chr> <int> <chr> <int>
#> 1 a A 1 AB 3
#> 2 a B 2 AB 3
#> 3 a C 3 C 3
#> 4 b A 4 AB 9
#> 5 b B 5 AB 9
#> 6 b C 6 C 6
#> 7 c A 7 AB 15
#> 8 c B 8 AB 15
#> 9 c C 9 C 9
Created on 2022-02-08 by the reprex package (v2.0.1)
data.table
library(data.table)
# Replace votes with the total of its group, where a group is a town combined
# with "party is A or B" (so A and B share one total and C keeps its own).
# Summing `votes` directly avoids lapply(.SD, sum), which only works while
# votes is the sole non-grouping column of df.
setDT(df)[, votes := sum(votes), by = list(town, party %in% c("A", "B"))][]
#> town party votes
#> 1: a A 3
#> 2: a B 3
#> 3: a C 3
#> 4: b A 9
#> 5: b B 9
#> 6: b C 6
#> 7: c A 15
#> 8: c B 15
#> 9: c C 9
Created on 2022-02-08 by the reprex package (v2.0.1)

Related

How to create unique data frame rows from inputs of 2 lists

In this example I have a list with 4 values (Lot) and another with 3 (Method).
Lot <- c("A", "B", "C", "D")
Method <- c(1,2,3)
I need to create a data frame with a Lot and Method column where their values are repeated so each row is unique. I need it to look like this:
# Lot Method
# 1 A 1
# 2 A 2
# 3 A 3
# 4 B 1
# 5 B 2
# 6 B 3
# 7 C 1
# 8 C 2
# 9 C 3
# 10 D 1
# 11 D 2
# 12 D 3
How can this be done without creating 2 long repetitive lists like this:
Lot <- c("A","A","A", "B","B","B","C","C","C","D","D","D")
Method <- c(1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3)
Using expand.grid you could do
expand.grid(
Lot = c("A", "B", "C", "D"),
Method = c(1,2,3)
)
#> Lot Method
#> 1 A 1
#> 2 B 1
#> 3 C 1
#> 4 D 1
#> 5 A 2
#> 6 B 2
#> 7 C 2
#> 8 D 2
#> 9 A 3
#> 10 B 3
#> 11 C 3
#> 12 D 3
Or to get the right order we could do (thanks to @onyambu for pointing that out):
rev(expand.grid(
Method = c(1,2,3),
Lot = c("A", "B", "C", "D")
))
#> Lot Method
#> 1 A 1
#> 5 A 2
#> 9 A 3
#> 2 B 1
#> 6 B 2
#> 10 B 3
#> 3 C 1
#> 7 C 2
#> 11 C 3
#> 4 D 1
#> 8 D 2
#> 12 D 3
Or using the tidyverse you could do:
tidyr::expand_grid(
Lot = c("A", "B", "C", "D"),
Method = c(1,2,3)
)
#> # A tibble: 12 × 2
#> Lot Method
#> <chr> <dbl>
#> 1 A 1
#> 2 A 2
#> 3 A 3
#> 4 B 1
#> 5 B 2
#> 6 B 3
#> 7 C 1
#> 8 C 2
#> 9 C 3
#> 10 D 1
#> 11 D 2
#> 12 D 3
With data.table, we can use CJ:
library(data.table)
CJ(Lot = c("A", "B", "C", "D"),
Method = c(1, 2, 3))
Output
Lot Method
1: A 1
2: A 2
3: A 3
4: B 1
5: B 2
6: B 3
7: C 1
8: C 2
9: C 3
10: D 1
11: D 2
12: D 3

Creating new row values based on second column in R

I want to create a new row of type "X" whose cnt is the sum of the cnt values of "B" and "D"
type <- c( "A", "B","C","D","E")
cnt <- c(2,5,3,7,8)
df <- data.frame(type,cnt)
> df
type cnt
1 A 2
2 B 5
3 C 3
4 D 7
5 E 8
The desired output is
> df
type cnt
1 A 2
2 B 5
3 C 3
4 D 7
5 E 8
6 X 12
How could I extend this if we add another grouping variable, such as date?
I would like to add an X row for each day.
date <- c("2022-01-01","2022-01-01","2022-01-01","2022-01-01","2022-01-01","2022-01-02","2022-01-02","2022-01-02","2022-01-02","2022-01-02")
type <- c("A", "B","C","D","E","A", "B","C","D","E")
cnt <- c(2,5,3,7,8, 1,9,8,2,5)
df <- data.frame(date,type,cnt)
df
date type cnt
1 2022-01-01 A 2
2 2022-01-01 B 5
3 2022-01-01 C 3
4 2022-01-01 D 7
5 2022-01-01 E 8
6 2022-01-02 A 1
7 2022-01-02 B 9
8 2022-01-02 C 8
9 2022-01-02 D 2
10 2022-01-02 E 5
Desired output is
df
date type cnt
1 2022-01-01 A 2
2 2022-01-01 B 5
3 2022-01-01 C 3
4 2022-01-01 D 7
5 2022-01-01 E 8
6 2022-01-01 X 12
7 2022-01-02 A 1
8 2022-01-02 B 9
9 2022-01-02 C 8
10 2022-01-02 D 2
11 2022-01-02 E 5
12 2022-01-02 X 11
You could also use:
df %>%
add_row(type= 'X', cnt = sum(.$cnt[.$type %in% c('B', 'D')]))
type cnt
1 A 2
2 B 5
3 C 3
4 D 7
5 E 8
6 X 12
UPDATE:
df %>%
group_by(date)%>%
group_modify(~add_row(.,type = 'X',
cnt = sum(.$cnt[.$type%in%c('B', 'D')])))
# A tibble: 12 x 3
# Groups: date [2]
date type cnt
<chr> <chr> <int>
1 2022-01-01 A 2
2 2022-01-01 B 5
3 2022-01-01 C 3
4 2022-01-01 D 7
5 2022-01-01 E 8
6 2022-01-01 X 12
7 2022-01-02 A 1
8 2022-01-02 B 9
9 2022-01-02 C 8
10 2022-01-02 D 2
11 2022-01-02 E 5
12 2022-01-02 X 11
We can subset and rbind
rbind(df, data.frame(type = "X", cnt = sum(df$cnt[df$type %in% c("B", "D")])))
-output
type cnt
1 A 2
2 B 5
3 C 3
4 D 7
5 E 8
6 X 12
Or in dplyr, filter the rows based on the 'type' values, summarise by taking the sum of 'cnt', while creating 'type' as 'X' and use bind_rows with original dataset
library(dplyr)
df %>%
filter(type %in% c("B", "D")) %>%
summarise(type = 'X', cnt = sum(cnt)) %>%
bind_rows(df, .)
Or without using bind_rows
df %>%
summarise(type = c(type, 'X'), cnt = c(cnt, sum(cnt[type %in% c("B", "D")])))
type cnt
1 A 2
2 B 5
3 C 3
4 D 7
5 E 8
6 X 12
Or using complete
library(tidyr)
complete(df, type = c(type, "X"), fill = list(cnt = sum(cnt[type %in% c("B", "D")])))
# A tibble: 6 × 2
type cnt
<chr> <dbl>
1 A 2
2 B 5
3 C 3
4 D 7
5 E 8
6 X 12
Update
For the updated data, just add a group_by
df %>%
group_by(date) %>%
summarise(type = c(type, "X"),
cnt = c(cnt, sum(cnt[type %in% c("B", "D")])), .groups = 'drop')
-output
# A tibble: 12 × 3
date type cnt
<chr> <chr> <dbl>
1 2022-01-01 A 2
2 2022-01-01 B 5
3 2022-01-01 C 3
4 2022-01-01 D 7
5 2022-01-01 E 8
6 2022-01-01 X 12
7 2022-01-02 A 1
8 2022-01-02 B 9
9 2022-01-02 C 8
10 2022-01-02 D 2
11 2022-01-02 E 5
12 2022-01-02 X 11
Or using the filter approach
df %>%
filter(type %in% c("B", "D")) %>%
group_by(date) %>%
summarise(type = 'X', cnt = sum(cnt), .groups = 'drop') %>%
bind_rows(df, .) %>%
arrange(date)
Another possible solution, in base R:
# Append an "X" row whose cnt is the sum of the "B" and "D" rows.
# The new row is built as a data frame so cnt stays numeric: combining "X"
# and the sum inside c() would coerce the sum to character and rbind() would
# then turn the whole cnt column into character. The columns are also taken
# from df itself rather than from same-named vectors in the workspace.
rbind(
  df,
  data.frame(
    type = "X",
    cnt = sum(ifelse(df$type %in% c("B", "D"), df$cnt, 0))
  )
)
#> type cnt
#> 1 A 2
#> 2 B 5
#> 3 C 3
#> 4 D 7
#> 5 E 8
#> 6 X 12
With dplyr:
# Append an "X" row holding the summed cnt of types "B" and "D".
# type and cnt are read from df explicitly, so the result does not depend on
# separate `type` / `cnt` vectors still existing (and matching df) in the
# workspace.
bind_rows(
  df,
  list(type = "X", cnt = sum(if_else(df$type %in% c("B", "D"), df$cnt, 0)))
)
Here is an alternative dplyr in combination with janitor package:
df %>%
filter(type == "B" |type == "D") %>%
adorn_totals(name="X") %>%
filter(type == "X") %>%
bind_rows(df) %>%
arrange(cnt)
type cnt
A 2
C 3
B 5
D 7
E 8
X 12

Group by cumulative sums with conditions

In this dataframe:
df <- data.frame(
ID = c("C", "B", "B", "B", NA, "C", "A", NA, "B", "B", "B")
)
I'd like to group the rows using cumsum with two conditions: (i) cumsum should not continue if is.na(ID) and (ii) it should not continue if the next ID value is the same as the prior. I do meet condition (i) with this:
df %>%
group_by(grp = cumsum(!is.na(ID)))
# A tibble: 11 x 2
# Groups: grp [9]
ID grp
<chr> <int>
1 C 1
2 B 2
3 B 3
4 B 4
5 NA 4
6 C 5
7 A 6
8 NA 6
9 B 7
10 B 8
11 B 9
but I don't know how to implement condition (ii) too, to obtain the desired result:
1 C 1
2 B 2
3 B 2
4 B 2
5 NA 2
6 C 3
7 A 4
8 NA 4
9 B 5
10 B 5
11 B 5
I tried it with this, but it doesn't work:
df %>%
group_by(grp = cumsum(!is.na(ID) |!lag(ID,1) == ID))
Use na.locf0 from zoo to fill in the NAs and then apply rleid from data.table:
library(data.table)
library(zoo)
rleid(na.locf0(df$ID))
## [1] 1 2 2 2 2 3 4 4 5 5 5
Using tidyr and dplyr, you could do:
# Carry the last non-missing ID forward, then start a new group whenever the
# filled value differs from the previous row.
# pull(ID) names the column explicitly (a bare pull() takes whichever column
# is last), and + 1L makes the group ids start at 1 as in the desired result.
df %>%
  mutate(
    grp = fill(., ID) %>% pull(ID),
    grp = cumsum(grp != lag(grp, default = first(grp))) + 1L
  )
#      ID grp
# 1     C   1
# 2     B   2
# 3     B   2
# 4     B   2
# 5  <NA>   2
# 6     C   3
# 7     A   4
# 8  <NA>   4
# 9     B   5
# 10    B   5
# 11    B   5
Using rle
library(zoo)
with(rle(na.locf0(df$ID)), rep(seq_along(values), lengths))
#[1] 1 2 2 2 2 3 4 4 5 5 5

Merging datasets by id and maintain one row for each id

I am attempting to merge 2 datasets belonging to a single id with a larger dataset.
However, I am having trouble merging the two single row datasets into a single row within the larger dataset.
Is there a simple way to merge with dplyr and only overwrite values if they are NA's?
My data:
df1 <- data.frame(id=1:5, b=6:10, c=c("a", "b", "c", "d", "e"), d=c(NA, 1,2,3, 4))
df2 <- data.frame(id=6, b=2, c="f", d=NA_real_)
df3 <- data.frame(id=6, b=NA_real_, c=NA_character_, d=5, e="a")
> df1
id b c d
1 1 6 a NA
2 2 7 b 1
3 3 8 c 2
4 4 9 d 3
5 5 10 e 4
> df2
id b c d
1 6 2 f NA
> df3
id b c d e
1 6 NA <NA> 5 a
My attempt:
merge1 <- dplyr::full_join(df1, df2) %>% full_join(df3)
Desired output:
output <- data.frame(id=1:6, b=c(6:10,2), c=c("a", "b", "c", "d", "e", "f"), d=c(NA, 1,2,3, 4, 5), e=c(NA,NA, NA, NA, NA, "a"))
> output
id b c d e
1 1 6 a NA <NA>
2 2 7 b 1 <NA>
3 3 8 c 2 <NA>
4 4 9 d 3 <NA>
5 5 10 e 4 <NA>
6 6 2 f 5 a
As opposed to:
id b c d e
1 1 6 a NA <NA>
2 2 7 b 1 <NA>
3 3 8 c 2 <NA>
4 4 9 d 3 <NA>
5 5 10 e 4 <NA>
6 6 2 f NA <NA>
7 6 NA <NA> 5 a
Thank you
You can try:
# Stack the three data frames, then collapse to one row per id by taking, for
# every column, the first non-missing value found among that id's rows.
# across(everything(), ...) replaces the superseded summarise_all().
bind_rows(df1, df2, df3) %>%
  group_by(id) %>%
  summarise(across(everything(), ~ first(na.omit(.x))))
id b c d e
<dbl> <dbl> <chr> <dbl> <fct>
1 1 6 a NA <NA>
2 2 7 b 1 <NA>
3 3 8 c 2 <NA>
4 4 9 d 3 <NA>
5 5 10 e 4 <NA>
6 6 2 f 5 a
you can try
library(tidyverse)
# Stack df1 and df2 (factor columns converted to character before binding),
# join df3's d and e columns by id, and fill missing d values from df3:
# coalesce() keeps d.x unless it is NA, in which case d.y is used.
# across(where(is.factor), ...) replaces the superseded mutate_if().
df1 %>%
  mutate(across(where(is.factor), as.character)) %>%
  bind_rows(mutate(df2, across(where(is.factor), as.character))) %>%
  left_join(select(df3, id, d, e), by = "id") %>%
  mutate(d = coalesce(d.x, d.y)) %>%
  select(-d.x, -d.y)

Filtering the column and creating a new column by grouping the data and selecting a min of a column

Business understanding: Filter the data using
Dataset_z$DayCount <= Dataset_z$t & Dataset_z$DayCount >= Dataset_z$Total.LT
then create a new column in Dataset_z by following condition
group_by(Dataset_z$Source,Dataset_z$Plant, Dataset_z$Material) %>%
mutate(Dataset_z$A = min(NET.INV))
please help me
I created some sample data by doing:
library(dplyr)
# Build a 10-row sample data set: three integer columns drawn from 1:6 or
# 1:10, an inventory column drawn from 1:20, and three character key columns
# drawn from "a".."e". All draws are made with replacement.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned.
Dataset_z <- data.frame(
  Total.LT = sample(1:6, 10, replace = TRUE),
  DayCount = sample(1:10, 10, replace = TRUE),
  t = sample(1:6, 10, replace = TRUE),
  NET.INV = sample(1:20, 10, replace = TRUE),
  Source = sample(c("a", "b", "c", "d", "e"), 10, replace = TRUE),
  Plant = sample(c("a", "b", "c", "d", "e"), 10, replace = TRUE),
  Material = sample(c("a", "b", "c", "d", "e"), 10, replace = TRUE),
  stringsAsFactors = FALSE
)
Dataset_z looks like:
Total.LT DayCount t NET.INV Source Plant Material
1 5 6 6 20 a c b
2 3 2 6 9 b c b
3 4 4 2 5 e e d
4 3 3 2 12 a c a
5 1 8 2 9 c d a
6 2 2 2 16 c b d
7 4 8 1 15 c b a
8 3 5 6 6 d a e
9 5 3 6 11 c b a
10 4 5 5 1 c e c
When filtering and grouping you do not need the $ operator, because dplyr verbs look up bare column names inside the data frame that is passed to them (here via the %>% pipe).
# Keep the rows whose DayCount lies between Total.LT and t (inclusive), then
# add column A holding the smallest NET.INV within each
# Source / Plant / Material combination. The grouping is removed at the end so
# later steps do not silently run per group.
Dataset_z %>%
  filter(DayCount >= Total.LT & DayCount <= t) %>%
  group_by(Source, Plant, Material) %>%
  mutate(A = min(NET.INV)) %>%
  ungroup()
This results in:
Total.LT DayCount t NET.INV Source Plant Material A
<int> <int> <int> <int> <chr> <chr> <chr> <dbl>
1 5 6 6 20 a c b 20
2 2 2 2 16 c b d 16
3 3 5 6 6 d a e 6
4 4 5 5 1 c e c 1

Resources