Creating new row values based on a second column in R

I want to create a new row called "X" which holds the sum of the "B" and "D" counts:
type <- c( "A", "B","C","D","E")
cnt <- c(2,5,3,7,8)
df <- data.frame(type,cnt)
> df
type cnt
1 A 2
2 B 5
3 C 3
4 D 7
5 E 8
The desired output is
> df
type cnt
1 A 2
2 B 5
3 C 3
4 D 7
5 E 8
6 X 12
How could I extend this if we add another grouping variable like date? I would like to add up X for each day:
date <- c("2022-01-01","2022-01-01","2022-01-01","2022-01-01","2022-01-01","2022-01-02","2022-01-02","2022-01-02","2022-01-02","2022-01-02")
type <- c("A", "B","C","D","E","A", "B","C","D","E")
cnt <- c(2,5,3,7,8, 1,9,8,2,5)
df <- data.frame(date,type,cnt)
df
date type cnt
1 2022-01-01 A 2
2 2022-01-01 B 5
3 2022-01-01 C 3
4 2022-01-01 D 7
5 2022-01-01 E 8
6 2022-01-02 A 1
7 2022-01-02 B 9
8 2022-01-02 C 8
9 2022-01-02 D 2
10 2022-01-02 E 5
Desired output is
df
date type cnt
1 2022-01-01 A 2
2 2022-01-01 B 5
3 2022-01-01 C 3
4 2022-01-01 D 7
5 2022-01-01 E 8
6 2022-01-01 X 12
7 2022-01-02 A 1
8 2022-01-02 B 9
9 2022-01-02 C 8
10 2022-01-02 D 2
11 2022-01-02 E 5
12 2022-01-02 X 11

You could also use add_row() from tibble:
library(dplyr)
library(tibble)
df %>%
  add_row(type = 'X', cnt = sum(.$cnt[.$type %in% c('B', 'D')]))
type cnt
1 A 2
2 B 5
3 C 3
4 D 7
5 E 8
6 X 12
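Since the native pipe |> has no `.` pronoun, an equivalent small variant computes the sum up front:
bd_sum <- sum(df$cnt[df$type %in% c("B", "D")])  # total of the B and D rows
df |> add_row(type = "X", cnt = bd_sum)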
UPDATE:
df %>%
  group_by(date) %>%
  group_modify(~ add_row(., type = 'X',
                         cnt = sum(.$cnt[.$type %in% c('B', 'D')])))
# A tibble: 12 x 3
# Groups: date [2]
date type cnt
<chr> <chr> <int>
1 2022-01-01 A 2
2 2022-01-01 B 5
3 2022-01-01 C 3
4 2022-01-01 D 7
5 2022-01-01 E 8
6 2022-01-01 X 12
7 2022-01-02 A 1
8 2022-01-02 B 9
9 2022-01-02 C 8
10 2022-01-02 D 2
11 2022-01-02 E 5
12 2022-01-02 X 11

We can subset and rbind
rbind(df, data.frame(type = "X", cnt = sum(df$cnt[df$type %in% c("B", "D")])))
-output
type cnt
1 A 2
2 B 5
3 C 3
4 D 7
5 E 8
6 X 12
Or in dplyr: filter the rows based on the 'type' values, summarise by taking the sum of 'cnt' while creating 'type' as 'X', and use bind_rows with the original dataset:
library(dplyr)
df %>%
  filter(type %in% c("B", "D")) %>%
  summarise(type = 'X', cnt = sum(cnt)) %>%
  bind_rows(df, .)
Or without using bind_rows
df %>%
  summarise(type = c(type, 'X'),
            cnt = c(cnt, sum(cnt[type %in% c("B", "D")])))
type cnt
1 A 2
2 B 5
3 C 3
4 D 7
5 E 8
6 X 12
Or using complete
library(tidyr)
complete(df, type = c(type, "X"),
         fill = list(cnt = sum(cnt[type %in% c("B", "D")])))
# A tibble: 6 × 2
type cnt
<chr> <dbl>
1 A 2
2 B 5
3 C 3
4 D 7
5 E 8
6 X 12
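Note that complete evaluates the fill value only once, so it does not adapt per group on its own. For the date-grouped version of the data, one option (a sketch, assuming dplyr and tidyr are loaded) is to wrap complete in group_modify:
df %>%
  group_by(date) %>%
  group_modify(~ complete(.x, type = c(.x$type, "X"),
                          fill = list(cnt = sum(.x$cnt[.x$type %in% c("B", "D")])))) %>%
  ungroup()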
Update
For the updated data, just add a group_by
df %>%
  group_by(date) %>%
  summarise(type = c(type, "X"),
            cnt = c(cnt, sum(cnt[type %in% c("B", "D")])), .groups = 'drop')
-output
# A tibble: 12 × 3
date type cnt
<chr> <chr> <dbl>
1 2022-01-01 A 2
2 2022-01-01 B 5
3 2022-01-01 C 3
4 2022-01-01 D 7
5 2022-01-01 E 8
6 2022-01-01 X 12
7 2022-01-02 A 1
8 2022-01-02 B 9
9 2022-01-02 C 8
10 2022-01-02 D 2
11 2022-01-02 E 5
12 2022-01-02 X 11
Or using the filter approach
df %>%
  filter(type %in% c("B", "D")) %>%
  group_by(date) %>%
  summarise(type = 'X', cnt = sum(cnt), .groups = 'drop') %>%
  bind_rows(df, .) %>%
  arrange(date)

Another possible solution, in base R (referencing the columns of df directly, and wrapping the new row in data.frame() so cnt stays numeric):
rbind(df, data.frame(type = "X", cnt = sum(ifelse(df$type %in% c("B", "D"), df$cnt, 0))))
#> type cnt
#> 1 A 2
#> 2 B 5
#> 3 C 3
#> 4 D 7
#> 5 E 8
#> 6 X 12
With dplyr:
library(dplyr)
bind_rows(df, list(type = "X", cnt = sum(if_else(df$type %in% c("B", "D"), df$cnt, 0))))

Here is an alternative using dplyr in combination with the janitor package:
library(dplyr)
library(janitor)
df %>%
  filter(type == "B" | type == "D") %>%
  adorn_totals(name = "X") %>%
  filter(type == "X") %>%
  bind_rows(df) %>%
  arrange(cnt)
type cnt
A 2
C 3
B 5
D 7
E 8
X 12
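Note that the closing arrange(cnt) puts X last only because its total happens to be the largest count; since bind_rows already controls row order, a variant that appends the total row after the original rows needs no sort:
df %>%
  filter(type %in% c("B", "D")) %>%
  adorn_totals(name = "X") %>%  # adds an "X" row holding the column total
  filter(type == "X") %>%
  bind_rows(df, .)              # original rows first, then the X row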

Related

Get mean column values every n rows grouped by value in other column

I have a dataframe df that looks like this
time object
1 1 A
2 2 A
3 3 A
4 4 A
5 5 A
6 6 A
7 7 B
8 8 B
9 9 B
10 10 B
11 11 B
12 12 C
13 13 C
14 14 C
15 15 C
16 16 C
17 17 C
18 18 C
I would like to get the mean of the time column every 3 rows, based on the object column:
df_mean
time object
1 2 A
2 5 A
3 8 B
4 13 C
5 16 C
I thought about using dplyr:
df %>%
  mutate(grp = 1 + (row_number() - 1) %/% 3) %>%
  group_by(grp) %>%
  summarise(across(c("time"), mean, na.rm = TRUE)) %>%
  select(-grp)
but I do not know how to integrate the control for the object.
Another option would be to use aggregate
aggregate(.~object, data=df, mean)
but in this case I do not know how to get the mean every 3 rows.
Your dplyr attempt is on the right track; with a few modifications it will work. (Note that the df rebuilt below uses six rows per object, which differs slightly from your example data, so the B and C group means differ from your desired output.)
library(dplyr)
df <- tibble(time = 1:18, object = rep(c('A', 'B', 'C'), each = 6))
df %>%
  group_by(object, grp = (row_number() - 1) %/% 3) %>%
  summarise(across(time, mean, na.rm = T), .groups = 'drop') %>%
  select(-grp)
#> # A tibble: 6 × 2
#> object time
#> <chr> <dbl>
#> 1 A 2
#> 2 A 5
#> 3 B 8
#> 4 B 11
#> 5 C 14
#> 6 C 17
Here is an alternative dplyr way:
library(dplyr)
n <- 3
df %>%
  group_by(object, Col2 = rep(row_number(), each = n, length.out = n())) %>%
  summarise(time = mean(time, na.rm = TRUE)) %>%
  select(-Col2)
object time
<chr> <dbl>
1 A 2
2 A 5
3 B 8
4 B 11
5 C 14
6 C 17
You can make a slight modification: create your grp variable within each object group first, then filter to joint groups of size >= 3, and then summarize:
df %>%
  group_by(object) %>%
  mutate(grp = 1 + (row_number() - 1) %/% 3) %>%
  group_by(object, grp) %>%
  filter(n() >= 3) %>%
  summarize(time = mean(time), .groups = "drop") %>%
  select(-grp)
Output:
object time
<chr> <dbl>
1 A 2
2 A 5
3 B 8
4 C 13
5 C 16
data.table option:
library(data.table)
setDT(df)
df[, n3 := gl(.N, 3, length = .N), by = object]
df[, .(time = mean(time)), by = .(object, n3)][, !"n3"]
Output:
object time
1: A 2
2: A 5
3: B 8
4: B 11
5: C 14
6: C 17
In base R:
aggregate(time ~ ., cbind(df, gr = gl(nrow(df), 3, nrow(df))), mean)
object gr time
1 A 1 2
2 A 2 5
3 B 3 8
4 B 4 11
5 C 5 14
6 C 6 17
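The gl() call does the block indexing here: gl(n, k, length) builds a factor of n levels, each repeated k times, truncated to length. A quick illustration with the 18-row data:
as.integer(gl(18, 3, 18))
#> [1] 1 1 1 2 2 2 3 3 3 4 4 4 5 5 5 6 6 6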

summing up values in R dataframe without aggregation

How can I add up row values in a dataframe based on conditions without having to aggregate the whole table?
I have this df:
town party votes
1 a A 1
2 a B 2
3 a C 3
4 b A 4
5 b B 5
6 b C 6
7 c A 7
8 c B 8
9 c C 9
I would like to add the votes of one party to those of another by town, without touching the values of the third one.
Basically I want to run df$votes[df$party == "A"] <- df$votes[df$party == "A"] + df$votes[df$party == "B"] for each category of df$town.
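Written out as a loop, what I'm after is roughly this sketch:
for (tw in unique(df$town)) {
  a <- df$town == tw & df$party == "A"
  b <- df$town == tw & df$party == "B"
  df$votes[a] <- df$votes[a] + df$votes[b]  # add B's votes onto A's, per town
}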
I'm interpreting your pseudo-code as wanting to only update party "A" to the sum of both "A" and "B"'s votes.
base R
do.call(rbind, by(df, df$town,
  function(Z) {
    ind <- Z$party %in% c("A", "B")
    Z$votes[Z$party == "A"] <- sum(Z$votes[ind])
    Z
  }
))
# town party votes
# a.1 a A 3
# a.2 a B 2
# a.3 a C 3
# b.4 b A 9
# b.5 b B 5
# b.6 b C 6
# c.7 c A 15
# c.8 c B 8
# c.9 c C 9
dplyr
library(dplyr)
df %>%
  group_by(town) %>%
  mutate(
    votes = if_else(party == "A", sum(votes[party %in% c("A", "B")]), votes)
  ) %>%
  ungroup()
# # A tibble: 9 x 3
# town party votes
# <chr> <chr> <int>
# 1 a A 3
# 2 a B 2
# 3 a C 3
# 4 b A 9
# 5 b B 5
# 6 b C 6
# 7 c A 15
# 8 c B 8
# 9 c C 9
data.table
library(data.table)
DT <- as.data.table(df) # normally setDT(df) is canonical
DT[, votes := fifelse(party == "A", sum(votes[party %in% c("A", "B")]), votes),
   by = .(town)]
# town party votes
# <char> <char> <int>
# 1: a A 3
# 2: a B 2
# 3: a C 3
# 4: b A 9
# 5: b B 5
# 6: b C 6
# 7: c A 15
# 8: c B 8
# 9: c C 9
You can try mutate with dplyr if you want to keep the structure of the dataframe
library(dplyr)
df %>%
  group_by(town) %>%
  mutate(sum = ifelse(party != "C", sum(votes[party != "C"]), votes)) %>%
  ungroup()
# A tibble: 9 × 4
town party votes sum
<chr> <chr> <int> <int>
1 a A 1 3
2 a B 2 3
3 a C 3 3
4 b A 4 9
5 b B 5 9
6 b C 6 6
7 c A 7 15
8 c B 8 15
9 c C 9 9
Another way using summarise
df %>%
  filter(party != "C") %>%
  group_by(town) %>%
  summarise(sum = sum(votes))
# A tibble: 3 × 2
town sum
<chr> <int>
1 a 3
2 b 9
3 c 15
tidyverse
df <- data.frame(
stringsAsFactors = FALSE,
town = c("a", "a", "a", "b", "b", "b", "c", "c", "c"),
party = c("A", "B", "C", "A", "B", "C", "A", "B", "C"),
votes = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L)
)
library(tidyverse)
df %>%
  group_by(town, grp_party = party %in% c("A", "B")) %>%
  mutate(new_party = paste0(party, collapse = ""), new_votes = sum(votes)) %>%
  ungroup() %>%
  select(-grp_party)
#> # A tibble: 9 x 5
#> town party votes new_party new_votes
#> <chr> <chr> <int> <chr> <int>
#> 1 a A 1 AB 3
#> 2 a B 2 AB 3
#> 3 a C 3 C 3
#> 4 b A 4 AB 9
#> 5 b B 5 AB 9
#> 6 b C 6 C 6
#> 7 c A 7 AB 15
#> 8 c B 8 AB 15
#> 9 c C 9 C 9
Created on 2022-02-08 by the reprex package (v2.0.1)
data.table
library(data.table)
setDT(df)[, votes := lapply(.SD, sum), by = list(town, party %in% c("A", "B"))][]
#> town party votes
#> 1: a A 3
#> 2: a B 3
#> 3: a C 3
#> 4: b A 9
#> 5: b B 9
#> 6: b C 6
#> 7: c A 15
#> 8: c B 15
#> 9: c C 9
Created on 2022-02-08 by the reprex package (v2.0.1)
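Note that columns used in by are excluded from .SD, so lapply(.SD, sum) only touches votes here; a plainer spelling of the same operation would be:
setDT(df)[, votes := sum(votes), by = .(town, party %in% c("A", "B"))][]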

grouping to aggregate values, but tripping up on NA's

I have long data, and I am trying to make a new variable (consistent) that holds, for each person (ID), the value of a given column (VALUE) at TIME = 2. I used the code below to do this, but I am getting tripped up on NAs: if the VALUE at TIME = 2 is NA, I want to grab the VALUE at TIME = 1 instead. That part I'm not sure how to do. So, in the example below, the new variable (consistent) should be 10 instead of NA.
ID = c("A", "A", "B", "B", "C", "C", "D", "D")
TIME = c(1, 2, 1, 2, 1, 2, 1, 2)
VALUE = c(8, 9, 10, NA, 12, 13, 14, 9)
df = data.frame(ID, TIME, VALUE)
df <- df %>%
  group_by(ID) %>%
  mutate(consistent = VALUE[TIME == 2]) %>%
  ungroup
df
If we want to use the same code, then coalesce with the 'VALUE' where 'TIME' is 1 (assuming there is a single observation of 'TIME' for each 'ID')
library(dplyr)
library(dplyr)
df %>%
  group_by(ID) %>%
  mutate(consistent = coalesce(VALUE[TIME == 2], VALUE[TIME == 1])) %>%
  ungroup
-output
# A tibble: 8 × 4
ID TIME VALUE consistent
<chr> <dbl> <dbl> <dbl>
1 A 1 8 9
2 A 2 9 9
3 B 1 10 10
4 B 2 NA 10
5 C 1 12 13
6 C 2 13 13
7 D 1 14 9
8 D 2 9 9
Or another option is to arrange before doing the group_by and get the first element of 'VALUE' (assuming 'TIME' is not replicated):
df %>%
  arrange(ID, is.na(VALUE), desc(TIME)) %>%
  group_by(ID) %>%
  mutate(consistent = first(VALUE)) %>%
  ungroup
-output
# A tibble: 8 × 4
ID TIME VALUE consistent
<chr> <dbl> <dbl> <dbl>
1 A 2 9 9
2 A 1 8 9
3 B 1 10 10
4 B 2 NA 10
5 C 2 13 13
6 C 1 12 13
7 D 2 9 9
8 D 1 14 9
Another possible solution, using tidyr::fill:
library(tidyverse)
df %>%
  group_by(ID) %>%
  mutate(consistent = VALUE) %>%
  fill(consistent) %>%
  ungroup
#> # A tibble: 8 × 4
#> ID TIME VALUE consistent
#> <chr> <dbl> <dbl> <dbl>
#> 1 A 1 8 8
#> 2 A 2 9 9
#> 3 B 1 10 10
#> 4 B 2 NA 10
#> 5 C 1 12 12
#> 6 C 2 13 13
#> 7 D 1 14 14
#> 8 D 2 9 9
You can also use ifelse with your condition. The lagged TIME is guaranteed to be 1 here if each group has exactly two members, with TIME 1 and 2.
df %>%
  group_by(ID) %>%
  arrange(TIME, .by_group = TRUE) %>%
  mutate(consistent = ifelse(is.na(VALUE) & TIME == 2, lag(VALUE), VALUE)) %>%
  ungroup()
# A tibble: 8 × 4
ID TIME VALUE consistent
<chr> <dbl> <dbl> <dbl>
1 A 1 8 8
2 A 2 9 9
3 B 1 10 10
4 B 2 NA 10
5 C 1 12 12
6 C 2 13 13
7 D 1 14 14
8 D 2 9 9
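For completeness, here is a data.table sketch of the same coalesce idea (assuming the df defined above):
library(data.table)
# fcoalesce takes the TIME == 2 value, falling back to the TIME == 1 value if NA
setDT(df)[, consistent := fcoalesce(VALUE[TIME == 2], VALUE[TIME == 1]), by = ID][]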

Group by cumulative sums with conditions

In this dataframe:
df <- data.frame(
ID = c("C", "B", "B", "B", NA, "C", "A", NA, "B", "B", "B")
)
I'd like to group the rows using cumsum with two conditions: (i) the cumulative sum should not advance when is.na(ID), and (ii) it should not advance when the ID value is the same as the prior one. I meet condition (i) with this:
df %>%
  group_by(grp = cumsum(!is.na(ID)))
# A tibble: 11 x 2
# Groups: grp [9]
ID grp
<chr> <int>
1 C 1
2 B 2
3 B 3
4 B 4
5 NA 4
6 C 5
7 A 6
8 NA 6
9 B 7
10 B 8
11 B 9
but I don't know how to implement condition (ii) too, to obtain the desired result:
1 C 1
2 B 2
3 B 2
4 B 2
5 NA 2
6 C 3
7 A 4
8 NA 4
9 B 5
10 B 5
11 B 5
I tried this, but it doesn't work:
df %>%
  group_by(grp = cumsum(!is.na(ID) | !lag(ID, 1) == ID))
Use na.locf0 from zoo to fill in the NAs and then apply rleid from data.table:
library(data.table)
library(zoo)
rleid(na.locf0(df$ID))
## [1] 1 2 2 2 2 3 4 4 5 5 5
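To see the intermediate step: na.locf0 carries the last non-NA ID forward, and rleid then numbers each run of equal values:
na.locf0(df$ID)
## [1] "C" "B" "B" "B" "B" "C" "A" "A" "B" "B" "B"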
Using tidyr and dplyr, you could do:
df %>%
  mutate(grp = fill(., ID) %>% pull(),
         grp = cumsum(grp != lag(grp, default = first(grp))))
ID grp
1 C 0
2 B 1
3 B 1
4 B 1
5 <NA> 1
6 C 2
7 A 3
8 <NA> 3
9 B 4
10 B 4
11 B 4
Using rle
library(zoo)
with(rle(na.locf0(df$ID)), rep(seq_along(values), lengths))
#[1] 1 2 2 2 2 3 4 4 5 5 5
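The same idea works in base R without zoo, filling the NAs manually before the run-length pass (a sketch; assumes the first ID is not NA):
id <- df$ID
for (i in seq_along(id)[-1]) if (is.na(id[i])) id[i] <- id[i - 1]  # last obs. carried forward
cumsum(c(TRUE, id[-1] != id[-length(id)]))  # new group whenever the value changes
#[1] 1 2 2 2 2 3 4 4 5 5 5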

Using dplyr: Within groups, select the first value meeting a condition

I need assistance obtaining a solution that will scan backwards in time and obtain the first value meeting a condition. I have data similar to:
set.seed(42)
df <- data.frame(
id = sample(LETTERS[1:3], 20, replace = TRUE),
time.var = sample(1:20, 20, replace = TRUE),
x = sample(c(1:10), 20, replace = TRUE)
)
df <- df[order(df$id, df$time.var),]
id time.var x
A 5 2
A 14 8
A 19 7
A 20 1
B 1 1
B 2 5
B 9 10
B 11 10
B 13 6
B 15 4
B 19 3
C 1 7
C 3 5
C 8 9
C 8 4
C 17 7
C 17 4
C 17 8
C 19 4
C 19 10
For the last row of each id group (taking rows in time.var order), I'd like to obtain the first value of x less than 5 when scanning backwards in time.
I have tried:
test <- df %>%
  group_by(id) %>%
  arrange(id, time.var) %>%
  mutate(less.5 = which.max(x[x < 5]))
What strategy can I use to obtain this type of output:
id time.var x previous.less.5
A 5 2
A 14 8
A 19 7
A 20 1 2
B 1 1
B 2 5
B 9 10
B 11 10
B 13 6
B 15 4
B 19 3 4
C 1 7
C 3 5
C 8 9
C 8 4
C 17 7
C 17 4
C 17 8
C 19 4
C 19 10 4
Using library(dplyr):
df %>%
  arrange(id, time.var) %>%
  group_by(id) %>%
  mutate(previous.less.5 = tail(c(x[c((x[-n()] < 5), FALSE)]), 1)) %>%
  group_by(id) %>%
  mutate(previous.less.5 = if_else(row_number() == n(), previous.less.5, NULL))
or
df %>%
  arrange(id, time.var) %>%
  group_by(id) %>%
  slice(1:(n() - 1)) %>%
  filter(x < 5) %>%
  slice(n()) %>%
  select(-time.var) %>%
  right_join(df, ., by = "id", suffix = c("", ".y")) %>%
  group_by(id) %>%
  mutate(previous.less.5 = if_else(row_number() == n(), x.y, NULL)) %>%
  select(-x.y)
giving:
#> # A tibble: 20 x 4
#> # Groups: id [3]
#> id time.var x previous.less.5
#> <fct> <int> <int> <int>
#> 1 A 3 10 NA
#> 2 A 4 8 NA
#> 3 A 4 6 NA
#> 4 A 5 2 NA
#> 5 A 5 8 NA
#> 6 A 5 7 NA
#> 7 A 11 6 NA
#> 8 A 13 3 NA
#> 9 A 15 2 3
#> 10 B 2 1 NA
#> 11 B 4 3 NA
#> 12 B 4 6 NA
#> 13 B 8 5 NA
#> 14 B 8 4 NA
#> 15 B 20 7 4
#> 16 C 1 2 NA
#> 17 C 2 10 NA
#> 18 C 10 6 NA
#> 19 C 13 2 NA
#> 20 C 18 5 2
Update:
If there's a group with no record less than 5 (or whose only record less than 5 is the last one), then the following works:
df %>%
  arrange(id, time.var) %>%
  group_by(id) %>%
  mutate(previous.less.5 = if_else(row_number() == n(),
                                   max(tail(c(x[c(x[-n()] < 5, FALSE)]), 1)),
                                   NULL)) %>%
  mutate(previous.less.5 = replace(previous.less.5, is.infinite(previous.less.5), NA))
Data:
set.seed(42) # I am getting different data than what you've shown with this seed
df <- data.frame(
id = sample(LETTERS[1:3], 20, replace = TRUE),
time.var = sample(1:20, 20, replace = TRUE),
x = sample(c(1:10), 20, replace = TRUE)
)
df <- df[order(df$id, df$time.var),]
We can reverse the values of x within each id and get the first number less than 5 using which. The last replace assigns NA to every value of previous.less.5 except the one on the last row.
library(dplyr)
df %>%
  # Data is already sorted by `id` and `time.var`; if not, use
  # arrange(id, time.var) %>%
  group_by(id) %>%
  mutate(rev_x = c(NA, rev(x)[-1]),
         previous.less.5 = rev_x[which(rev_x < 5)[1]],
         previous.less.5 = replace(previous.less.5, row_number() != n(), NA)) %>%
  select(-rev_x)
# id time.var x previous.less.5
# <fct> <int> <int> <int>
# 1 A 5 2 NA
# 2 A 14 8 NA
# 3 A 19 7 NA
# 4 A 20 1 2
# 5 B 1 1 NA
# 6 B 2 5 NA
# 7 B 9 10 NA
# 8 B 11 10 NA
# 9 B 13 6 NA
#10 B 15 4 NA
#11 B 19 3 4
#12 C 1 7 NA
#13 C 3 5 NA
#14 C 8 9 NA
#15 C 8 4 NA
#16 C 17 7 NA
#17 C 17 4 NA
#18 C 17 8 NA
#19 C 19 4 NA
#20 C 19 10 4
This also handles the case where an id has no value less than 5, returning NA for it.
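For comparison, a base R sketch of the same idea using ave() (assuming df is already ordered by id and time.var, as above):
df$previous.less.5 <- ave(df$x, df$id, FUN = function(x) {
  prev <- rev(x[-length(x)])        # earlier values, most recent first
  hit <- prev[prev < 5][1]          # first one below 5 (NA if none)
  c(rep(NA, length(x) - 1), hit)    # only the last row gets the value
})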
