rowSums After Slicing in R - r

I have a data frame like this.
First I arrange it and then slice it by DPP.
Like this:
But after arranging and slicing, I can't compute the row sums:
# Example data: nine rows with an ID, a block code, a DPP value, and the
# integer columns a, a_1 ... a_4 (each holding 1:9).
card_201406 <- data.frame(ID = c(123, 234, 344, 456, 678, 124, 567, 256, 345),
Block_Code = c("D", "U","Z", "G","T","R","A","U", "B"),
DPP = c(1,2,2,3,3,3,4,5,1),
a = 1:9, a_1 = 1:9, a_2 = 1:9, a_3 = 1:9, a_4 = 1:9)
# Sort by DPP, then keep the first row of each DPP group. The result is still
# grouped by DPP.
card_201406 <- card_201406 %>% arrange(DPP) %>% group_by(DPP) %>% slice(1)
# Attempt to add the row-wise sum of the "a_" columns. This is the step that
# fails with "`SUM_a` must be size 1, not 5".
card_201406 <- card_201406 %>%
mutate(SUM_a = rowSums(do.call(cbind, select(.,starts_with("a_")))))
RESULT:
Adding missing grouping variables: `DPP`
Error in `mutate()`:
! Problem while computing `SUM_a = rowSums(do.call(cbind, select(.,
starts_with("a_"))))`.
x `SUM_a` must be size 1, not 5.
i The error occurred in group 1: DPP = 1.
Run `rlang::last_error()` to see where the error occurred.
I just want the sum per row.
Thanks for helping

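The likely cause is that `.` inside the grouped `mutate()` refers to the whole piped data frame (all 5 rows) rather than the current one-row group, so `rowSums()` returns 5 values where 1 is expected. You can avoid this, and simplify, by using across: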
# Attach dplyr without printing the masking notes. The argument is spelled out
# in full rather than relying on partial matching of `warn`.
library(dplyr, warn.conflicts = FALSE)

# Sort by DPP, keep the first row of each DPP group, then add the row-wise
# sum of every column whose name starts with "a_".
card_201406 %>%
  arrange(DPP) %>%
  group_by(DPP) %>%
  slice(1) %>%
  mutate(SUM_a = rowSums(across(starts_with("a_"))))
#> # A tibble: 5 × 9
#> # Groups: DPP [5]
#> ID Block_Code DPP a a_1 a_2 a_3 a_4 SUM_a
#> <dbl> <chr> <dbl> <int> <int> <int> <int> <int> <dbl>
#> 1 123 D 1 1 1 1 1 1 4
#> 2 234 U 2 2 2 2 2 2 8
#> 3 456 G 3 4 4 4 4 4 16
#> 4 567 A 4 7 7 7 7 7 28
#> 5 256 U 5 8 8 8 8 8 32

One way to solve your problem:
library(dplyr)
# Sort by DPP, keep the first row of each DPP value, then add the row-wise
# sum of the columns whose names start with "a_".
card_201406 %>%
  arrange(DPP) %>%
  filter(!duplicated(DPP)) %>% # keep first row of each group
  # startsWith() matches the prefix only; an unanchored grep("a_", ...) would
  # also select any column that merely contains "a_" (e.g. "data_x").
  mutate(SUM_a = rowSums(.[startsWith(names(.), "a_")]))
ID Block_Code DPP a a_1 a_2 a_3 a_4 SUM_a
1 123 D 1 1 1 1 1 1 4
2 234 U 2 2 2 2 2 2 8
3 456 G 3 4 4 4 4 4 16
4 567 A 4 7 7 7 7 7 28
5 256 U 5 8 8 8 8 8 32

Related

How to count distinct non-missing rows in R using dplyr

library(dplyr)
# Example data. `id` is character because it mixes numeric-looking ids with
# the placeholder "none"; `id2` is the numeric grouping key.
mydat <- data.frame(
  id = c("123", "111", "234", "none", "123", "384", "none"),
  id2 = c(1, 1, 1, 2, 2, 3, 4)
)
> mydat
id id2
1 123 1
2 111 1
3 234 1
4 none 2
5 123 2
6 384 3
7 none 4
I would like to count the number of unique ids for each id2 in mydat. However, I do not want to count the id "none".
> mydat %>% group_by(id2) %>% summarise(count = n_distinct(id))
# A tibble: 4 × 2
id2 count
<dbl> <int>
1 1 3
2 2 2
3 3 1
4 4 1
Using this mistakenly counts none. The desired output should be
> mydat %>% group_by(id2) %>% summarise(count = n_distinct(id))
# A tibble: 4 × 2
id2 count
<dbl> <int>
1 1 3
2 2 1
3 3 1
4 4 0
# Per id2: `count` is the number of distinct ids including "none";
# `wanted` is the number of distinct ids once "none" entries are dropped.
mydat %>%
  group_by(id2) %>%
  summarise(
    count = n_distinct(id),
    wanted = n_distinct(id[!id %in% "none"])
  )
# # A tibble: 4 × 3
# id2 count wanted
# <dbl> <int> <int>
# 1 1 3 3
# 2 2 2 1
# 3 3 1 1
# 4 4 1 0

grouping to aggregate values, but tripping up on NA's

I have long data, and I am trying to make a new variable (consistent) that is the value of a given column (VALUE), for each person (ID), at TIME = 2. I used the code below to do this, but I am getting tripped up on NAs. If the VALUE for TIME = 2 is NA, then I want it to grab the VALUE at TIME = 1 instead. That part I'm not sure how to do. So, in the example below, I want the new variable (consistent) to be 10 instead of NA for ID B.
# Long-format example: four IDs, each observed at TIME 1 and TIME 2.
# ID "B" has a missing VALUE at TIME 2.
ID <- rep(c("A", "B", "C", "D"), each = 2)
TIME <- rep(c(1, 2), times = 4)
VALUE <- c(8, 9, 10, NA, 12, 13, 14, 9)
df <- data.frame(ID, TIME, VALUE)
# For each ID, copy the VALUE observed at TIME 2 onto every row of that ID.
# No fallback is applied here, so an NA at TIME 2 is propagated as-is.
df <- df %>%
  group_by(ID) %>%
  mutate(
    consistent = VALUE[TIME == 2]
  ) %>%
  ungroup()
df
If we want to use the same code, then coalesce with the 'VALUE' where 'TIME' is 1 (assuming there is a single observation of 'TIME' for each 'ID')
library(dplyr)
# For each ID, take VALUE at TIME 2 and fall back to VALUE at TIME 1 when the
# TIME 2 value is NA.
df %>%
  group_by(ID) %>%
  mutate(
    consistent = coalesce(VALUE[TIME == 2], VALUE[TIME == 1])
  ) %>%
  ungroup()
-output
# A tibble: 8 × 4
ID TIME VALUE consistent
<chr> <dbl> <dbl> <dbl>
1 A 1 8 9
2 A 2 9 9
3 B 1 10 10
4 B 2 NA 10
5 C 1 12 13
6 C 2 13 13
7 D 1 14 9
8 D 2 9 9
Or another option is to arrange before doing the group_by and get the first element of 'VALUE' (assuming no duplicated 'TIME' values within an 'ID')
# Within each ID, order non-missing VALUEs first and later TIMEs first, so the
# leading row of every ID holds the preferred VALUE; then copy that VALUE to
# all rows of the ID.
df %>%
  arrange(ID, is.na(VALUE), desc(TIME)) %>%
  group_by(ID) %>%
  mutate(consistent = VALUE[1]) %>%
  ungroup()
-output
# A tibble: 8 × 4
ID TIME VALUE consistent
<chr> <dbl> <dbl> <dbl>
1 A 2 9 9
2 A 1 8 9
3 B 1 10 10
4 B 2 NA 10
5 C 2 13 13
6 C 1 12 13
7 D 2 9 9
8 D 1 14 9
Another possible solution, using tidyr::fill:
library(tidyverse)
# Copy VALUE into `consistent`, then, within each ID, fill missing entries
# downwards from the previous row.
df %>%
  group_by(ID) %>%
  mutate(consistent = VALUE) %>%
  fill(consistent, .direction = "down") %>%
  ungroup()
#> # A tibble: 8 × 4
#> ID TIME VALUE consistent
#> <chr> <dbl> <dbl> <dbl>
#> 1 A 1 8 8
#> 2 A 2 9 9
#> 3 B 1 10 10
#> 4 B 2 NA 10
#> 5 C 1 12 12
#> 6 C 2 13 13
#> 7 D 1 14 14
#> 8 D 2 9 9
You can also use ifelse with your condition. If every group has exactly two rows, one with TIME 1 and one with TIME 2, then after sorting by TIME the lagged row is guaranteed to be the TIME 1 row.
# Within each ID (sorted by TIME), replace a missing VALUE at TIME 2 with the
# VALUE of the preceding row; every other row keeps its own VALUE.
df %>%
  group_by(ID) %>%
  # Use TRUE rather than T: T is an ordinary variable that can be reassigned.
  arrange(TIME, .by_group = TRUE) %>%
  mutate(consistent = ifelse(is.na(VALUE) & TIME == 2, lag(VALUE), VALUE)) %>%
  ungroup()
# A tibble: 8 × 4
ID TIME VALUE consistent
<chr> <dbl> <dbl> <dbl>
1 A 1 8 8
2 A 2 9 9
3 B 1 10 10
4 B 2 NA 10
5 C 1 12 12
6 C 2 13 13
7 D 1 14 14
8 D 2 9 9

How to replace specific rows with their column sums in R?

I feel like there should be a simpler way of doing this. Here is my sample data.
# Sample data: group1 takes the values 1, 2 and 3 (3 appears four times),
# group2 alternates "A"/"B", and vals holds the numbers to be combined.
df <- tibble(
  group1 = c(1, 1, 2, 2, rep(3, 4)),
  group2 = rep(c("A", "B"), times = 4),
  vals = c(13, 56, 15, 50, 5, 22, 9, 59)
)
df
# A tibble: 8 x 3
group1 group2 vals
<dbl> <chr> <dbl>
1 1 A 13
2 1 B 56
3 2 A 15
4 2 B 50
5 3 A 5
6 3 B 22
7 3 A 9
8 3 B 59
I want to combine the vals where group1 is 3 and replace the old rows with the summed ones. Can anyone come up with a cleaner/tidier solution than this?
# Build summed rows for group1 == 3, append them to the data, then drop the
# original group1 == 3 rows.
df %>%
group_by(group1, group2) %>%
bind_rows(
# Sum `vals` over the group1 == 3 rows and tag the result with summed = "x".
# NOTE(review): `.` here is presumably the grouped data from the pipe, so the
# sums are per (group1, group2) -- confirm.
summarize(
.[.$group1 == 3,],
across(vals, sum),
summed = "x"
)
) %>%
ungroup() %>%
# Keep every row except those with group1 == 3 and no `summed` tag, i.e. drop
# the untagged group1 == 3 rows and keep their summed replacements.
filter(!(group1 == 3 & is.na(summed))) %>%
# The marker column is no longer needed.
select(-summed)
Here is what the result should be:
# A tibble: 6 x 3
group1 group2 vals
<dbl> <chr> <dbl>
1 1 A 13
2 1 B 56
3 2 A 15
4 2 B 50
5 3 A 14
6 3 B 81
This isn't very efficient, but it gives you your intended output.
# Give every group1 == 3 row the shared key 0 and every other row its own row
# number, so that only the group1 == 3 rows collapse when summing by key.
df %>%
  mutate(
    tmp = if_else(group1 == 3, 0L, row_number())
  ) %>%
  group_by(tmp, group1, group2) %>%
  summarise(vals = sum(vals)) %>%
  ungroup() %>%
  select(-tmp)
# # A tibble: 6 x 3
# group1 group2 vals
# <dbl> <chr> <dbl>
# 1 3 A 14
# 2 3 B 81
# 3 1 A 13
# 4 1 B 56
# 5 2 A 15
# 6 2 B 50
Another technique would be to split your data into "3" and "not 3", process the "3" data, then recombine them.
# Split the data into the rows to be summed (group1 == 3) and the rest.
df3 <- df %>% filter(group1 == 3)
dfnot3 <- df %>% filter(group1 != 3)

# Sum `vals` per (group1, group2) within the group1 == 3 rows, then append the
# untouched remaining rows.
df3 %>%
  group_by(group1, group2) %>%
  summarise(vals = sum(vals)) %>%
  ungroup() %>%
  bind_rows(dfnot3)
# # A tibble: 6 x 3
# group1 group2 vals
# <dbl> <chr> <dbl>
# 1 3 A 14
# 2 3 B 81
# 3 1 A 13
# 4 1 B 56
# 5 2 A 15
# 6 2 B 50
(This second one is really only meaningful/efficient if you have lots of non-3 rows.)

Finding rowwise minimum and column index in a tibble

I have the following tibble:
> df <- tibble(
ID = LETTERS[1:4],
a = c(1,5,9,8),
b = c(5,9,8,2),
c = c(5,4,5,5)
)
> df
# A tibble: 4 x 4
ID a b c
<chr> <dbl> <dbl> <dbl>
1 A 1 5 5
2 B 5 9 4
3 C 9 8 5
4 D 8 2 5
>
What I want is to get the rowwise minimum of columns a:c and also the column index from this minimum.
The output table should look like this:
# A tibble: 4 x 6
ID a b c Min Col_Index
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 1 5 5 1 1
2 B 5 9 4 4 3
3 C 9 8 5 5 3
4 D 8 2 5 2 2
I don't want to use rowwise()!
Thank you!
You could use pmin with do.call to get rowwise minimum and negate the values to use with max.col to get the column index of minimum.
library(dplyr)
library(purrr)
# Row-wise minimum of columns a:c and the position of that minimum.
df %>%
  mutate(
    # pmin() is applied across the selected columns, giving one minimum per row.
    Min = do.call(pmin, select(., a:c)),
    # Negating the values turns "position of the minimum" into "position of
    # the maximum". ties.method = "first" makes tied minima resolve to the
    # leftmost column; the default ("random") picks among ties at random.
    Col_Index = max.col(-select(., a:c), ties.method = "first")
  )
# ID a b c Min Col_Index
# <chr> <dbl> <dbl> <dbl> <dbl> <int>
#1 A 1 5 5 1 1
#2 B 5 9 4 4 3
#3 C 9 8 5 5 3
#4 D 8 2 5 2 2
Using purrr's pmap_dbl :
# For every row, collect the values of a:c into one vector and take its
# minimum and the position of that minimum.
df %>%
  mutate(
    Min = pmap_dbl(select(., a:c), function(...) min(c(...))),
    Col_Index = pmap_dbl(select(., a:c), function(...) which.min(c(...)))
  )
One option could be:
# Treat each row as its own group, then compute the minimum of a:c and the
# position of that minimum within a:c.
df %>%
  rowwise() %>%
  mutate(
    min = min(c_across(a:c)),
    min_index = which.min(c_across(a:c))
  )
ID a b c min min_index
<chr> <dbl> <dbl> <dbl> <dbl> <int>
1 A 1 5 5 1 1
2 B 5 9 4 4 3
3 C 9 8 5 5 3
4 D 8 2 5 2 2
Base R solution:
# Base R version: select the numeric columns of df, compute for each row its
# minimum and the position of that minimum, bind the two results to df as
# extra columns, and name those columns "min" and "col_index".
setNames(cbind(df, t(apply(df[, vapply(df, is.numeric, logical(1))], 1, function(row) {
cbind(min(row), which.min(row))}))), c(names(df), "min", "col_index"))

How to create combinations of values of one variable by group using tidyverse in R

I am using the combn function in R to get all the combinations of the values of variable y taking each time 2 values, grouping by the values of x. My expected final result is the tibble c.
But when I try to do it in tidyverse something is (very) wrong.
library(tidyverse)
# Example data: x is the grouping variable, y holds the values to combine.
df <- tibble(x = c(1, 1, 1, 2, 2, 2, 2),
y = c(8, 9, 7, 3, 5, 2, 1))
# This is what I want: for each value of x, every 2-element combination of
# the matching y values (one combination per column), with an extra row
# holding the x value.
a <- combn(df$y[df$x == 1], 2)
a <- rbind(a, rep(1, ncol(a)))
b <- combn(df$y[df$x == 2], 2)
b <- rbind(b, rep(2, ncol(b)))
# Put the combinations of both groups side by side, wrap them in a tibble and
# transpose.
c <- cbind(a, b)
c <- tibble(c)
c <- t(c)
# The tidyverse attempt below does not work: it stops with the error
# "Input `z` can't be recycled to size 3".
df %>% group_by(x) %>% mutate(z = combn(y, 2))
#> Error: Problem with `mutate()` input `z`.
#> x Input `z` can't be recycled to size 3.
#> i Input `z` is `combn(y, 2)`.
#> i Input `z` must be size 3 or 1, not 2.
#> i The error occurred in group 1: x = 1.
Created on 2020-11-18 by the reprex package (v0.3.0)
Try with combn
# For each x, build a data frame with one row per 2-element combination of y
# and store the combined result in `out`.
out <- df %>%
  group_by(x) %>%
  do(data.frame(t(combn(.$y, 2))))
# A tibble: 9 x 3
# Groups: x [2]
x X1 X2
<dbl> <dbl> <dbl>
1 1 8 9
2 1 8 7
3 1 9 7
4 2 3 5
5 2 3 2
6 2 3 1
7 2 5 2
8 2 5 1
9 2 2 1
If you have dplyr v1.0.2, you can do this
# For each x, replace the group's rows with one row per 2-element combination
# of its y values.
df %>%
  group_by(x) %>%
  group_modify(~ as_tibble(t(combn(.x$y, 2L))))
Output
# A tibble: 9 x 3
# Groups: x [2]
x V1 V2
<dbl> <dbl> <dbl>
1 1 8 9
2 1 8 7
3 1 9 7
4 2 3 5
5 2 3 2
6 2 3 1
7 2 5 2
8 2 5 1
9 2 2 1
An option with summarise and unnest
library(dplyr)
library(tidyr)
# For each x, store the data frame of 2-element combinations of y in a list
# column, then expand that list column back into ordinary rows and columns.
df %>%
  group_by(x) %>%
  summarise(
    y = list(as.data.frame(t(combn(y, 2)))),
    .groups = "drop"
  ) %>%
  unnest(y)
# A tibble: 9 x 3
# x V1 V2
# <dbl> <dbl> <dbl>
#1 1 8 9
#2 1 8 7
#3 1 9 7
#4 2 3 5
#5 2 3 2
#6 2 3 1
#7 2 5 2
#8 2 5 1
#9 2 2 1

Resources