Returning group maximum and NA using dplyr - r

I need a function I can use that returns both the group maximum and any NA values. Here is toy data:
df <- data.frame(id = rep(1:5,
each = 3),
score = rnorm(15))
df$score[c(3,7,10,14)] <- NA
# id score
# 1 1 -1.4666164
# 2 1 0.4392647
# 3 1 NA
# 4 2 -0.6010311
# 5 2 1.9845774
# 6 2 0.1749082
# 7 3 NA
# 8 3 -0.3089731
# 9 3 0.4427471
# 10 4 NA
# 11 4 1.7156319
# 12 4 -0.2354253
# 13 5 1.1781350
# 14 5 NA
# 15 5 0.0642082
I can use slice_max to get the maximum in each group:
df %>%
group_by(id) %>%
slice_max(score)
# id score
# <int> <dbl>
# 1 1 0.439
# 2 2 1.98
# 3 3 0.443
# 4 4 1.72
# 5 5 1.18
But how do I get the maximum plus any NAs returned?

We can group_by the id column, then use summarize with max. Here max is called twice, once with na.rm = T and once without; union() then combines the two results, so a group that contains a missing value returns both its maximum and NA.
library(dplyr)
df %>%
group_by(id) %>%
summarize(score = union(max(score, na.rm = T), max(score)))
UPDATE: The above code only works if you have at most one NA per ID. Thanks #KU99 for the reminder.
If you have more than one NA per ID, you need to combine the result of max with the records of NA found by is.na().
# Per id, return the largest non-missing score followed by every NA score.
# The `if` guard skips max() for a group whose scores are all NA; without it
# max(..., na.rm = TRUE) warns and contributes a spurious -Inf row.
df %>%
  group_by(id) %>%
  summarize(score = c(
    if (!all(is.na(score))) max(score, na.rm = TRUE),
    score[is.na(score)]
  ))
Result
# A tibble: 9 × 2
# Groups: id [5]
id score
<int> <dbl>
1 1 0.735
2 1 NA
3 2 0.314
4 3 0.994
5 3 NA
6 4 0.847
7 4 NA
8 5 1.95
9 5 NA
Data
df <- structure(list(id = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L,
4L, 4L, 5L, 5L, 5L), score = c(-1.05089006245306, 0.734652105895187,
NA, -1.31427279695036, -0.250038722057874, 0.314204596436828,
NA, 0.994420599790523, 0.855768431757766, NA, 0.834325037545013,
0.846790152407738, 1.95410525460771, NA, 0.971120269710021)), row.names = c(NA,
-15L), class = "data.frame")

One option would be to use slice and | to create a logical condition with is.na to return the NA rows and the max rows.
library(dplyr)
df %>%
group_by(id) %>%
slice(which(score == max(score, na.rm = T)|is.na(score)))
Another option would be to use slice_max as you did, but then use bind_rows to add the NA rows back to the data frame.
library(dplyr)
df %>%
group_by(id) %>%
slice_max(score) %>%
bind_rows(df %>% filter(is.na(score))) %>%
arrange(id)
Output
id score
<int> <dbl>
1 1 -0.161
2 1 NA
3 2 1.49
4 3 -0.451
5 3 NA
6 4 0.878
7 4 NA
8 5 -0.0652
9 5 NA
Data
df <- structure(list(id = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L,
4L, 4L, 5L, 5L, 5L), score = c(-0.161217942983375, -0.456571996252207,
NA, 0.540071362460494, 1.49325799630099, -0.17985218510166, NA,
-0.451301758592, -0.839100876399644, NA, -0.0432130218441599,
0.87779273806634, -0.339260854059069, NA, -0.065177224102029)), row.names = c(NA,
-15L), class = "data.frame")

Using a custom function you could do:
library(dplyr)
set.seed(123)
# Variant of slice_max() that also keeps the rows whose `order_by` value is NA.
# All arguments are forwarded to slice_max() unchanged; the NA rows are
# appended after the rows selected by slice_max().
slice_max_na <- function(.data, order_by, ..., n, prop, with_ties = TRUE) {
  top_rows <- slice_max(
    .data,
    order_by = {{ order_by }},
    ...,
    n = n,
    prop = prop,
    with_ties = with_ties
  )
  na_rows <- filter(.data, is.na({{ order_by }}))
  bind_rows(top_rows, na_rows)
}
df %>%
group_by(id) %>%
slice_max_na(score)
#> # A tibble: 9 × 2
#> # Groups: id [5]
#> id score
#> <int> <dbl>
#> 1 1 -0.230
#> 2 2 1.72
#> 3 3 -0.687
#> 4 4 1.22
#> 5 5 0.401
#> 6 1 NA
#> 7 3 NA
#> 8 4 NA
#> 9 5 NA

Here is another dplyr version, this time using rank:
library(dplyr)
df %>%
group_by(id) %>%
mutate(rank = rank(-score, ties.method = "random")) %>%
filter(rank == 1 | is.na(score)) %>%
select(-rank)
id score
<int> <dbl>
1 1 0.505
2 1 NA
3 2 -0.109
4 3 NA
5 3 1.45
6 4 NA
7 4 0.355
8 5 NA
9 5 -0.298

Related

Extracting sequences from columns using R

I have a df which looks like this
ID X003-APP X005-APP X008-APP X003-COP X004-COP X008-PIN X009-PIN
363 NA NA 1 0 NA 4 5
364 0 2 NA 1 5 1 5
678 0 NA NA 5 NA NA NA
713 1 1 1 1 1 1 1
219 1 2 3 3 NA 4 5
234 NA NA NA 2 3 NA NA
321 2 3 1 NA NA 1 2
I am interested in minimum counts for non-null values across the column substrings APP, COP and PIN. My required output is:
ID APP COP PIN
363 1 1 1
364 1 1 1
678 1 1 0
713 1 1 1
219 1 1 1
234 0 1 0
321 1 0 1
For reference, I am sharing the dput():
structure(list(ID = c(363L, 364L, 678L, 713L, 219L, 234L, 321L),
X003.APP = c(NA, 0L, 0L, 1L, 1L, NA, 2L),
X005.APP = c(NA, 2L, NA, 1L, 2L, NA, 3L),
X008.APP = c(1L, NA, NA, 1L, 3L, NA, 1L),
X003.COP = c(0L, 1L, 5L, 1L, 3L, 2L, NA),
X004.COP = c(NA, 5L, NA, 1L, NA, 3L, NA),
X008.PIN = c(4L, 1L, NA, 1L, 4L, NA, 1L),
X009.PIN = c(5L, 5L, NA, 1L, 5L, NA, 2L)),
class = "data.frame", row.names = c(NA, -7L))
Edit:
Later on, I would like to analyse 2 and 3 sequences across IDs. For example, I am ultimately, interested in minimum counts for non-null values across the column substrings APP, COP and PIN. My ultimate required output for a sequence of length 2 would be:
Spec_1 Spec_2 Counts
APP COP 5
APP PIN 5
COP PIN 4
Or correspondingly, my required output for a sequence of length 3 would be:
Spec_1 Spec_2 Spec_3 Counts
APP COP PIN 4
Is there an easy way to achieve this? It would be great to have a solution that could cater for longer sequences - even beyond 3. Thank you very much for your time.
You may try
library(reshape2)
library(tidyverse)
df %>%
reshape2::melt(id = "ID") %>%
separate(variable, into = c("a", "Spec"), sep = "\\.") %>%
group_by(ID, Spec) %>%
summarize(value = as.numeric(any(!is.na(value)))) %>%
filter(value == 1) %>%
pivot_wider(names_from = "Spec", values_from = "value") %>%
replace(is.na(.), 0)
ID APP COP PIN
<int> <dbl> <dbl> <dbl>
1 219 1 1 1
2 234 0 1 0
3 321 1 0 1
4 363 1 1 1
5 364 1 1 1
6 678 1 1 0
7 713 1 1 1
Is your edited one and
df %>%
reshape2::melt(id = "ID") %>%
separate(variable, into = c("a", "Spec"), sep = "\\.") %>%
group_by(ID, Spec) %>%
summarize(value = any(!is.na(value))) %>%
filter(value) %>%
group_by(ID) %>%
filter(n() > 1) %>%
summarise(Spec = combn(Spec, 2, simplify = F)) %>%
unnest_wider(Spec, names_sep = "_") %>%
group_by(Spec_1, Spec_2) %>%
summarize(Counts = n())
Spec_1 Spec_2 Counts
<chr> <chr> <int>
1 APP COP 5
2 APP PIN 5
3 COP PIN 4
is your previous one.
And for sequences of length 3:
df %>%
reshape2::melt(id = "ID") %>%
separate(variable, into = c("a", "Spec"), sep = "\\.") %>%
group_by(ID, Spec) %>%
summarize(value = any(!is.na(value))) %>%
filter(value) %>%
group_by(ID) %>%
filter(n() > 2) %>%
summarise(Spec = combn(Spec, 3, simplify = F)) %>%
unnest_wider(Spec, names_sep = "_") %>%
group_by(Spec_1, Spec_2, Spec_3) %>%
summarize(Counts = n())
Spec_1 Spec_2 Spec_3 Counts
<chr> <chr> <chr> <int>
1 APP COP PIN 4
Try this using dplyr
library(dplyr)
df |> rowwise() |> transmute( ID,
APP = case_when(all(is.na(c_across(contains("APP")))) ~ 0 , TRUE ~ 1) ,
COP = case_when(all(is.na(c_across(contains("COP")))) ~ 0 , TRUE ~ 1) ,
PIN = case_when(all(is.na(c_across(contains("PIN")))) ~ 0 , TRUE ~ 1)) -> df1
output
# A tibble: 7 × 4
# Rowwise:
ID APP COP PIN
<int> <dbl> <dbl> <dbl>
1 363 1 1 1
2 364 1 1 1
3 678 1 1 0
4 713 1 1 1
5 219 1 1 1
6 234 0 1 0
7 321 1 0 1
For your second requirement you can use
df1 |> transmute(AC = case_when(sum(c_across(c(APP,COP))) == 2 ~ 1 , TRUE ~ 0) ,
AP = case_when(sum(c_across(c(APP,PIN))) == 2 ~ 1 , TRUE ~ 0) ,
CP = case_when(sum(c_across(c(PIN,COP))) == 2 ~ 1 , TRUE ~ 0) ,
ACP = case_when(sum(c_across(c(APP,COP,PIN))) == 3 ~ 1 , TRUE ~ 0)) |> ungroup() |>
summarise(APP_COP = sum(AC) , APP_PIN = sum(AP) , COP_PIN = sum(CP) , APP_COP_PIN = sum(ACP))
output
# A tibble: 1 × 4
APP_COP APP_PIN COP_PIN APP_COP_PIN
<dbl> <dbl> <dbl> <dbl>
1 5 5 4 4

Determine if any of the values in a set of variables match the value in another variable

For each person in my dataset (1 row per person), I am trying to search a set of variables (months, so in my example Jan - Jul) to see if any of them matches the value in a different variable (follow-up month). I want to create a new variable that says yes or no depending on whether any value in that set of variables matches the value of that one variable.
Basically I am trying to create a timeline for a follow-up visit. I have 'Have' and 'Want' data sets below.
Thank you!
HAVE:
ID
Jan
Feb
Mar
Apr
May
June
Jul
Follow-up month
1
NA
2
3
4
NA
NA
NA
4
2
NA
NA
NA
4
NA
NA
NA
6
3
1
NA
3
4
5
NA
NA
5
4
NA
NA
NA
NA
NA
6
7
9
WANT:
ID
Jan
Feb
Mar
Apr
May
June
Jul
Follow-up month
Follow_up_Status
1
NA
2
3
4
NA
NA
NA
4
Yes
2
NA
NA
NA
4
NA
NA
NA
6
No
3
1
NA
3
4
5
NA
NA
5
Yes
4
NA
NA
NA
NA
NA
6
7
9
No
Here is a version with pivoting:
library(dplyr)
library(tidyr)
df %>%
pivot_longer(
-c(ID, Follow.up_month)
) %>%
group_by(ID) %>%
mutate(Follow_up_status = ifelse(Follow.up_month %in% value, "Yes", "No")) %>%
pivot_wider(
names_from = name,
values_from = value
)
output:
ID Follow.up_month Follow_up_status Jan Feb Mar Apr May June Jul
<int> <int> <chr> <int> <int> <int> <int> <int> <int> <int>
1 1 4 Yes NA 2 3 4 NA NA NA
2 2 6 No NA NA NA 4 NA NA NA
3 3 5 Yes 1 NA 3 4 5 NA NA
4 4 9 No NA NA NA NA NA 6 7
I think rowwise and c_across will work for you:
library(dplyr)
quux %>%
rowwise() %>%
mutate(
Follow2_int = which(c_across(Jan:Jul) %in% Follow.up.month)[1],
Follow2_lgl = !is.na(Follow2_int)
) %>%
ungroup()
# # A tibble: 4 x 12
# ID Jan Feb Mar Apr May June Jul Follow.up.month Follow_up_Status Follow2_int Follow2_lgl
# <int> <int> <int> <int> <int> <int> <int> <int> <int> <chr> <int> <lgl>
# 1 1 NA 2 3 4 NA NA NA 4 Yes 4 TRUE
# 2 2 NA NA NA 4 NA NA NA 6 No NA FALSE
# 3 3 1 NA 3 4 5 NA NA 5 Yes 5 TRUE
# 4 4 NA NA NA NA NA 6 7 9 No NA FALSE
Edited to include both the logical and the first column number (counting within Jan:Jul) that matches.
Data
quux <- structure(list(ID = 1:4, Jan = c(NA, NA, 1L, NA), Feb = c(2L, NA, NA, NA), Mar = c(3L, NA, 3L, NA), Apr = c(4L, 4L, 4L, NA), May = c(NA, NA, 5L, NA), June = c(NA, NA, NA, 6L), Jul = c(NA, NA, NA, 7L), Follow.up.month = c(4L, 6L, 5L, 9L), Follow_up_Status = c("Yes", "No", "Yes", "No")), class = "data.frame", row.names = c(NA, -4L))
Another dplyr solution.
library(dplyr)
# Count, per row, how many of the Jan:Jul columns equal Follow_up_month
# (NA cells never count as a match).
n_matches <- dat %>%
  mutate(across(Jan:Jul, .fns = ~ .x == Follow_up_month)) %>%
  select(Jan:Jul) %>%
  rowSums(na.rm = TRUE)
# Thresholding with `> 0` labels rows with several matching months "Yes";
# recoding only the strings "0" and "1" would leave such rows as "2", "3", ...
dat2 <- dat %>%
  mutate(Follow_up_status = if_else(n_matches > 0, "Yes", "No"))
dat2
# ID Jan Feb Mar Apr May June Jul Follow_up_month Follow_up_status
# 1 1 NA 2 3 4 NA NA NA 4 Yes
# 2 2 NA NA NA 4 NA NA NA 6 No
# 3 3 1 NA 3 4 5 NA NA 5 Yes
# 4 4 NA NA NA NA NA 6 7 9 No
Data
dat <- structure(list(ID = 1:4, Jan = c(NA, NA, 1L, NA), Feb = c(2L, NA, NA, NA), Mar = c(3L, NA, 3L, NA), Apr = c(4L, 4L, 4L, NA), May = c(NA, NA, 5L, NA), June = c(NA, NA, NA, 6L), Jul = c(NA, NA, NA, 7L), Follow_up_month = c(4L, 6L, 5L, 9L)), class = "data.frame", row.names = c(NA, -4L))
Performance
When the data frame is small, all the solutions here will work. But when data frame is large, the pivoting approach and the rowwise approach may be slow. Below I tried to show the performance comparison of the three solutions. Although the final outputs are different, with different data type and column order, I will still compare them, assuming that these differences are acceptable.
Here is the setup.
library(microbenchmark)
library(dplyr)
library(tidyr)
# Benchmark wrapper for the pivoting approach: reshape the month columns to
# long form, flag each ID whose Follow_up_month occurs among its month values,
# then reshape back to one row per ID.
pivot_fun <- function(x) {
  long <- pivot_longer(x, -c(ID, Follow_up_month))
  flagged <- long %>%
    group_by(ID) %>%
    mutate(Follow_up_status = ifelse(Follow_up_month %in% value, "Yes", "No"))
  pivot_wider(flagged, names_from = name, values_from = value)
}
# Benchmark wrapper for the rowwise()/c_across() approach.
# The body used to be a verbatim copy of pivot_fun, so the benchmark timed the
# pivoting approach twice and never exercised the rowwise code.
# NOTE(review): timings recorded with the old body need to be re-measured.
#
# x: data frame with month columns Jan:Jul and a Follow_up_month column.
# Returns x (ungrouped) with two extra columns:
#   Follow2_int - position within Jan:Jul of the first month matching
#                 Follow_up_month, or NA when none matches
#   Follow2_lgl - TRUE when such a match exists
rowwise_fun <- function(x) {
  x %>%
    rowwise() %>%
    mutate(
      # which(...)[1] is NA when no month matches
      Follow2_int = which(c_across(Jan:Jul) %in% Follow_up_month)[1],
      Follow2_lgl = !is.na(Follow2_int)
    ) %>%
    ungroup()
}
# Benchmark wrapper for the vectorised rowSums() approach.
#
# x: data frame with month columns Jan:Jul and a Follow_up_month column.
# Returns x with an added Follow_up_status column: "Yes" when at least one of
# Jan:Jul equals Follow_up_month in that row, otherwise "No".
rowSums_fun <- function(x) {
  # Number of month columns per row that equal Follow_up_month; NA cells are
  # ignored via na.rm = TRUE.
  n_matches <- x %>%
    mutate(across(Jan:Jul, .fns = ~ .x == Follow_up_month)) %>%
    select(Jan:Jul) %>%
    rowSums(na.rm = TRUE)
  # `> 0` rather than recoding "0"/"1": a row with two or more matching months
  # must still be "Yes" instead of falling through as "2", "3", ...
  mutate(x, Follow_up_status = if_else(n_matches > 0, "Yes", "No"))
}
Here is the comparison on the original example. The solution provided in this post is the fastest.
set.seed(1)
microbenchmark(pivot_fun(dat), rowwise_fun(dat), rowSums_fun(dat))
# Unit: milliseconds
# expr min lq mean median uq max neval
# pivot_fun(dat) 11.037401 11.927201 13.58003 12.659001 13.882151 30.0207 100
# rowwise_fun(dat) 10.907602 11.670701 13.56004 12.295051 13.614201 24.4249 100
# rowSums_fun(dat) 6.590502 7.147702 8.48469 7.714351 8.808602 17.0109 100
And here is a comparison on a larger data frame. The solution provided in this post is about 10 times faster than other answers.
set.seed(12)
n <- 100000
dat_n <- data.frame(
ID = 1:n,
Jan = sample(dat$Jan, size = n, replace = TRUE),
Feb = sample(dat$Feb, size = n, replace = TRUE),
Mar = sample(dat$Mar, size = n, replace = TRUE),
Apr = sample(dat$Apr, size = n, replace = TRUE),
May = sample(dat$May, size = n, replace = TRUE),
June = sample(dat$June, size = n, replace = TRUE),
Jul = sample(dat$Jul, size = n, replace = TRUE),
Follow_up_month = sample(1:12, size = n, replace = TRUE)
)
set.seed(123)
microbenchmark(pivot_fun(dat_n), rowwise_fun(dat_n), rowSums_fun(dat_n))
# Unit: milliseconds
# expr min lq mean median uq max neval
# pivot_fun(dat_n) 1168.416 1405.5724 1496.6545 1471.0253 1574.3927 2327.1624 100
# rowwise_fun(dat_n) 1159.790 1401.0586 1494.9987 1465.8929 1580.0092 1982.5099 100
# rowSums_fun(dat_n) 84.494 102.0946 122.2843 111.8158 123.6288 296.3234 100

Counting occurencies in every column in R

Hello, I need to count the occurrences of every number in each column.
Example data-frame:
A B C
2 1 2
2 1 1
1 1 3
3 3 3
3 2 2
2 1 2
I want my output to look like this
how_much A B C
1 1 4 1
2 3 1 3
3 2 1 2
In tidyverse you could do:
library(tidyverse)
# Reshape to one row per (column, value) cell, count each pair, then spread the
# counts back out with one column per original column.
# pivot_longer() replaces the superseded gather(); id_cols is passed by name
# because supplying it positionally to pivot_wider() is deprecated.
# NOTE(review): unlike gather(), pivot_longer() refuses to combine columns of
# incompatible types - fine for the all-integer example data.
df1 %>%
  pivot_longer(everything(), names_to = "key", values_to = "value") %>%
  count(key, value) %>%
  pivot_wider(
    id_cols = value,
    names_from = key,
    values_from = n,
    values_fill = 0
  )
value A B C
<int> <int> <int> <int>
1 1 1 4 1
2 2 3 1 3
3 3 2 1 2
We can use table
table(unlist(df1), names(df1)[c(col(df1))])
-output
A B C
1 1 4 1
2 3 1 3
3 2 1 2
Or loop over the columns with sapply, and apply table
sapply(df1, table)
A B C
1 1 4 1
2 3 1 3
3 2 1 2
data
df1 <- structure(list(A = c(2L, 2L, 1L, 3L, 3L, 2L), B = c(1L, 1L, 1L,
3L, 2L, 1L), C = c(2L, 1L, 3L, 3L, 2L, 2L)),
class = "data.frame", row.names = c(NA,
-6L))
To make the solution more flexible, so that it works for whatever numbers occur in the data, we can use the following approach based on purrr functions.
library(dplyr)
library(purrr)
df1 %>%
map(~ unique(.x) %>% sort()) %>% reduce(~ union(..1, ..2)) %>%
bind_cols(map_dfr(., ~ map_dfc(df1, function(a) sum(a == .x)))) %>%
rename(what = ...1)
# A tibble: 3 x 4
what A B C
<int> <int> <int> <int>
1 1 1 4 1
2 2 3 1 3
3 3 2 1 2
A slightly verbose answer, but it will work on all data types.
set.seed(1234)
df1 <- data.frame(A = sample(letters[1:3], 8, T),
B = sample(letters[1:3], 8, T),
C = sample(letters[1:3], 8, T))
df1
#> A B C
#> 1 b c b
#> 2 b b a
#> 3 a b c
#> 4 c b c
#> 5 a c c
#> 6 a b a
#> 7 b b b
#> 8 b b a
library(tidyverse)
unique(unlist(apply(df1, 1, unique))) %>% as.data.frame() %>% setNames('how_much') %>%
bind_cols(map_df(unique(unlist(apply(df1, 1, unique))), ~map_int(df1, \(x) sum(x %in% .x) ) ))
#> how_much A B C
#> 1 b 4 6 2
#> 2 c 1 2 3
#> 3 a 3 0 3
Created on 2021-06-23 by the reprex package (v2.0.0)

How to calculate mean , min, and max across when grouping using dplyr?

So I have a data frame , simplified as this:
ID A B C
1 1 5 0
2 3 0 3
3 0 2 1
2 5 9 1
3 3 5 3
1 2 6 4
Simply put, I want to calculate the following for each row:
Mean
Median
Max
Min
Easy enough, but the hard part for me is after taking each, how do I create a mean value to represent each ID.
So after I get these values, how do I show the AVERAGE MEAN/MED/MAX/MIN for each ID???
Expected output:
(1)
ID Mean Median Min Max
1 2 1 0 5
2 2 3 0 3
3 1 1 0 2
2 5 5 1 9
3 3.66 3 3 5
1 4 4 2 6
(2)
ID AvgMean AvgMedian AvgMin AvgMax
1 3 2.5 1 5.5
2 3.5 4 1 6
3 2.33 3 3 3.5
You can try something like this:
library(dplyr)
df %>%
group_by(ID) %>%
summarise(mean_ = mean(c_across(A:C), na.rm = T),
medi_ = median(c_across(A:C), na.rm = T),
max_ = max(c_across(A:C), na.rm = T),
min_ = min(c_across(A:C), na.rm = T))
`summarise()` ungrouping output (override with `.groups` argument)
# A tibble: 3 x 5
ID mean_ medi_ max_ min_
<int> <dbl> <dbl> <int> <int>
1 1 3 3 6 0
2 2 3.5 3 9 0
3 3 2.33 2.5 5 0
For the per-row statistics (expected output (1) in the question):
df %>%
rowwise() %>%
summarise(mean_ = mean(c_across(A:C), na.rm = T),
medi_ = median(c_across(A:C), na.rm = T),
max_ = max(c_across(A:C), na.rm = T),
min_ = min(c_across(A:C), na.rm = T))
`summarise()` ungrouping output (override with `.groups` argument)
# A tibble: 6 x 4
mean_ medi_ max_ min_
<dbl> <int> <int> <int>
1 2 1 5 0
2 2 3 3 0
3 1 1 2 0
4 5 5 9 1
5 3.67 3 5 3
6 4 4 6 2
With data:
df <- structure(list(ID = c(1L, 2L, 3L, 2L, 3L, 1L), A = c(1L, 3L,
0L, 5L, 3L, 2L), B = c(5L, 0L, 2L, 9L, 5L, 6L), C = c(0L, 3L,
1L, 1L, 3L, 4L)), class = "data.frame", row.names = c(NA, -6L
))
thanks for posting expected output. I would consider using summarize and across together
library(dplyr)
# Mean of A, B and C within each ID.
# group_by() has to be piped into summarize(); written as a separate statement,
# summarize() is called without any data.
# Columns are selected by name so the selection does not depend on how
# positions are counted once ID is a grouping column.
# The result is printed rather than assigned, so `df` keeps the original data.
df %>%
  group_by(ID) %>%
  summarize(across(A:C, mean))
In base R, the following seems to do what the question asks for.
out is a data.frame of the statistics ungrouped and out2 of the grouped ones.
# Row-wise summary statistics.
# X: a data frame or matrix; every row is summarised on its own.
# Returns a matrix with one row per row of X and the columns
# Mean, Median, Min and Max.
fun <- function(X) {
  # Statistics of a single row; na.rm is forwarded to each summary function.
  row_stats <- function(x, na.rm = FALSE) {
    c(
      Mean = mean(x, na.rm = na.rm),
      Median = median(x, na.rm = na.rm),
      Min = min(x, na.rm = na.rm),
      Max = max(x, na.rm = na.rm)
    )
  }
  # apply() returns one column per input row, so transpose to get rows back.
  stats_by_column <- apply(X, 1, row_stats)
  t(stats_by_column)
}
# Per-row statistics, computed separately for the rows of each ID.
by_id <- lapply(split(df1[-1], df1$ID), fun)
# Average of the per-row statistics within each ID.
out2 <- lapply(by_id, colMeans)
# Label every row with the ID of the group it came from. The row names of the
# stacked matrix are the original row numbers, so they cannot serve as the ID.
out <- cbind.data.frame(
  ID = rep(names(by_id), vapply(by_id, nrow, integer(1))),
  do.call(rbind, by_id)
)
out2 <- cbind.data.frame(ID = names(out2), do.call(rbind, out2))
out
# ID Mean Median Min Max
#1 1 2.000000 1 0 5
#6 1 4.000000 4 2 6
#2 2 2.000000 3 0 3
#4 2 5.000000 5 1 9
#3 3 1.000000 1 0 2
#5 3 3.666667 3 3 5
out2
# ID Mean Median Min Max
#1 1 3.000000 2.5 1.0 5.5
#2 2 3.500000 4.0 0.5 6.0
#3 3 2.333333 2.0 1.5 3.5

Sorting and calculating sum and rank with new columns in R

I have 200 columns and want to calculate mean and rank and then generate columns. Here is an example of data
# Example data: seven question columns whose trailing letter (a, b or c) is the
# category used for the per-row sums.
# `header = TRUE` is spelled out instead of relying on partial argument
# matching (`h`) and the reassignable shorthand `T`.
df <- read.table(text = "Q1a Q2a Q3b Q4c Q5a Q6c Q7b
1 2 4 2 2 0 1
3 2 1 2 2 1 1
4 3 2 1 1 1 1", header = TRUE)
I want to sum a, b and c for each row, and then sum them together. Next I want to calculate the rank for each row. I want to generate the following table:
Q1a Q2a Q3b Q4c Q5a Q6c Q7b a b c Total Rank
1 2 4 2 2 0 1 5 5 2 12 2
3 2 1 2 2 1 1 7 2 3 12 2
4 3 2 1 1 1 1 8 3 2 13 1
library(dplyr)
# Append one row-sum column per category (a, b, c), then the overall total and
# its rank (largest total = rank 1, ties share the smallest rank).
df %>%
  cbind(sapply(c("a", "b", "c"), function(x) {
    # Anchor the pattern at the end of the name so a category letter that
    # appears elsewhere in a column name is not picked up by mistake.
    rowSums(.[, grep(paste0(x, "$"), names(.)), drop = FALSE])
  })) %>%
  mutate(
    Total = a + b + c,
    Rank = match(Total, sort(Total, decreasing = TRUE))
  )
Output is:
Q1a Q2a Q3b Q4c Q5a Q6c Q7b a b c Total Rank
1 1 2 4 2 2 0 1 5 5 2 12 2
2 3 2 1 2 2 1 1 7 2 3 12 2
3 4 3 2 1 1 1 1 8 3 2 13 1
Sample data:
df <- structure(list(Q1a = c(1L, 3L, 4L), Q2a = c(2L, 2L, 3L), Q3b = c(4L,
1L, 2L), Q4c = c(2L, 2L, 1L), Q5a = c(2L, 2L, 1L), Q6c = c(0L,
1L, 1L), Q7b = c(1L, 1L, 1L)), class = "data.frame", row.names = c(NA,
-3L))
You can also go with the tidyverse approach. However, it is longer.
library(tidyverse)
df %>%
rownames_to_column(var = "ID") %>%
gather(question, value, -ID) %>%
mutate(type = substr(question, 3,3)) %>%
group_by(ID, type) %>%
summarise(sumType = sum(value, na.rm = TRUE)) %>%
as.data.frame() %>%
spread(type, sumType) %>%
mutate(Total = a+b+c,
Rank = match(Total, sort(Total, decreasing = T)))
Results:
ID a b c Total Rank
1 1 5 5 2 12 2
2 2 7 2 3 12 2
3 3 8 3 2 13 1

Resources