Summing values in R based on column value with dplyr - r

I have a data set that has the following information:
Subject Value1 Value2 Value3 UniqueNumber
001 1 0 1 3
002 0 1 1 2
003 1 1 1 1
If the value of UniqueNumber > 0, I would like to sum the values with dplyr for each subject from rows 1 through UniqueNumber and calculate the mean. So for Subject 001, sum = 2 and mean = .67.
total = 0;
average = 0;
for(i in 1:length(Data$Subject)){
for(j in 1:ncols(Data)){
if(Data$UniqueNumber[i] > 0){
total[i] = sum(Data[i,1:j])
average[i] = mean(Data[i,1:j])
}
}
Edit: I am only looking to sum through the number of columns listed in the 'UniqueNumber' column. So this is looping through every row and stopping at column listed in 'UniqueNumber'.
Example: Row 2 with Subject 002 should sum up the values in columns 'Value1' and 'Value2', while Row 3 with Subject 003 should only sum the value in column 'Value1'.

Not a tidyverse fan/expert, but I would try this using long format. Then, just filter by row index per group and then run any functions you want on a single column (much easier this way).
library(tidyr)
library(dplyr)
Data %>%
gather(variable, value, -Subject, -UniqueNumber) %>% # long format
group_by(Subject) %>% # group by Subject in order to get row counts
filter(row_number() <= UniqueNumber) %>% # filter by row index
summarise(Mean = mean(value), Total = sum(value)) %>% # do the calculations
ungroup()
## A tibble: 3 x 3
# Subject Mean Total
# <int> <dbl> <int>
# 1 1 0.667 2
# 2 2 0.5 1
# 3 3 1 1
A very similar way to achieve this could be filtering by the integers in the column names. The filter step comes before the group_by so it could potentially increase performance (or not?) but it is less robust as I'm assuming that the cols of interest are called "Value#"
Data %>%
gather(variable, value, -Subject, -UniqueNumber) %>% #long format
filter(as.numeric(gsub("Value", "", variable, fixed = TRUE)) <= UniqueNumber) %>% #filter
group_by(Subject) %>% # group by Subject
summarise(Mean = mean(value), Total = sum(value)) %>% # do the calculations
ungroup()
## A tibble: 3 x 3
# Subject Mean Total
# <int> <dbl> <int>
# 1 1 0.667 2
# 2 2 0.5 1
# 3 3 1 1
Just for fun, adding a data.table solution
library(data.table)
data.table(Data) %>%
melt(id = c("Subject", "UniqueNumber")) %>%
.[as.numeric(gsub("Value", "", variable, fixed = TRUE)) <= UniqueNumber,
.(Mean = round(mean(value), 3), Total = sum(value)),
by = Subject]
# Subject Mean Total
# 1: 1 0.667 2
# 2: 2 0.500 1
# 3: 3 1.000 1

Here is another method that uses tidyr::nest to collect the Values columns into a list so that we can iterate through the table with map2. In each row, we select the correct values from the Values list-col and take the sum or mean respectively.
library(tidyverse)
tbl <- read_table2(
"Subject Value1 Value2 Value3 UniqueNumber
001 1 0 1 3
002 0 1 1 2
003 1 1 1 1"
)
tbl %>%
filter(UniqueNumber > 0) %>%
nest(starts_with("Value"), .key = "Values") %>%
mutate(
sum = map2_dbl(UniqueNumber, Values, ~ sum(.y[1:.x], na.rm = TRUE)),
mean = map2_dbl(UniqueNumber, Values, ~ mean(as.numeric(.y[1:.x], na.rm = TRUE))),
)
#> # A tibble: 3 x 5
#> Subject UniqueNumber Values sum mean
#> <chr> <dbl> <list> <dbl> <dbl>
#> 1 001 3 <tibble [1 × 3]> 2 0.667
#> 2 002 2 <tibble [1 × 3]> 1 0.5
#> 3 003 1 <tibble [1 × 3]> 1 1
Created on 2019-02-14 by the reprex package (v0.2.1)

Check this solution:
df %>%
gather(key, val, Value1:Value3) %>%
group_by(Subject) %>%
mutate(
Sum = sum(val[c(1:(UniqueNumber[1]))]),
Mean = mean(val[c(1:(UniqueNumber[1]))]),
) %>%
spread(key, val)
Output:
Subject UniqueNumber Sum Mean Value1 Value2 Value3
<chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 001 3 2 0.667 1 0 1
2 002 2 1 0.5 0 1 1
3 003 1 1 1 1 1 1

OP might be interested only for dplyr solution but for comparison purposes and for future readers a base R option using mapply
cols <- grep("^Value", names(df))
cbind(df, t(mapply(function(x, y) {
if (y > 0) {
vals = as.numeric(df[x, cols[1:y]])
c(Sum = sum(vals, na.rm = TRUE), Mean = mean(vals, na.rm = TRUE))
}
else
c(0, 0)
},1:nrow(df), df$UniqueNumber)))
# Subject Value1 Value2 Value3 UniqueNumber Sum Mean
#1 1 1 0 1 3 2 0.667
#2 2 0 1 1 2 1 0.500
#3 3 1 1 1 1 1 1.000
Here we subset each row based on its respective UniqueNumber and then calculate it's sum and mean if the UniqueNumber value is greater than 0 or else return only 0.

A solution that uses purrr::map_df(which is from the same author as dplyr).
library(dplyr)
library(purrr)
l_dat <- split(dat, dat$Subject) # first we need to split in a list
map_df(l_dat, function(x) {
n_cols <- x$UniqueNumber # finds the number of columns
x <- as.numeric(x[2:(n_cols+1)]) # subsets x and converts to numeric
mean(x, na.rm=T) # mean to be returned
})
# output:
# # A tibble: 1 x 3
# `1` `2` `3`
# <dbl> <dbl> <dbl>
# 1 0.667 0.5 1
Another option (output format closer to a dplyr solution):
map_df(l_dat, function(x) {
n_cols <- x$UniqueNumber
id <- x$Subject
x <- as.numeric(x[2:(n_cols+1)])
tibble(id=id, mean_values=mean(x, na.rm=T))
})
# # A tibble: 3 x 2
# id mean_values
# <int> <dbl>
# 1 1 0.667
# 2 2 0.5
# 3 3 1
Just as an example I added a sum() then divided by length(x)-1:
map_df(l_dat, function(x) {
n_cols <- x$UniqueNumber
id <- x$Subject
x <- as.numeric(x[2:(n_cols+1)])
tibble(id=id,
mean_values=sum(x, na.rm=T)/(length(x)-1)) # change here
})
# # A tibble: 3 x 2
# id mean_values
# <int> <dbl>
# 1 1 1.
# 2 2 1.
# 3 3 Inf #beware of this case where you end up dividing by 0
Data:
tt <- "Subject Value1 Value2 Value3 UniqueNumber
001 1 0 1 3
002 0 1 1 2
003 1 1 1 1"
dat <- read.table(text=tt, header=T)

I think the easiest way is to set to NA the zeros that really should be NA, then use rowSums and rowMeans on the appropriate subset of columns.
Data[2:4][(col(dat[2:4])>dat[[5]])] <- NA
Data
# Subject Value1 Value2 Value3 UniqueNumber
# 1 1 1 0 1 3
# 2 2 0 1 NA 2
# 3 3 1 NA NA 1
library(dplyr)
Data%>%
mutate(sum = rowSums(.[2:4], na.rm = TRUE),
mean = rowMeans(.[2:4], na.rm = TRUE))
# Subject Value1 Value2 Value3 UniqueNumber sum mean
# 1 1 1 0 1 3 2 0.6666667
# 2 2 0 1 NA 2 1 0.5000000
# 3 3 1 NA NA 1 1 1.0000000
or transform(Data, sum = rowSums(Data[2:4],na.rm = TRUE), mean = rowMeans(Data[2:4],na.rm = TRUE)) to stay in base R.
data
Data <- structure(
list(Subject = 1:3,
Value1 = c(1L, 0L, 1L),
Value2 = c(0L, 1L, NA),
Value3 = c(1L, NA, NA),
UniqueNumber = c(3L, 2L, 1L)),
.Names = c("Subject","Value1", "Value2", "Value3", "UniqueNumber"),
row.names = c(NA, 3L), class = "data.frame")

Related

R Count duplicates between two dataframes

I have two dataframes df1 and df2. They both have a column 'ID'. For each row in DF1, I would like to find out how many duplicates of its ID there are in df2 and add the count to that row. If there are no duplicates, the count should return as 0.
# # A tibble: 4 x 3
# ID a b
# <dbl> <dbl> <dbl>
# 1 1_234 1 1
# 2 1_235 1 2
# 3 2_222 1 1
# 4 2_654 1 2
# # A tibble: 4 x 3
# ID a b
# <dbl> <dbl> <dbl>
# 1 1_234 1 1
# 2 1_235 1 2
# 3 1_234 1 1
# 4 3_234 1 2
Using dplyr:
Your data:
df1 <- data.frame(ID = c("1_234","1_235","2_222","2_654"),
a = c(1,1,1,1),
b = c(1,2,1,2))
df2 <- data.frame(ID = c("1_234","1_235","1_234","3_235"),
a = c(1,1,1,1),
b = c(1,2,1,2))
Edit: considering only the IDs:
output <- left_join(df1,
as.data.frame(table(df2$ID)),
by = c("ID" = "Var1")) %>%
mutate(Freq = ifelse(is.na(Freq), 0, Freq))
Output:
ID a b Freq
1 1_234 1 1 2
2 1_235 1 2 1
3 2_222 1 1 0
4 2_654 1 2 0
A base R option using subset + aggregate
subset(
aggregate(
n ~ .,
rbind(
cbind(df1, n = 1),
cbind(df2, n = 1)
), function(x) length(x) - 1
), ID %in% df1$ID
)
gives
ID a b n
1 1_234 1 1 2
2 2_222 1 1 0
3 1_235 1 2 1
4 2_654 1 2 0
I think you can do it with a simple sapply() and base r (no extra packages).
df1$count <- sapply(df1$ID, function(x) sum(df2$ID == x))
We may also use outer
df1$count <- rowSums(outer(df1$ID, df2$ID, FUN = `==`))
df1$count
[1] 2 1 0 0
We could use semi_join and n() to get the count of duplicates:
library(dplyr)
df1 %>%
semi_join(df2, by="ID") %>%
summarise(duplicates_df1_df2 = n())
Output:
duplicates_df1_df2
1 2

Comparison across unique readers

Reprex
dat <- data.frame(id = c(1,1,2,2,3,3,4,4),
reader = c(1,4,2,3,3,4,2,5),
response = c("CR","PR","SD","SD","PR","PR","CR","SD"))
Problem: Wish to compare response across each unique reader for each id. There are 5 unique readers in total, but each id only has 2 individual readers.
The resulting dataset would look something like this:
# A tibble: 4 x 4
id read1 read2 matchflag
<dbl> <chr> <chr> <dbl>
1 1 CR PR 0
2 2 SD SD 1
3 3 PR PR 1
4 4 CR SD 0
A data.table option
dcast(
setDT(df),
id ~ paste0("reader", rowid(id)),
value.var = "response"
)[
,
match_flag := +(reader1 == reader2)
][]
gives
id reader1 reader2 match_flag
1: 1 CR PR 0
2: 2 SD SD 1
3: 3 PR PR 1
4: 4 CR SD 0
This should work:
dat <- data.frame(id, reader, response)
dat %>%
select(-reader) %>%
group_by(id) %>%
mutate(obs = seq_along(id)) %>%
pivot_wider(names_from="obs", values_from="response", names_prefix="read") %>%
mutate(match_flag = as.numeric(read1 == read2))
# # A tibble: 4 x 4
# # Groups: id [4]
# id read1 read2 match_flag
# <dbl> <chr> <chr> <dbl>
# 1 1 CR PR 0
# 2 2 SD SD 1
# 3 3 PR PR 1
# 4 4 CR SD 0
A slight change from #DaveArmstrong's solution is also by creating the row sequence with rowid (from data.table, and then pivot to wide format and create the new column by using a relational operator and coerce to binary with +
library(dplyr)
library(tidyr)
library(data.table)
dat %>%
transmute(id, obs = rowid(id), response) %>%
pivot_wider(names_from = obs,values_from = response, names_prefix = 'read') %>%
mutate(match_flag = +(read1 == read2))
# A tibble: 4 x 4
# id read1 read2 match_flag
# <dbl> <chr> <chr> <int>
#1 1 CR PR 0
#2 2 SD SD 1
#3 3 PR PR 1
#4 4 CR SD 0

How to check whether a particular flag is present for a group in r?

I have the following dataframe, I'm not sure how I would do it in dplyr.
df_test <- data.frame(group=c("A","A","B","C","C"), var=c("1","1","1","2","3"),var2=c("a","a","b","c","c"),flag=c("1","0","0","1","0"))
I want to know for each the variable group and var whether there was a flag and the distinct value of var 2 it corresponds to?
df_result<-data.frame(group=c("A","B","C","C"),var=c("1","1","2","3"),flag_yes=c("1","0","1","0"),var2_distinct=c("a","na","c","na"))
We can group by 'group', 'var', check any 'flag' is 1 and paste the 'var2' elements that corresponds to 'flag' 1 and change the "" to NA with na_if
library(dplyr)
df_test %>%
group_by(group, var) %>%
summarise(flag_yes = +(any(flag == "1")),
var2_distinct = na_if(toString(var2[flag == "1"]), ""))
# A tibble: 4 x 4
# Groups: group [3]
# group var flag_yes var2_distinct
# <fct> <fct> <int> <chr>
#1 A 1 1 a
#2 B 1 0 <NA>
#3 C 2 1 c
#4 C 3 0 <NA>
If we need the number of distinct elements as well
df_test %>%
group_by(group, var) %>%
summarise(flag_yes = +(any(flag == "1")),
var2_distinct = na_if(toString(var2[flag == "1"]), ""),
num_distinct = n_distinct(var2[flag == "1"]))
# A tibble: 4 x 5
# Groups: group [3]
# group var flag_yes var2_distinct num_distinct
# <fct> <fct> <int> <chr> <int>
#1 A 1 1 a 1
#2 B 1 0 <NA> 0
#3 C 2 1 c 1
#4 C 3 0 <NA> 0
Or using data.table
library(data.table)
setDT(df_test)[, .(flag_yes = +(any(flag == "1")),
var2_dstinct = na_if(toString(var2[flag == "1"]), "")), .(group, var)]
# group var flag_yes var2_dstinct
#1: A 1 1 a
#2: B 1 0 <NA>
#3: C 2 1 c
#4: C 3 0 <NA>
We can group_by group and var, create a flag_yes if there is any flag as 1 and get the corresponding var2 value.
library(dplyr)
df_test %>%
group_by(group, var) %>%
summarise(flag_yes = as.integer(any(flag == 1)),
var2_distinct = toString(var2[flag == 1]))
# group var flag_yes var2_distinct
# <fct> <fct> <int> <chr>
#1 A 1 1 "a"
#2 B 1 0 ""
#3 C 2 1 "c"
#4 C 3 0 ""
Base R Solution:
data.frame(do.call("rbind", lapply(split(df_test, paste0(df_test$group, df_test$var)),
function(x){
x$flag_yes <- ifelse(sum(as.integer(x$flag)) >= 1, 1, 0)
x$var2_distinct <- as.character(ifelse(x$flag == 1, as.character(x$var2), ""))
return(x)
}
)
),
row.names = NULL
)

R dplyr::Filter dataframe by group and numeric vector?

I have dataframe df1 containing data and groups, and df2 which stores the same groups, and one value per group.
I want to filter rows of df1 by df2 where lag by group is higher than indicated value.
Dummy example:
# identify the first year of disturbance by lag by group
df1 <- data.frame(year = c(1:4, 1:4),
mort = c(5,16,40,4,5,6,10,108),
distance = rep(c("a", "b"), each = 4))
df2 = data.frame(distance = c("a", "b"),
my.median = c(12,1))
Now calculate the lag between values (creates new column) and filter df1 based on column values of df2:
# calculate lag between years
df1 %>%
group_by(distance) %>%
dplyr::mutate(yearLag = mort - lag(mort, default = 0)) %>%
filter(yearLag > df2$my.median) ##
This however does not produce expected results:
# A tibble: 3 x 4
# Groups: distance [2]
year mort distance yearLag
<int> <dbl> <fct> <dbl>
1 2 16 a 11
2 3 40 a 24
3 4 108 b 98
Instead, I expect to get:
# A tibble: 3 x 4
# Groups: distance [2]
year mort distance yearLag
<int> <dbl> <fct> <dbl>
1 3 40 a 24
2 1 5 b 5
3 3 10 b 4
The filter works great while applied to single value, but how to adapt it to vector, and especially vector of groups (as the order of elements can potentially change?)
Is this what you're trying to do?
df1 %>%
group_by(distance) %>%
dplyr::mutate(yearLag = mort - lag(mort, default = 0)) %>%
left_join(df2) %>%
filter(yearLag > my.median)
Result:
# A tibble: 4 x 5
# Groups: distance [2]
year mort distance yearLag my.median
<int> <dbl> <fct> <dbl> <dbl>
1 3 40 a 24 12
2 1 5 b 5 1
3 3 10 b 4 1
4 4 108 b 98 1
here is a data.table approach
library( data.table )
#creatae data.tables
setDT(df1);setDT(df2)
#create yearLag variable
df1[, yearLag := mort - shift( mort, type = "lag", fill = 0 ), by = .(distance) ]
#update join and filter wanted rows
df1[ df2, median.value := i.my.median, on = .(distance)][ yearLag > median.value, ][]
# year mort distance yearLag median.value
# 1: 3 40 a 24 12
# 2: 1 5 b 5 1
# 3: 3 10 b 4 1
# 4: 4 108 b 98 1
Came to the same conclusion. You should left_join the data frames.
df1 %>% left_join(df2, by="distance") %>%
group_by(distance) %>%
dplyr::mutate(yearLag = mort - lag(mort, default = 0)) %>%
filter(yearLag > my.median)
# A tibble: 4 x 5
# Groups: distance [2]
year mort distance my.median yearLag
<int> <dbl> <fct> <dbl> <dbl>
1 3 40 a 12 24
2 1 5 b 1 5
3 3 10 b 1 4
4 4 108 b 1 98

Removing mirrored combinations of variables in a data frame

I'm looking to get each unique combination of two variables:
library(purrr)
cross_df(list(id1 = seq_len(3), id2 = seq_len(3)), .filter = `==`)
# A tibble: 6 x 2
id1 id2
<int> <int>
1 2 1
2 3 1
3 1 2
4 3 2
5 1 3
6 2 3
How do I remove out the mirrored combinations? That is, I want only one of rows 1 and 3 in the data frame above, only one of rows 2 and 5, and only one of rows 4 and 6. My desired output would be something like:
# A tibble: 3 x 2
id1 id2
<int> <int>
1 2 1
2 3 1
3 3 2
I don't care if a particular id value is in id1 or id2, so the below is just as acceptable as the output:
# A tibble: 3 x 2
id1 id2
<int> <int>
1 1 2
2 1 3
3 2 3
A tidyverse version of Dan's answer:
cross_df(list(id1 = seq_len(3), id2 = seq_len(3)), .filter = `==`) %>%
mutate(min = pmap_int(., min), max = pmap_int(., max)) %>% # Find the min and max in each row
unite(check, c(min, max), remove = FALSE) %>% # Combine them in a "check" variable
distinct(check, .keep_all = TRUE) %>% # Remove duplicates of the "check" variable
select(id1, id2)
# A tibble: 3 x 2
id1 id2
<int> <int>
1 2 1
2 3 1
3 3 2
A Base R approach:
# create a string with the sorted elements of the row
df$temp <- apply(df, 1, function(x) paste(sort(x), collapse=""))
# then you can simply keep rows with a unique sorted-string value
df[!duplicated(df$temp), 1:2]

Resources