Filter by group and conditions

Filter by group and conditions - r

I have this type of data, where Sequis a grouping variable:
df <- data.frame(
Sequ = c(1,1,1,
2,2,2,
3,3,
4,4),
Answerer = c("A", NA, NA, "A", NA, NA, "B", NA, "C", NA),
PP_by = c(rep("A",5), rep("B",5)),
pp = c(0.1,0.2,0.3, 1, NA, NA, NA, NA, NA, NA)
)
I need to remove any Sequ where
(i) Answerer == PP_by AND
(ii) there is any NA in pp
I've tried this, but it obviously implements just the first condition (i):
library(dplyr)
df %>%
group_by(Sequ) %>%
filter(
all(!is.na(pp))
)
The expected result is:
Sequ Answerer PP_by pp
1 1 A A 0.1
2 1 <NA> A 0.2
3 1 <NA> A 0.3
9 4 C B NA
10 4 <NA> B NA
EDIT:
I've come up with this solution:
df %>%
group_by(Sequ) %>%
filter(
first(Answerer) != first(PP_by)
|
all(!is.na(pp))
)

Here's another way:
df %>%
group_by(Sequ) %>%
filter(!(
any(Answerer == PP_by, na.rm = TRUE) &
any(is.na(pp))
))
# # A tibble: 5 × 4
# # Groups: Sequ [2]
# Sequ Answerer PP_by pp
# <dbl> <chr> <chr> <dbl>
# 1 1 A A 0.1
# 2 1 NA A 0.2
# 3 1 NA A 0.3
# 4 4 C B NA
# 5 4 NA B NA

Related

Join similar observations within a data.frame with R

I want to mix several observations in a data.frame using as a reference one constantly repeated variable.
Example:
id var1 var2 var3
a 1 na na
a na 2 na
a na na 3
b 1 na
b na 2 na
b na na na
c na na 3
c na 2 na
c 1 na na
Expected result:
id var1 var2 var3
a 1 2 3
b 1 2 na
c 1 2 3

A possible solution (replacing "na" by NA with na_if):
library(tidyverse)
df %>%
na_if("na") %>%
group_by(id) %>%
summarize(across(var1:var3, ~ sort(.x)[1]))
#> # A tibble: 3 × 4
#> id var1 var2 var3
#> <chr> <chr> <chr> <chr>
#> 1 a 1 2 3
#> 2 b 1 2 <NA>
#> 3 c 1 2 3

Assumptions:
"na" above is really R's native NA (not a string);
b's first row, var2 should be NA instead of an empty string ""
perhaps from the above, var1:var3 should be numbers
either you will never have a group where there is more than one non-NA in a group/column, or you don't care about anything other than the first and want the remaining discarded
library(dplyr)
dat %>%
group_by(id) %>%
summarize(across(everything(), ~ na.omit(.)[1]))
# # A tibble: 3 x 4
# id var1 var2 var3
# <chr> <int> <int> <int>
# 1 a 1 2 3
# 2 b 1 2 NA
# 3 c 1 2 3
Data
dat <- structure(list(id = c("a", "a", "a", "b", "b", "b", "c", "c", "c"), var1 = c(1L, NA, NA, 1L, NA, NA, NA, NA, 1L), var2 = c(NA, 2L, NA, NA, 2L, NA, NA, 2L, NA), var3 = c(NA, NA, 3L, NA, NA, NA, 3L, NA, NA)), class = "data.frame", row.names = c(NA, -9L))

Assuming that your data has NA, you can use the following base R option using the Data from #r2evans (thanks!):
aggregate(.~id, dat, mean, na.rm = TRUE, na.action=NULL)
Output:
id var1 var2 var3
1 a 1 2 3
2 b 1 2 NaN
3 c 1 2 3

how to recode dummy column with the column name?

I have the input dataset, and I'm looking for generating the output dataset by recoding 1 as the name of the columns and 0 as NA. I managed to do it manually see Not optional solution below. But I have a dataset with hundreds of columns, so I'm looking for a way to automatize this process.
Packages
library(tibble)
library(dplyr)
Input
input <- tibble( a = c(1, 0, 0, 1, 0),
b = c(0, 0, 0, 1, 1),
c = c(1, 1, 1, 1, 1),
d = c(0, 0, 0, 0, 0))
# # A tibble: 5 × 4
# a b c d
# <dbl> <dbl> <dbl> <dbl>
# 1 1 0 1 0
# 2 0 0 1 0
# 3 0 0 1 0
# 4 1 1 1 0
# 5 0 1 1 0
Output
output <- tibble( a = c("a", NA, NA, "a", NA),
b = c(NA, NA, NA, "b", NA),
c = c("c", "c", "c", "c", "c"),
d = c(NA, NA, NA, NA, NA))
# # A tibble: 5 × 4
# a b c d
# <chr> <chr> <chr> <lgl>
# 1 a NA c NA
# 2 NA NA c NA
# 3 NA NA c NA
# 4 a b c NA
# 5 NA NA c NA
Not optional solution
input %>%
mutate(a = case_when(a == 1 ~ "a",
T ~ NA_character_),
b = case_when(b == 1 ~ "b",
T ~ NA_character_),
c = case_when(c == 1 ~ "c",
T ~ NA_character_),
d = case_when(d == 1 ~ "d",
T ~ NA_character_))

We could use across with an ifelse statement:
library(dplyr)
input %>%
mutate(across(everything(), ~ifelse(. == 1, cur_column(), NA)))
a b c d
<chr> <chr> <chr> <lgl>
1 a NA c NA
2 NA NA c NA
3 NA NA c NA
4 a b c NA
5 NA b c NA

Extract non-missing elements by rows and stack them

I have a data frame like this
df <- data.frame(id = 1:4,
V1 = c("A", NA, "C", NA),
V2 = c(NA, NA, NA, "E"),
V3 = c(NA, "B", NA, "F"),
V4 = c(NA, NA, "D", NA), stringsAsFactors = F)
# id V1 V2 V3 V4
# 1 1 A <NA> <NA> <NA>
# 2 2 <NA> <NA> B <NA>
# 3 3 C <NA> <NA> D
# 4 4 <NA> E F <NA>
How can I extract non-missing elements by rows and stack them into a column? My expected output is:
# id value
# 1 1 A
# 2 2 B
# 3 3 C
# 4 3 D
# 5 4 E
# 6 4 F

Try pivot_longer() or unite() + separate_rows().
library(tidyr)
library(dplyr)
# Method 1
df %>%
pivot_longer(-id, values_drop_na = T) %>%
select(-name)
# Method 2
df %>%
unite(value, -id, na.rm = T) %>%
separate_rows(value)
# # A tibble: 6 x 2
# id value
# <int> <chr>
# 1 1 A
# 2 2 B
# 3 3 C
# 4 3 D
# 5 4 E
# 6 4 F

You can use dplyr and tidyr:
df %>%
tidyr::gather(-id, key = "key", value = "value") %>%
dplyr::filter(!is.na(value))
id key value
1 1 V1 A
2 3 V1 C
3 4 V2 E
4 2 V3 B
5 4 V3 F
6 3 V4 D

One base R solution could be:
na.omit(data.frame(df[1], stack(df[-1])[1]))
id values
1 1 A
3 3 C
8 4 E
10 2 B
12 4 F
15 3 D

How about combining complete.cases with reshape library?
library(reshape2)
df.temp <- melt(df, id.vars = "id")
df.temp[complete.cases(df.temp),-2]
results in
id value
1 1 A
3 3 C
8 4 E
10 2 B
12 4 F
15 3 D

pivot_longer then filter
library(tidyverse)
df <- data.frame(id = 1:4,
V1 = c("A", NA, "C", NA),
V2 = c(NA, NA, NA, "E"),
V3 = c(NA, "B", NA, "F"),
V4 = c(NA, NA, "D", NA), stringsAsFactors = FALSE)
df %>% pivot_longer(-id, names_to = "name", values_to = "value") %>%
filter(!is.na(value)) %>%
select(-name)
#> # A tibble: 6 x 2
#> id value
#> <int> <chr>
#> 1 1 A
#> 2 2 B
#> 3 3 C
#> 4 3 D
#> 5 4 E
#> 6 4 F
Created on 2020-03-02 by the reprex package (v0.3.0)

Merge multiple rows based on single column value

I am having following table:
A B C D E
1 NA we are here
1 hi we NA here
1 NA NA are there
2 u NA are where
I want my output table to be:
A B C D E
1 hi we are here
2 u NA are where
I have tried the following :
my_fun <- function(x) x[!is.na(x)]
buildingCopy %>%
group_by(A) %>%
summarise_all(funs(my_fun))
it gives error:
Error in summarise_impl(.data, dots) : Column E must be length 1
(a summary value), not 3
Can anyone help me to achieve required data frame.

You can modify your function in the following way:
my_fun <- function(x) {
if_else(any(!is.na(x)), na.exclude(x)[1], NA_character_)
}
First it checks whether there are any non-missing values and returns the first non-missing value and NA otherwise.
If you only use the funciton once you could also do:
buildingCopy %>%
group_by(A) %>%
summarise_all(funs(if_else(any(!is.na(.)), na.exclude(.)[1], NA_character_)))
# A tibble: 2 x 5
# A B C D E
# <dbl> <chr> <chr> <chr> <chr>
# 1 1 hi we are here
# 2 2 u NA are where
Or you can use the condition in a summarise_if-statement:
buildingCopy %>%
add_row(A = 2, B = "u", C = NA_character_, D = "are", E = "where") %>%
group_by(A) %>%
summarise_if(funs(any(!is.na(.))), funs(na.exclude(.)[1]))
# A tibble: 2 x 5
# A B C D E
# <dbl> <chr> <chr> <chr> <chr>
# 1 1 hi we are here
# 2 2 u NA are where
Data
buildingCopy <- structure(list(A = c(1L, 1L, 1L, 2L),
B = c(NA, "hi", NA, "u"),
C = c("we", "we", NA, NA),
D = c("are", NA, "are", "are"),
E = c("here", "here", "there", "where")),
class = "data.frame", row.names = c(NA, -4L))

The base R function na.omit() can be used here
library(dplyr)
my_fun <- function(x) na.omit(x) %>% first()
buildingCopy %>%
group_by(A) %>%
summarise_all(funs(my_fun))
# A tibble: 2 x 5
A B C D E
<int> <chr> <chr> <chr> <chr>
1 1 hi we are here
2 2 u NA are where
Data
buildingCopy <- readr::read_table(
"A B C D E
1 NA we are here
1 hi we NA here
1 NA NA are there
2 u NA are where")

Last observation carried forward conditional on multiple columns

I have a dataset with this structure:
ID = c(1,1,1,1,2,2,2,3,3,3,3)
L40 = c(1, NA, NA, NA, 1, NA, NA, NA, 1, NA, NA)
K50 = c(NA, NA, NA, NA, NA, 1, NA, NA, NA, NA, 1)
df = data.frame(ID, L40, K50)
# ID L40 K50
# 1 1 1 NA
# 2 1 NA NA
# 3 1 NA NA
# 4 1 NA NA
# 5 2 1 NA
# 6 2 NA 1
# 7 2 NA NA
# 8 3 NA NA
# 9 3 1 NA
# 10 3 NA NA
# 11 3 NA 1
When missing values occur in columns L40 and K50, I want to carry forward the last non-missing value in that column, conditional on ID being the same as the previous ID and the values in L40 and K50 in the current row being empty. I applied the following code:
library(tidyr)
df2 <- df %>% group_by(ID) %>% fill(L40:K50)
This does not achieve what I am looking for. I want the previous non-missing value to be carried forward into the next row only when the other columns (except ID) in that row are empty. This is what I want:
ID = c(1,1,1,1,2,2,2,3,3,3,3)
L40 = c(1, 1, 1, 1, 1, NA, NA, NA, 1, 1, NA)
K50 = c(NA, NA, NA, NA, NA, 1, 1, NA, NA, NA, 1)
df3 = data.frame(ID, L40, K50)
df3
# ID L40 K50
# 1 1 1 NA
# 2 1 1 NA
# 3 1 1 NA
# 4 1 1 NA
# 5 2 1 NA
# 6 2 NA 1
# 7 2 NA 1
# 8 3 NA NA
# 9 3 1 NA
# 10 3 1 NA
# 11 3 NA 1

We can use na.locf
library(data.table)
library(zoo)
setDT(df)[, if(any(is.na(K50[-1]))) lapply(.SD, na.locf) else .SD , by = ID]
# ID L40 K50
#1: 1 1 NA
#2: 1 1 NA
#3: 1 1 NA
#4: 1 1 NA
#5: 2 1 NA
#6: 2 NA 1
#7: 3 NA 1
#8: 3 NA 1
#9: 3 NA 1
An option using dplyr would be
library(dplyr)
df %>%
mutate(ind = rowSums(is.na(.))) %>%
group_by(ID) %>%
mutate_each(funs(if(any(ind>1)) na.locf(., na.rm=FALSE) else .), L40:K50) %>%
select(-ind)
# ID L40 K50
# <dbl> <dbl> <dbl>
#1 1 1 NA
#2 1 1 NA
#3 1 1 NA
#4 1 1 NA
#5 2 1 NA
#6 2 NA 1
#7 3 NA 1
#8 3 NA 1
#9 3 NA 1

I played around with this question for a while, and with my limited knowledge of R I came up with the following work-around. I have added a date column to the original data frame for purpose of illustration:
ID = c(1,1,1,1,2,2,2,3,3,3,3)
date = c(1,2,3,4,1,2,3,1,2,3,4)
L40 = c(1, 1, NA, NA, 1, NA, NA, NA, 1, NA, NA)
K50 = c(NA, 1, 1, NA, NA, 1, NA, NA, NA, NA, 1)
df = data.frame(ID, date, L40, K50)
Here is what I did:
#gather the diagnosis columns in rows and keep only those rows where the patient has the associated diagnosis.
df1 <- df %>% gather(diagnos, dummy, L40:K50) %>% filter(dummy==1) %>% arrange(ID, date)
#concatenate across rows by ID and date to collect all diagnoses of an ID at a particular date.
df2 <- df1 %>% group_by(ID, date) %>% mutate(diag = paste(diagnos, collapse=" ")) %>% select(-diagnos, -dummy)
#convert into data tables in preparation for join
Dt1 <- data.table(df)
Dt2 <- data.table(df2)
setkey(Dt1, ID, date)
setkey(Dt2, ID, date)
#Each observation in Dt1 is matched with the observation in Dt1 with the same date or, if that particular date is not present,
#by the nearest previous date:
final <- Dt2[Dt1, roll=TRUE] %>% distinct()
This carries forward the name(s) of the diagnosis until the next observed diagnosis.