Retain rows only if non-missing in a specific column - R

My data frame has columns and rows like this:
Id Date   Col1 Col2 Col3 X1
1  1/1/22 NA   1    0    NA
1  1/1/22 0    0    1    6
2  5/7/21 0    1    0    NA
2  5/7/21 0    2    0    NA
I would like to drop a row within a duplicate pair (same Id, same Date) when its value in column X1 is missing or empty. If both rows are missing X1 for that Id and Date, then don't drop either. Only when one is missing and the other is not should the missing row be dropped.
Expected output:
Id Date   Col1 Col2 Col3 X1
1  1/1/22 0    0    1    6
2  5/7/21 0    1    0    NA
2  5/7/21 0    2    0    NA
I tried this:
library(dplyr)
library(tidyr)
df %>%
  group_by(Id, Date) %>%
  drop_na(X1)
This drops all rows with NA or missing values, and I am left with just one row, which is not what I want. Any suggestions much appreciated. Thanks.

We can create a condition in filter() to return all the rows if there are only missing values in 'X1', or otherwise remove the rows where 'X1' is missing:
library(dplyr)
df %>%
  group_by(Id, Date) %>%
  filter(if (all(is.na(X1))) TRUE else complete.cases(X1)) %>%
  ungroup()
Output:
# A tibble: 3 × 6
Id Date Col1 Col2 Col3 X1
<int> <chr> <int> <int> <int> <int>
1 1 1/1/22 0 0 1 6
2 2 5/7/21 0 1 0 NA
3 2 5/7/21 0 2 0 NA
Or, without the if/else, combine the & and | conditions:
df %>%
  group_by(Id, Date) %>%
  filter((any(complete.cases(X1)) & complete.cases(X1)) |
           all(is.na(X1))) %>%
  ungroup()
data
df <- structure(list(Id = c(1L, 1L, 2L, 2L), Date = c("1/1/22", "1/1/22",
"5/7/21", "5/7/21"), Col1 = c(NA, 0L, 0L, 0L), Col2 = c(1L, 0L,
1L, 2L), Col3 = c(0L, 1L, 0L, 0L), X1 = c(NA, 6L, NA, NA)),
class = "data.frame", row.names = c(NA,
-4L))
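For comparison, here is a base R sketch of the same keep/drop rule (my addition, not part of the original answer; it assumes the df defined above): keep a row when its X1 is non-missing, or when every X1 in its Id/Date group is missing.
# Compute a logical keep flag per Id/Date group with ave()
keep <- with(df, ave(is.na(X1), Id, Date,
                     FUN = function(x) !x | all(x)))
df[keep, ]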

Related

Calculate difference between index date and date with first indicator

Suppose I have the following data frame with an index date and follow-up dates, where a "1" marks a stop indicator. I want to put the date difference in days into the stop-indicator row, and if no stop indicator is present, put the number of days from the index date to the last observation into the last row:
id date group indicator
1 15-01-2022 1 0
1 15-01-2022 2 0
1 16-01-2022 2 1
1 20-01-2022 2 0
2 18-01-2022 1 0
2 20-01-2022 2 0
2 27-01-2022 2 0
Want:
id date group indicator stoptime
1 15-01-2022 1 0 NA
1 15-01-2022 2 0 NA
1 16-01-2022 2 1 1
1 20-01-2022 2 0 NA
2 18-01-2022 1 0 NA
2 20-01-2022 2 0 NA
2 27-01-2022 2 0 9
Convert the 'date' to Date class, then, grouped by 'id', find the position of 1 in 'indicator' (if not found, use the last position, n()), and get the difference of 'date' in days from the first row to that position:
library(dplyr)
library(lubridate)
df1 %>%
  mutate(date = dmy(date)) %>%
  group_by(id) %>%
  mutate(ind = match(1, indicator, nomatch = n()),
         stoptime = case_when(row_number() == ind ~
           as.integer(difftime(date[ind], first(date), units = "days"))),
         ind = NULL) %>%
  ungroup()
Output:
# A tibble: 7 × 5
id date group indicator stoptime
<int> <date> <int> <int> <int>
1 1 2022-01-15 1 0 NA
2 1 2022-01-15 2 0 NA
3 1 2022-01-16 2 1 1
4 1 2022-01-20 2 0 NA
5 2 2022-01-18 1 0 NA
6 2 2022-01-20 2 0 NA
7 2 2022-01-27 2 0 9
data
df1 <- structure(list(id = c(1L, 1L, 1L, 1L, 2L, 2L, 2L), date = c("15-01-2022",
"15-01-2022", "16-01-2022", "20-01-2022", "18-01-2022", "20-01-2022",
"27-01-2022"), group = c(1L, 2L, 2L, 2L, 1L, 2L, 2L), indicator = c(0L,
0L, 1L, 0L, 0L, 0L, 0L)), class = "data.frame",
row.names = c(NA,
-7L))
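An equivalent data.table sketch (my addition, not part of the original answer; it assumes df1 as defined above) follows the same match()/date-difference idea:
library(data.table)
dt <- as.data.table(df1)
dt[, date := as.Date(date, format = "%d-%m-%Y")]
dt[, stoptime := {
  # position of the stop row, falling back to the last row of the group
  ind <- match(1, indicator, nomatch = .N)
  out <- rep(NA_integer_, .N)
  out[ind] <- as.integer(date[ind] - date[1])
  out
}, by = id]
dt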

R: How to identify first date of positive observation by ID for multiple columns?

I would like to identify the first date of a positive observation by ID for multiple columns.
Example data frame:
ID date Observ1 Observ2 Observ3
1 1 1 0 0
1 2 0 1 0
1 3 1 0 1
2 1 1 1 0
Desired result:
ID FirstObserv1 FirstObserv2 FirstObserv3
1 1 2 3
2 1 1 NA
For a single column of observations, I can solve it with dplyr:
df %>%
  group_by(ID) %>%
  filter(Observ1 > 0) %>%
  summarize(FirstObserv1 = min(date)) %>%
  as.data.frame()
I have no idea how to do it for multiple columns at once, though.
Try reshaping your data using tidyverse functions. The key of the code is filtering the values equal to 1 and then extracting the minimum date value using filter(). After that you reshape to wide format and get the expected output. Here is the code:
library(tidyverse)
# Code
dfnew <- df %>%
  pivot_longer(-c(ID, date)) %>%
  group_by(ID) %>%
  filter(value == 1) %>%
  select(-value) %>%
  ungroup() %>%
  group_by(ID, name) %>%
  filter(date == min(date)) %>%
  pivot_wider(names_from = name, values_from = date)
Output:
# A tibble: 2 x 4
# Groups: ID [2]
ID Observ1 Observ2 Observ3
<int> <int> <int> <int>
1 1 1 2 3
2 2 1 1 NA
Some data used:
#Data
df <- structure(list(ID = c(1L, 1L, 1L, 2L), date = c(1L, 2L, 3L, 1L
), Observ1 = c(1L, 0L, 1L, 1L), Observ2 = c(0L, 1L, 0L, 1L),
Observ3 = c(0L, 0L, 1L, 0L)), class = "data.frame", row.names = c(NA,
-4L))
Here's a method which just replaces each observation with the date if the observation is positive and NA otherwise. Taking the min of each observation column then yields the desired results.
df %>%
  mutate_at(vars(starts_with("Observ")), ~ ifelse(. > 0, date, NA)) %>%
  group_by(ID) %>%
  summarise_at(vars(starts_with("Observ")), min, na.rm = TRUE)
#> # A tibble: 2 x 4
#> ID Observ1 Observ2 Observ3
#> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 2 3
#> 2 2 1 1 Inf
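If NA is preferred over the Inf produced when a group has no positive observation, one possible tweak (my addition, not part of the original answer) is to blank out infinite results afterwards:
df %>%
  mutate_at(vars(starts_with("Observ")), ~ ifelse(. > 0, date, NA)) %>%
  group_by(ID) %>%
  summarise_at(vars(starts_with("Observ")), min, na.rm = TRUE) %>%
  mutate_at(vars(starts_with("Observ")), ~ replace(., is.infinite(.), NA))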
Another alternative:
df %>%
  group_by(ID) %>%
  summarise(across(
    -date,
    list(First = ~ {x <- which(. > 0); if (length(x) > 0L) date[[x[[1L]]]] else NA_real_}),
    .names = "{.fn}{.col}"
  ))
Output
ID FirstObserv1 FirstObserv2 FirstObserv3
<dbl> <dbl> <dbl> <dbl>
1 1 1 2 3
2 2 1 1 NA
We can use data.table:
library(data.table)
setDT(df)[, lapply(.SD, function(x) which(x > 0)[1]),
          ID, .SDcols = patterns('^Observ')]
# ID Observ1 Observ2 Observ3
#1: 1 1 2 3
#2: 2 1 1 NA
Or using tidyverse:
library(dplyr)
df %>%
  group_by(ID) %>%
  summarise(across(starts_with('Obser'), ~ which(. > 0)[1],
                   .names = 'First{col}'), .groups = 'drop')
# A tibble: 2 x 4
# ID FirstObserv1 FirstObserv2 FirstObserv3
# <int> <int> <int> <int>
#1 1 1 2 3
#2 2 1 1 NA
data
df <- structure(list(ID = c(1L, 1L, 1L, 2L), date = c(1L, 2L, 3L, 1L
), Observ1 = c(1L, 0L, 1L, 1L), Observ2 = c(0L, 1L, 0L, 1L),
Observ3 = c(0L, 0L, 1L, 0L)), class = "data.frame", row.names = c(NA,
-4L))

How can I make a column with some indicators?

I have 4 indicator columns and an ID column. How can I make a new column with respect to these indicators, like this:
ID. col1. col2. col3. col4.
1 1 0 0 0
2 0 0 0 1
3 0 0 1 0
In each row exactly one of the columns is 1 and the others are 0. The new column is 1 if col1 is 1, 2 if col2 is 1, 3 if col3 is 1 and 4 if col4 is 1.
So the output is:
ID. col.
1 1
2 4
3 3
An option is max.col
cbind(df1[1], `col.` = max.col(df1[-1], "first"))
# ID. col.
#1 1 1
#2 2 4
#3 3 3
If there are no 1s in a row, create a logical condition to return NA for that row:
df1[2, 5] <- 0
cbind(df1[1], `col.` = max.col(df1[-1], "first") * NA^!rowSums(df1[-1] == 1))
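For reference (my note, not from the original answer), the NA^ trick works because NA^TRUE is NA while NA^FALSE is 1, so the multiplier blanks out only the rows that contain no 1:
NA^c(TRUE, FALSE)
#> [1] NA  1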
data
df1 <- structure(list(ID. = 1:3, col1. = c(1L, 0L, 0L), col2. = c(0L,
0L, 0L), col3. = c(0L, 0L, 1L), col4. = c(0L, 1L, 0L)),
class = "data.frame", row.names = c(NA,
-3L))
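Another possible route (my addition, assuming the df1 defined just above, without the row-2 modification) is to reshape to long form and take the position of the first 1 within each row:
library(dplyr)
library(tidyr)
df1 %>%
  pivot_longer(-ID., names_to = "col", values_to = "val") %>%
  group_by(ID.) %>%
  summarise(col. = which(val == 1)[1], .groups = "drop")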

Subset data from the last time a condition is met till the last row of a data frame - applied to each subject

I have a data frame like this:
ID TIME AMT CONC
1 0 10 2
1 1 0 1
1 5 20 15
1 10 0 30
1 12 0 16
I want to subset the data for each subject ID, from the last time when AMT > 0 until the last row of the data frame for that individual.
The output should be this:
ID TIME AMT CONC
1 5 20 15
1 10 0 30
1 12 0 16
I am using RStudio.
We can use slice and create a sequence between the max index where AMT > 0 and the last index for each ID.
library(dplyr)
df %>%
  group_by(ID) %>%
  slice(max(which(AMT > 0)) : n())
# ID TIME AMT CONC
# <int> <int> <int> <int>
#1 1 5 20 15
#2 1 10 0 30
#3 1 12 0 16
We can use filter:
library(dplyr)
df %>%
  group_by(ID) %>%
  mutate(ind = cumsum(AMT > 0)) %>%
  filter(ind == max(ind), ind > 0) %>%
  select(-ind)
# A tibble: 3 x 4
# Groups: ID [1]
# ID TIME AMT CONC
# <int> <int> <int> <int>
#1 1 5 20 15
#2 1 10 0 30
#3 1 12 0 16
NOTE: This also works well when all the elements of 'AMT' are 0 for a particular group:
df$ID[4:5] <- 2
df$AMT <- 0
df$AMT[4:5] <- c(1, 0)
Or another option with fewer steps:
df %>%
  group_by(ID) %>%
  filter(row_number() >= which.max(cumsum(AMT > 0)))
data
df <- structure(list(ID = c(1L, 1L, 1L, 1L, 1L), TIME = c(0L, 1L, 5L,
10L, 12L), AMT = c(10L, 0L, 20L, 0L, 0L), CONC = c(2L, 1L, 15L,
30L, 16L)), class = "data.frame", row.names = c(NA, -5L))
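A data.table sketch of the same slicing (my addition, assuming the df defined above; like the slice() approach, it expects at least one AMT > 0 row per ID):
library(data.table)
setDT(df)[, .SD[max(which(AMT > 0)):.N], by = ID]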

Grouping by a column and counting number of positive and negative values corresponding to each value in R

I want to get the counts of positive and negative values corresponding to each id after grouping by that column. My data looks like this:
dataset <- read.table(text =
"id value
1 4
1 -2
1 0
2 6
2 -4
2 -5
2 -1
3 0
3 0
3 -4
3 -5",
header = TRUE, stringsAsFactors = FALSE)
I want my result to look like this:
id num_pos_value num_neg_value num_zero_value
1 1 1 1
2 1 3 0
3 0 2 2
I want to extend the columns of the above result by adding the sums of the positive and negative values:
id num_pos num_neg num_zero sum_pos sum_neg
1 1 1 1 4 -2
2 1 3 0 6 -10
3 0 2 2 0 -9
We create a group by 'id' and calculate the sum of a logical vector:
library(dplyr)
df1 %>%
  group_by(id) %>%
  summarise(num_pos = sum(value > 0),
            num_neg = sum(value < 0),
            num_zero = sum(value == 0))
# A tibble: 3 x 4
# id num_pos num_neg num_zero
# <int> <int> <int> <int>
#1 1 1 1 1
#2 2 1 3 0
#3 3 0 2 2
Or get the table of the sign of 'value' and spread it to 'wide' format (note the labels must follow the factor level order -1, 0, 1, i.e. neg, zero, pos):
library(tidyr)
df1 %>%
  group_by(id) %>%
  summarise(num = list(table(factor(sign(value), levels = -1:1)))) %>%
  unnest %>%
  mutate(grp = rep(paste0("num", c("neg", "zero", "pos")), 3)) %>%
  spread(grp, num)
Or using count:
df1 %>%
  count(id, val = sign(value)) %>%
  spread(val, n, fill = 0)
data
df1 <- structure(list(id = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L,
3L), value = c(4L, -2L, 0L, 6L, -4L, -5L, -1L, 0L, 0L, -4L, -5L
)), class = "data.frame", row.names = c(NA, -11L))
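To also get the sums asked for at the end of the question (sum_pos, sum_neg), one way (my addition, assuming df1 as defined above) is to extend the first summarise():
library(dplyr)
df1 %>%
  group_by(id) %>%
  summarise(num_pos  = sum(value > 0),
            num_neg  = sum(value < 0),
            num_zero = sum(value == 0),
            sum_pos  = sum(value[value > 0]),
            sum_neg  = sum(value[value < 0]),
            .groups = "drop")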
