Impute missing values with a value from previous month (if exists) - r

I have a dataframe with more than 100 000 rows and 30 000 unique ids.
My aim is to fill all the NAs among the different columns if there is a value from the previous month and the same id. However, most of the times the previous recorded value is from more than a month ago. Those NAs I would like to leave untouched.
The id column and the date column do not have NAs.
Here is an example of the data I have:
df3
id oxygen gluco dias bp date
1 0,25897842 0,20201604 0,17955655 0,14100962 31.7.2019
2 NA NA 0,38582622 0,12918231 31.12.2014
2 0,35817147 0,32943499 NA 0,43667462 30.11.2018
2 0,68557053 0,42898807 0,93897514 NA 31.10.2018
2 NA NA 0,99899076 0,44168223 31.7.2018
2 0,43848054 0,38604586 NA NA 30.4.2013
2 0,15823254 0,06216771 0,07829624 0,69755251 31.1.2016
2 NA NA 0,61645303 NA 29.2.2016
2 0,94671363 0,50682091 0,96770222 0,97403356 31.5.2018
3 NA 0,77352235 0,660479 0,11554399 30.4.2019
3 0,15567703 NA 0,4553325 NA 31.3.2017
3 NA NA 0,22181609 0,08527658 30.9.2017
3 0,93660763 NA NA NA 31.3.2018
3 0,73416759 NA NA 0,78501791 30.11.2018
3 NA NA NA NA 28.2.2019
3 0,84525106 0,54360374 NA 0,40595426 31.8.2014
3 0,76221263 0,62983336 0,84592719 0,10640734 31.8.2013
4 NA 0,29108942 0,3863479 NA 31.1.2018
4 0,74075742 NA 0,38117415 0,58849266 30.11.2018
4 0,09400641 0,68860814 NA 0,88895224 31.8.2014
4 0,72202944 0,49901387 0,19967415 NA 31.8.2018
4 0,98205262 0,85213969 0,34450998 0,98962306 30.11.2013
This is the last code implementation that I have tried:
´´´
df3 %>%
group_by(id) %>%
mutate_all(funs(na.locf(., na.rm = FALSE, maxgap = 30)))
´´´
But apparently "mutate_all() ignored the following grouping variables:
Column id"

You can use the tidyverse for that. Here's an approach:
Change the date column to class Date, then order by date
Prepare the dates and remove the days in Ym
get the time difference in mo
flag the rows which have max one month difference
get groups by cumsum the inverse logic in flag
fill the rows from the same groups
library(dplyr)
library(tidyr)
library(lubridate)
df$date <- as.Date(df$date, format="%d.%m.%Y")
df %>%
arrange(date) %>%
mutate(
Ym = ym(strftime(date, "%Y-%m")),
mo = interval(Ym, lag(Ym, default=as.Date("1970-01-01"))) / months(1),
flag = cumsum(!(mo > -2 & mo < 1))) %>%
group_by(id, flag) %>%
fill(names(.), .direction="down") %>%
ungroup() %>%
select(-c("Ym","mo","flag")) %>%
print(n=nrow(.))
Output
# A tibble: 22 × 6
id oxygen gluco dias bp date
<int> <chr> <chr> <chr> <chr> <date>
1 2 0,43848054 0,38604586 NA NA 2013-04-30
2 3 0,76221263 0,62983336 0,84592719 0,10640734 2013-08-31
3 4 0,98205262 0,85213969 0,34450998 0,98962306 2013-11-30
4 3 0,84525106 0,54360374 NA 0,40595426 2014-08-31
5 4 0,09400641 0,68860814 NA 0,88895224 2014-08-31
6 2 NA NA 0,38582622 0,12918231 2014-12-31
7 2 0,15823254 0,06216771 0,07829624 0,69755251 2016-01-31
8 2 0,15823254 0,06216771 0,61645303 0,69755251 2016-02-29
9 3 0,15567703 NA 0,4553325 NA 2017-03-31
10 3 NA NA 0,22181609 0,08527658 2017-09-30
11 4 NA 0,29108942 0,3863479 NA 2018-01-31
12 3 0,93660763 NA NA NA 2018-03-31
13 2 0,94671363 0,50682091 0,96770222 0,97403356 2018-05-31
14 2 NA NA 0,99899076 0,44168223 2018-07-31
15 4 0,72202944 0,49901387 0,19967415 NA 2018-08-31
16 2 0,68557053 0,42898807 0,93897514 NA 2018-10-31
17 2 0,35817147 0,32943499 0,93897514 0,43667462 2018-11-30
18 3 0,73416759 NA NA 0,78501791 2018-11-30
19 4 0,74075742 NA 0,38117415 0,58849266 2018-11-30
20 3 NA NA NA NA 2019-02-28
21 3 NA 0,77352235 0,660479 0,11554399 2019-04-30
22 1 0,25897842 0,20201604 0,17955655 0,14100962 2019-07-31
Data
df <- structure(list(id = c(1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L), oxygen = c("0,25897842",
NA, "0,35817147", "0,68557053", NA, "0,43848054", "0,15823254",
NA, "0,94671363", NA, "0,15567703", NA, "0,93660763", "0,73416759",
NA, "0,84525106", "0,76221263", NA, "0,74075742", "0,09400641",
"0,72202944", "0,98205262"), gluco = c("0,20201604", NA, "0,32943499",
"0,42898807", NA, "0,38604586", "0,06216771", NA, "0,50682091",
"0,77352235", NA, NA, NA, NA, NA, "0,54360374", "0,62983336",
"0,29108942", NA, "0,68860814", "0,49901387", "0,85213969"),
dias = c("0,17955655", "0,38582622", NA, "0,93897514", "0,99899076",
NA, "0,07829624", "0,61645303", "0,96770222", "0,660479",
"0,4553325", "0,22181609", NA, NA, NA, NA, "0,84592719",
"0,3863479", "0,38117415", NA, "0,19967415", "0,34450998"
), bp = c("0,14100962", "0,12918231", "0,43667462", NA, "0,44168223",
NA, "0,69755251", NA, "0,97403356", "0,11554399", NA, "0,08527658",
NA, "0,78501791", NA, "0,40595426", "0,10640734", NA, "0,58849266",
"0,88895224", NA, "0,98962306"), date = structure(c(18108,
16435, 17865, 17835, 17743, 15825, 16831, 16860, 17682, 18016,
17256, 17439, 17621, 17865, 17955, 16313, 15948, 17562, 17865,
16313, 17774, 16039), class = "Date")), row.names = c(NA,
-22L), class = "data.frame")

Related

Sort rows grouped by grep alphabetically

I have a dataframe with a row full of adverse events but also relationships of these adverse events to the procedure, like this:
df <- data.frame(
adverse_event = c(
"Haemorrhage", "related", "likely related",
"Other", "related", "likely related", "Pain", "related", "likely related",
"Subcapsular hematoma", "related", "likely related", "Ascites",
"related", "likely related", "Hyperbilirubinemia", "related",
"likely related", "Liver abscess", "related", "likely related",
"Pleural effusion with drainage", "related", "likely related",
"Pneumothorax", "related", "likely related", "Biliary leakage / occlusion / fistula",
"related", "likely related", "Portal vein thrombosis", "related",
"likely related", "Sepsis", "related", "likely related"
),
grade_1 = c(
4L, 4L, 0L, 3L, 6L, 1L, 8L, 4L, 5L, 3L, 1L, 3L, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA
),
grade_2 = c(
2L, 3L, 0L, 11L, 3L, 7L, 2L, 4L, 2L, 1L, 2L, 0L, 1L, 1L, 0L,
1L, 0L, 2L, 1L, 1L, 0L, 1L, 2L, 1L, 1L, 1L, 0L, NA, NA, NA, NA,
NA, NA, NA, NA, NA
),
grade_3 = c(
1L, 4L, 1L, 5L, 3L, 2L, 2L, 5L, 1L, NA, NA, NA, NA, NA, NA,
NA, NA, NA, 4L, 5L, 1L, NA, NA, NA, 1L, 1L, 0L, 1L, 2L, 0L, 1L,
1L, 0L, 1L, 1L, 0L
),
grade_4 = c(
2L, 4L, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA
)
)
Now I'd like to sort the adverse events alphabetically but of course take the "related", "likely related" rows with the individual adverse event rows, so I'd like to somehow group them first.
In this example it's always 3 rows, but let's assume it could be sometimes 2, 4 or 5 rows too (all except the adverse event rows containing "related" in the string/name though e.g. 'unlikely related').
I know, I can get the indices of the adverse event rows by
grep('related', df$adverse_event, invert = T) but I'm unsure how to use this to group the rows together before sorting them.
Edit: Beginning of the left column of the desired output:
expected_output_left_column <- data.frame(adverse_event = c(
"Ascites", "related", "likely related",
"Biliary leakage / occlusion / fistula", "related", "likely related" ) )
Thank you!
Another solution using base r and lead function from dplyr
# where start each group
id <- grep('related', df$adverse_event, invert = T)
# size of each group
size <- lead(id) - id
size_of_last_group <- nrow(df) - id[length(id)] + 1
size[length(size)] <- size_of_last_group
# add col with id
df$id <- paste0(rep(df$adverse_event[id], times = size),
df$adverse_event)
# order
df <- df[order(df$id), ]
# remove id
df$id <- NULL
You can do the following:
library(dplyr)
left_join(
df,
df %>%
filter(!grepl('related',adverse_event)) %>%
select(adverse_event) %>%
arrange(adverse_event) %>%
mutate(o = row_number())
) %>%
mutate(o = data.table::nafill(o, "locf")) %>%
arrange(o) %>%
select(-o)
Output:
adverse_event grade_1 grade_2 grade_3 grade_4
1 Ascites NA 1 NA NA
2 related NA 1 NA NA
3 likely related NA 0 NA NA
4 Biliary leakage / occlusion / fistula NA NA 1 NA
5 related NA NA 2 NA
6 likely related NA NA 0 NA
7 Haemorrhage 4 2 1 2
8 related 4 3 4 4
9 likely related 0 0 1 1
10 Hyperbilirubinemia NA 1 NA NA
11 related NA 0 NA NA
12 likely related NA 2 NA NA
13 Liver abscess NA 1 4 NA
14 related NA 1 5 NA
15 likely related NA 0 1 NA
16 Other 3 11 5 NA
17 related 6 3 3 NA
18 likely related 1 7 2 NA
19 Pain 8 2 2 NA
20 related 4 4 5 NA
21 likely related 5 2 1 NA
22 Pleural effusion with drainage NA 1 NA NA
23 related NA 2 NA NA
24 likely related NA 1 NA NA
25 Pneumothorax NA 1 1 NA
26 related NA 1 1 NA
27 likely related NA 0 0 NA
28 Portal vein thrombosis NA NA 1 NA
29 related NA NA 1 NA
30 likely related NA NA 0 NA
31 Sepsis NA NA 1 NA
32 related NA NA 1 NA
33 likely related NA NA 0 NA
34 Subcapsular hematoma 3 1 NA NA
35 related 1 2 NA NA
36 likely related 3 0 NA NA
Note that this uses data.table::nafill().. A full data.table solution is as below:
library(data.table)
setDT(df)
data.table(adverse_event = sort(df[!grepl('related',adverse_event), adverse_event]))[, o:=.I][
df, on="adverse_event"][, o:=nafill(o, "locf")][order(o), !c("o")]
Add a "group" variable and sort
tmp=!grepl("related",df$adverse_event)
df$grp=cumsum(tmp)
df[order(match(df$grp,order(df$adverse_event[tmp]))),]
adverse_event grade_1 grade_2 grade_3 grade_4 grp
13 Ascites NA 1 NA NA 5
14 related NA 1 NA NA 5
15 likely related NA 0 NA NA 5
28 Biliary leakage / occlusion / fistula NA NA 1 NA 10
29 related NA NA 2 NA 10
30 likely related NA NA 0 NA 10
1 Haemorrhage 4 2 1 2 1
2 related 4 3 4 4 1
3 likely related 0 0 1 1 1
16 Hyperbilirubinemia NA 1 NA NA 6
17 related NA 0 NA NA 6
18 likely related NA 2 NA NA 6
19 Liver abscess NA 1 4 NA 7
20 related NA 1 5 NA 7
21 likely related NA 0 1 NA 7
4 Other 3 11 5 NA 2
5 related 6 3 3 NA 2
6 likely related 1 7 2 NA 2
7 Pain 8 2 2 NA 3
8 related 4 4 5 NA 3
9 likely related 5 2 1 NA 3
22 Pleural effusion with drainage NA 1 NA NA 8
23 related NA 2 NA NA 8
24 likely related NA 1 NA NA 8
25 Pneumothorax NA 1 1 NA 9
26 related NA 1 1 NA 9
27 likely related NA 0 0 NA 9
31 Portal vein thrombosis NA NA 1 NA 11
32 related NA NA 1 NA 11
33 likely related NA NA 0 NA 11
34 Sepsis NA NA 1 NA 12
35 related NA NA 1 NA 12
36 likely related NA NA 0 NA 12
10 Subcapsular hematoma 3 1 NA NA 4
11 related 1 2 NA NA 4
12 likely related 3 0 NA NA 4
Just to throw in another tidyverse solution:
library(tidyr)
library(dplyr)
df %>%
mutate(grp = if_else(grepl("related", adverse_event),
NA_character_,
adverse_event)) %>%
fill(grp) %>%
nest(data = -grp) %>%
arrange(grp) %>%
unnest(cols = data) %>%
select(-grp)
# # A tibble: 36 × 5
# adverse_event grade_1 grade_2 grade_3 grade_4
# <chr> <int> <int> <int> <int>
# 1 Ascites NA 1 NA NA
# 2 related NA 1 NA NA
# 3 likely related NA 0 NA NA
# 4 Biliary leakage / occlusion / fistula NA NA 1 NA
# 5 related NA NA 2 NA
# 6 likely related NA NA 0 NA
# 7 Haemorrhage 4 2 1 2
# 8 related 4 3 4 4
# 9 likely related 0 0 1 1
# 10 Hyperbilirubinemia NA 1 NA NA
# 11 related NA 0 NA NA
# 12 likely related NA 2 NA NA
# 13 Liver abscess NA 1 4 NA
# 14 related NA 1 5 NA
# 15 likely related NA 0 1 NA
# 16 Other 3 11 5 NA
# 17 related 6 3 3 NA
# 18 likely related 1 7 2 NA
# 19 Pain 8 2 2 NA
# 20 related 4 4 5 NA
# 21 likely related 5 2 1 NA
# 22 Pleural effusion with drainage NA 1 NA NA
# 23 related NA 2 NA NA
# 24 likely related NA 1 NA NA
# 25 Pneumothorax NA 1 1 NA
# 26 related NA 1 1 NA
# 27 likely related NA 0 0 NA
# 28 Portal vein thrombosis NA NA 1 NA
# 29 related NA NA 1 NA
# 30 likely related NA NA 0 NA
# 31 Sepsis NA NA 1 NA
# 32 related NA NA 1 NA
# 33 likely related NA NA 0 NA
# 34 Subcapsular hematoma 3 1 NA NA
# 35 related 1 2 NA NA
# 36 likely related 3 0 NA NA
Explanation
mutate + fill: Label each adverse_event with the stem, i.e. re-label all related records with the corresponding event above.
Nest all columns, but keep the newly created grp column, which bears the name of the stem adverse event.
Sort the adverse event stems.
Unnest the rows again.
Remove the grp column.
An approach using rank. Using an extended data set with 4 entries for "Ascites".
library(dplyr)
df %>%
mutate(ord = !grepl("related", adverse_event),
grp = cumsum(ord),
Rank = rank(adverse_event[ord])[grp]) %>%
arrange(Rank) %>%
select(-c(ord, grp, Rank))
adverse_event grade_1 grade_2 grade_3 grade_4
1 Ascites NA 1 NA NA
2 related NA 1 NA NA
3 related NA 1 NA NA
4 likely related NA 0 NA NA
5 Biliary leakage / occlusion / fistula NA NA 1 NA
6 related NA NA 2 NA
7 likely related NA NA 0 NA
8 Haemorrhage 4 2 1 2
9 related 4 3 4 4
10 likely related 0 0 1 1
11 Hyperbilirubinemia NA 1 NA NA
12 related NA 0 NA NA
13 likely related NA 2 NA NA
14 Liver abscess NA 1 4 NA
15 related NA 1 5 NA
16 likely related NA 0 1 NA
17 Other 3 11 5 NA
18 related 6 3 3 NA
19 likely related 1 7 2 NA
20 Pain 8 2 2 NA
21 related 4 4 5 NA
22 likely related 5 2 1 NA
23 Pleural effusion with drainage NA 1 NA NA
24 related NA 2 NA NA
25 likely related NA 1 NA NA
26 Pneumothorax NA 1 1 NA
27 related NA 1 1 NA
28 likely related NA 0 0 NA
29 Portal vein thrombosis NA NA 1 NA
30 related NA NA 1 NA
31 likely related NA NA 0 NA
32 Sepsis NA NA 1 NA
33 related NA NA 1 NA
34 likely related NA NA 0 NA
35 Subcapsular hematoma 3 1 NA NA
36 related 1 2 NA NA
37 likely related 3 0 NA NA
extended data
df <- structure(list(adverse_event = c("Haemorrhage", "related", "likely related",
"Other", "related", "likely related", "Pain", "related", "likely related",
"Subcapsular hematoma", "related", "likely related", "Ascites",
"related", "related", "likely related", "Hyperbilirubinemia",
"related", "likely related", "Liver abscess", "related", "likely related",
"Pleural effusion with drainage", "related", "likely related",
"Pneumothorax", "related", "likely related", "Biliary leakage / occlusion / fistula",
"related", "likely related", "Portal vein thrombosis", "related",
"likely related", "Sepsis", "related", "likely related"), grade_1 = c(4L,
4L, 0L, 3L, 6L, 1L, 8L, 4L, 5L, 3L, 1L, 3L, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA), grade_2 = c(2L, 3L, 0L, 11L, 3L, 7L, 2L, 4L,
2L, 1L, 2L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 2L, 1L, 1L, 0L, 1L, 2L,
1L, 1L, 1L, 0L, NA, NA, NA, NA, NA, NA, NA, NA, NA), grade_3 = c(1L,
4L, 1L, 5L, 3L, 2L, 2L, 5L, 1L, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, 4L, 5L, 1L, NA, NA, NA, 1L, 1L, 0L, 1L, 2L, 0L, 1L, 1L,
0L, 1L, 1L, 0L), grade_4 = c(2L, 4L, 1L, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), row.names = c(NA,
37L), class = "data.frame")
Here is a benchmark of the different suggestions if needed :
library(bench)
library(dplyr)
library(data.table)
library(tidyr)
df <- data.frame(
adverse_event = c(
"Haemorrhage", "related", "likely related",
"Other", "related", "likely related", "Pain", "related", "likely related",
"Subcapsular hematoma", "related", "likely related", "Ascites",
"related", "likely related", "Hyperbilirubinemia", "related",
"likely related", "Liver abscess", "related", "likely related",
"Pleural effusion with drainage", "related", "likely related",
"Pneumothorax", "related", "likely related", "Biliary leakage / occlusion / fistula",
"related", "likely related", "Portal vein thrombosis", "related",
"likely related", "Sepsis", "related", "likely related"
),
grade_1 = c(
4L, 4L, 0L, 3L, 6L, 1L, 8L, 4L, 5L, 3L, 1L, 3L, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA
),
grade_2 = c(
2L, 3L, 0L, 11L, 3L, 7L, 2L, 4L, 2L, 1L, 2L, 0L, 1L, 1L, 0L,
1L, 0L, 2L, 1L, 1L, 0L, 1L, 2L, 1L, 1L, 1L, 0L, NA, NA, NA, NA,
NA, NA, NA, NA, NA
),
grade_3 = c(
1L, 4L, 1L, 5L, 3L, 2L, 2L, 5L, 1L, NA, NA, NA, NA, NA, NA,
NA, NA, NA, 4L, 5L, 1L, NA, NA, NA, 1L, 1L, 0L, 1L, 2L, 0L, 1L,
1L, 0L, 1L, 1L, 0L
),
grade_4 = c(
2L, 4L, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA
)
)
paul_carteron <- function(df){
# where start each group
id <- grep('related', df$adverse_event, invert = T)
# size of each group
size <- lead(id) - id
size_of_last_group <- nrow(df) - id[length(id)] + 1
size[length(size)] <- size_of_last_group
# add col with id
df$id <- paste0(rep(df$adverse_event[id], times = size),
df$adverse_event)
# order
df <- df[order(df$id), ]
# remove id
df$id <- NULL
}
lang_tang_dplyr <- function(df){
left_join(
df,
df %>%
filter(!grepl('related', adverse_event)) %>%
select(adverse_event) %>%
arrange(adverse_event) %>%
mutate(o = row_number())
) %>%
mutate(o = data.table::nafill(o, "locf")) %>%
arrange(o) %>%
select(-o)
}
lang_tang_databable <- function(df) {
setDT(df)
data.table(adverse_event = sort(df[!grepl('related',adverse_event), adverse_event]))[, o:=.I][
df, on="adverse_event"][, o:=nafill(o, "locf")][order(o), !c("o")]
}
andre_wilberg <- function(df){
df %>%
mutate(ord = !grepl("related", adverse_event),
grp = cumsum(ord),
Rank = rank(adverse_event[ord])[grp]) %>%
arrange(Rank) %>%
select(-c(ord, grp, Rank))
}
thotal <- function(df){
df %>%
mutate(grp = if_else(grepl("related", adverse_event),
NA_character_,
adverse_event)) %>%
fill(grp) %>%
nest(data = -grp) %>%
arrange(grp) %>%
unnest(cols = data) %>%
select(-grp)
}
results = bench::mark(
iterations = 1000, check = FALSE, time_unit = "s", filter_gc = FALSE,
paul_carteron = paul_carteron(df),
lang_tang_dplyr = lang_tang_dplyr(df),
lang_tang_databable = lang_tang_databable(df),
andre_wilberg = andre_wilberg(df),
thotal = thotal(df)
)
plot(results)

Delete/overwrite rows by partial matching

I need to check if rows are partially duplicated and delete/overwrite those where 2 columns match a different row where 3 values are present. one problem is, that the "real" dataframe contains a couple of list columns which makes some operations unfeasible. Best case would be if any row where a match can be found would be checked independently of column numbers - meaning only the row with the most columns having non NA values (out of all which include matching column values) is kept.
o1 o2 o3
1 1 NA NA
2 2 NA NA
3 3 NA NA
4 4 NA NA
5 6 NA NA
6 7 NA NA
7 5 9 NA # this row has only 2 values which match values from row 11 but the last value is na
8 10 NA NA
9 12 NA NA
10 13 NA NA
11 5 9 14 # this row has values in all 3 columns
12 14 NA NA
13 8 11 15 # so does this row
14 16 NA NA
15 17 NA NA
16 18 NA NA
17 19 NA NA
18 20 NA NA
The result should be the same data frame - just without row 7 or where row 7 is overwritten by row 11.
This should be easy to do but for some reason i didn't manage it (except with a convoluted for loop that is hard to generalize should more columns be added at a later time). Is there a straight forward way to do this?
dput of above df:
structure(list(o1 = c(1L, 2L, 3L, 4L, 6L, 7L, 5L, 10L, 12L, 13L,
5L, 14L, 8L, 16L, 17L, 18L, 19L, 20L), o2 = c(NA, NA, NA, NA,
NA, NA, 9L, NA, NA, NA, 9L, NA, 11L, NA, NA, NA, NA, NA), o3 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, 14L, NA, 15L, NA, NA, NA,
NA, NA)), row.names = c(NA, -18L), class = "data.frame")
If there is already an answer for something like this, please let me know.
I thought of using dplyr:
library(dplyr)
df %>%
mutate(rn = row_number(),
count_na = rowSums(across(o1:o3, is.na))) %>%
group_by(o1, o2) %>%
slice_min(count_na) %>%
arrange(rn) %>%
ungroup() %>%
select(o1:o3)
This returns
# A tibble: 17 x 3
o1 o2 o3
<int> <int> <int>
1 1 NA NA
2 2 NA NA
3 3 NA NA
4 4 NA NA
5 6 NA NA
6 7 NA NA
7 10 NA NA
8 12 NA NA
9 13 NA NA
10 5 9 14
11 14 NA NA
12 8 11 15
13 16 NA NA
14 17 NA NA
15 18 NA NA
16 19 NA NA
17 20 NA NA
This solution is based on the following ideas:
For every row we count the number of NAs in this row.
We group for o1 and o2 to create groups of data that belong together. Here is a possible flaw: perhaps it is a better approach to group by o1 only or do some other grouping. This depends on the structure of your data: should 1, <NA>, <NA> be overwritten by 1, 2, <NA>?
After grouping, we select the row with the smallest number of NAs.
Finally we do some clean up: removing the auxiliary columns, arranging the data and ungrouping.
A partial solution to detect the duplicates, it remains to specify which rows to delete, ran out of time. I've went ahead and "duplicated" a couple more rows.
df=read.table(text="
o1 o2 o3
1 1 NA NA
2 2 NA NA
3 3 NA NA
4 4 NA NA
5 6 NA NA
6 7 NA NA
7 5 9 NA
8 10 NA NA
9 12 NA NA
10 13 NA NA
11 5 9 14
12 14 NA NA
13 8 11 15
14 16 NA NA
15 7 1 2
16 18 NA NA
17 7 1 3
18 20 NA NA",h=T)
The main trick is to calculate a distance matrix and check which rows have a distance of zero, since dist will automatically estimate a pairwise distance, removing missing values.
tmp=as.matrix(dist(df))
diag(tmp)=NA
tmp[lower.tri(tmp)]=NA
tod=data.frame(which(tmp==0,arr.ind=T))
resulting in
row col
X7 7 11
X6 6 15
X6.1 6 17
Here's another way which considers all columns, should work with any number of columns and regardless of their names or positions
library(dplyr)
mydf <- structure(list(o1 = c(1L, 2L, 3L, 4L, 6L, 7L, 5L, 10L, 12L, 13L,
5L, 14L, 8L, 16L, 17L, 18L, 19L, 20L),
o2 = c(NA, NA, NA, NA,
NA, NA, 9L, NA, NA, NA, 9L, NA, 11L, NA, NA, NA, NA, NA),
o3 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, 14L, NA, 15L, NA, NA, NA,
NA, NA)),
row.names = c(NA, -18L),
class = "data.frame")
columns <- names(mydf)
dummy_cols <- paste0(columns, "_dummy")
mydf %>%
# duplicate the dataframe
cbind(mydf %>% `names<-`(dummy_cols)) %>%
# arrange across all columns
arrange(across(columns)) %>%
# fill NAs downwards
tidyr::fill(dummy_cols, .direction = "down") %>%
# create a dummy ID
tidyr::unite(id_dummy, dummy_cols, sep = "") %>%
# group by the id
group_by(id_dummy) %>%
# get the first row of each
filter(row_number()==1) %>%
ungroup() %>%
select(columns)
P.S. also replaces 1 - NA - NA with 1 - 2 - NA and replaces 1 - NA - NA with 1 - NA - 3

Tackling missing data in R

I have encountered this problem in a project that I'm currently doing.
I have a sparse dataframe and I need to calculate the difference between the first and the last observation per row under some conditions:
Conditions:
If the row only contains NA's then the difference is 0.
If the row contains only 1 observation then the difference is 0.
If row elements (>= 2) are non-NA's then their difference is the difference between the first and the last (tail - head).
The dataframe that I have:
S1 S2 S3 S4 S5
1 NA NA NA NA NA
2 NA 3 NA 5 NA
3 1 NA NA NA 5
4 1 NA 2 NA 7
5 2 NA NA NA NA
6 NA NA 3 4 NA
7 NA NA 3 NA NA
The dataframe that I need:
S1 S2 S3 S4 S5 diff
1 NA NA NA NA NA 0
2 NA 3 NA 5 NA 2
3 1 NA NA NA 5 4
4 1 NA 2 NA 7 6
5 2 NA NA NA NA 0
6 NA NA 3 4 NA 1
7 NA NA 3 NA NA 0
What I've written up till now:
last_minus_first <- function(x, y = na.omit(x)) tail(y, 1) - y[1]
But it doesn't resolve for the fact if the row contains all NA's.
Any help would be much appreciated.
I would suggest using a defined function with apply(). Here the code:
#Data
df <- structure(list(S1 = c(NA, NA, 1L, 1L, 2L, NA, NA), S2 = c(NA,
3L, NA, NA, NA, NA, NA), S3 = c(NA, NA, NA, 2L, NA, 3L, 3L),
S4 = c(NA, 5L, NA, NA, NA, 4L, NA), S5 = c(NA, NA, 5L, 7L,
NA, NA, NA)), class = "data.frame", row.names = c("1", "2",
"3", "4", "5", "6", "7"))
Code:
#Function
myown <- function(x)
{
#Check NA
i <- sum(!is.na(x))
#Compute
if(i<=1)
{
y <- 0
} else
{
#Detect positions
j1 <- max(which(!is.na(x)))
j2 <- min(which(!is.na(x)))
#Diff
y <- x[j1]-x[j2]
}
return(y)
}
#Apply function by row
df$NewVar <- apply(df,1,myown)
Output:
S1 S2 S3 S4 S5 NewVar
1 NA NA NA NA NA 0
2 NA 3 NA 5 NA 2
3 1 NA NA NA 5 4
4 1 NA 2 NA 7 6
5 2 NA NA NA NA 0
6 NA NA 3 4 NA 1
7 NA NA 3 NA NA 0
Here's an easier (in my mind) way to handle this, using rowwise from the dplyr package to do calculations along rows.
df %>%
dplyr::rowwise() %>%
dplyr::mutate(max_pop = max(which(!is.na(dplyr::c_across(S1:S5)))),
min_pop = min(which(!is.na(dplyr::c_across(S1:S5)))),
diff = tidyr::replace_na(dplyr::c_across()[max_pop] - dplyr::c_across()[min_pop], 0))
I've broken that mutate call down into the various parts to show what we're doing, but essentially, it goes across all columns in a row to find the last populated column (max_pop), the first populated column (min_pop) and then uses those values to retrieve the values therein.
You have to specify columns for max_pop and min_pop above because creating new interim columns affects the column indexing. c_across() defaults to using all columns, though, so you can actually do this all in one mutate call without specifying any columns.
df %>%
rowwise() %>%
mutate(diff = replace_na(c_across()[max(which(!is.na(c_across())))] - c_across()[min(which(!is.na(c_across())))], 0))
A vectorized option in base R would be to extract the values based on row/column index and then subtract
df1$NewVar <- df1[cbind(seq_len(nrow(df1)), max.col(!is.na(df1), 'last'))] -
df1[cbind(seq_len(nrow(df1)), max.col(!is.na(df1), 'first'))]
df1$NewVar[is.na(df1$NewVar)] <- 0
df1
# S1 S2 S3 S4 S5 NewVar
#1 NA NA NA NA NA 0
#2 NA 3 NA 5 NA 2
#3 1 NA NA NA 5 4
#4 1 NA 2 NA 7 6
#5 2 NA NA NA NA 0
#6 NA NA 3 4 NA 1
#7 NA NA 3 NA NA 0
data
df1 <- structure(list(S1 = c(NA, NA, 1L, 1L, 2L, NA, NA), S2 = c(NA,
3L, NA, NA, NA, NA, NA), S3 = c(NA, NA, NA, 2L, NA, 3L, 3L),
S4 = c(NA, 5L, NA, NA, NA, 4L, NA), S5 = c(NA, NA, 5L, 7L,
NA, NA, NA)), class = "data.frame", row.names = c("1", "2",
"3", "4", "5", "6", "7"))

Dividing data into quintiles using group_by

I am looking for a way to change my way in such a way that it sorts the data into quintiles instead of the top 5 and bottom 5. My current code looks like this:
CombData <- CombData %>%
group_by(Date) %>%
mutate(
R=min_rank(Value),
E_P = case_when(
R < 6 ~ "5w",
R > max(R, na.rm =TRUE) - 5 ~ "5b",
TRUE ~ NA_character_)
) %>%
ungroup() %>%
arrange(Date, E_P)
My dataset is quite large therefore I will just provide sample data. The data I use is more complex and the code should, therefore, allow for varying lengths of the column Date and also for multiple values that are missing (NAs):
df <- data.frame( Date = c(rep("2010-01-31",16), rep("2010-02-28", 14)), Value=c(rep(c(1,2,3,4,5,6,7,8,9,NA,NA,NA,NA,NA,15),2))
Afterward, I would also like to test the minimum size of quintiles i.e. how many data points are minimum in each quintile in the entire dataset.
The expected output would look like this:
structure(list(Date = structure(c(14640, 14640, 14640, 14640,
14640, 14640, 14640, 14640, 14640, 14640, 14640, 14640, 14640,
14640, 14640, 14640, 14668, 14668, 14668, 14668, 14668, 14668,
14668, 14668, 14668, 14668, 14668, 14668, 14668, 14668), class = "Date"),
Value = c(1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 15, NA, NA, NA, NA,
NA, 2, 3, 4, 5, 6, 7, 8, 9, 15, NA, NA, NA, NA, NA), R = c(1L,
1L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, NA, NA, NA, NA,
NA, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, NA, NA, NA, NA, NA
), S_P = c("Worst", "Worst", "Worst", NA, NA, NA, NA, "Best",
"Best", "Best", NA, NA, NA, NA, NA, NA, "Worst", "Worst", NA, NA,
NA, NA, NA, "Best", "Best", NA, NA, NA, NA, NA)), row.names = c(NA,
-30L), class = c("tbl_df", "tbl", "data.frame"))
Probably, you could use something like this with quantile :
library(dplyr)
out <- CombData %>%
group_by(Date) %>%
mutate(S_P = case_when(Value <= quantile(Value, 0.2, na.rm = TRUE) ~ 'Worst',
Value >= quantile(Value, 0.8, na.rm = TRUE) ~ 'Best'))
You could change the value of quantile according to your preference.
To get minimum number of "Best" and "Worst" we can do :
out %>%
count(Date, S_P) %>%
na.omit() %>%
ungroup() %>%
select(-Date) %>%
group_by(S_P) %>%
top_n(-1, n)
# S_P n
# <chr> <int>
#1 Best 2
#2 Worst 2
When I understand you correctly, you want to rank your column 'Value' and mark those with rank below the quantile 20% with "worst" and those above 80% with "best". After that you want a table.
You could use use ave for both, the ranking and the quantile identification. The quantile function yields three groups, that you can identify with findInterval, code as a factor variable and label them at will. I'm not sure, though, which ranks should be included in the quantiles, I therefore make the E_P coding in two separate columns for comparison purposes.
dat2 <- within(dat, {
R <- ave(Value, Date, FUN=function(x) rank(x, na.last="keep"))
E_P <- ave(R, Date, FUN=function(x) {
findInterval(x, quantile(R, c(.2, .8), na.rm=TRUE))
})
E_P.fac <- factor(E_P, labels=c("worst", NA, "best"))
})
dat2 <- dat2[order(dat2$Date, dat2$E_P), ] ## order by date and E_P
Yields:
dat2
# Date Value E_P.fac E_P R
# 1 2010-01-31 1 worst 0 1.5
# 16 2010-01-31 1 worst 0 1.5
# 2 2010-01-31 2 <NA> 1 3.0
# 3 2010-01-31 3 <NA> 1 4.0
# 4 2010-01-31 4 <NA> 1 5.0
# 5 2010-01-31 5 <NA> 1 6.0
# 6 2010-01-31 6 <NA> 1 7.0
# 7 2010-01-31 7 <NA> 1 8.0
# 8 2010-01-31 8 best 2 9.0
# 9 2010-01-31 9 best 2 10.0
# 15 2010-01-31 15 best 2 11.0
# 10 2010-01-31 NA <NA> NA NA
# 11 2010-01-31 NA <NA> NA NA
# 12 2010-01-31 NA <NA> NA NA
# 13 2010-01-31 NA <NA> NA NA
# 14 2010-01-31 NA <NA> NA NA
# 17 2010-02-28 2 worst 0 1.0
# 18 2010-02-28 3 worst 0 2.0
# 19 2010-02-28 4 <NA> 1 3.0
# 20 2010-02-28 5 <NA> 1 4.0
# 21 2010-02-28 6 <NA> 1 5.0
# 22 2010-02-28 7 <NA> 1 6.0
# 23 2010-02-28 8 <NA> 1 7.0
# 24 2010-02-28 9 <NA> 1 8.0
# 30 2010-02-28 15 best 2 9.0
# 25 2010-02-28 NA <NA> NA NA
# 26 2010-02-28 NA <NA> NA NA
# 27 2010-02-28 NA <NA> NA NA
# 28 2010-02-28 NA <NA> NA NA
# 29 2010-02-28 NA <NA> NA NA
When I check the quintiles of the Rank column, it appears to be right.
quantile(dat2$R, c(.2, .8), na.rm=TRUE)
# 20% 80%
# 2.8 8.2
After that you could just make a table to get the numbers of each category.
with(dat2, table(Date, E_P.fac))
# E_P.fac
# Date worst <NA> best
# 2010-01-31 2 6 3
# 2010-02-28 2 6 1
Data
dat <- structure(list(Date = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("2010-01-31", "2010-02-28"
), class = "factor"), Value = c(1, 2, 3, 4, 5, 6, 7, 8, 9, NA,
NA, NA, NA, NA, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, NA, NA, NA, NA,
NA, 15)), row.names = c(NA, -30L), class = "data.frame")

Remove NAs by ID

I am dealing with a dataset like this
Id Value Date
1 250 NA
1 250 2010-06-21
2 6 NA
2 6 2012-08-23
3 545 NA
7 3310 NA
My goal is to remove entire rows if there is an NA in Date column and ID is duplicate. The final output should look like:
Id Value Date
1 250 2010-06-21
2 6 2012-08-23
3 545 NA
7 3310 NA
df1[!(is.na(df1$Date) & duplicated(df1$Id) | duplicated(df1$Id, fromLast = TRUE)),]
# Id Value Date
#2 1 250 2010-06-21
#4 2 6 2012-08-23
#5 3 545 <NA>
#6 7 3310 <NA>
DATA
df1 = structure(list(Id = c(1L, 1L, 2L, 2L, 3L, 7L), Value = c(250L,
250L, 6L, 6L, 545L, 3310L), Date = c(NA, "2010-06-21", NA, "2012-08-23",
NA, NA)), .Names = c("Id", "Value", "Date"), class = "data.frame", row.names = c(NA,
-6L))

Resources