I have a dataframe with a row full of adverse events but also relationships of these adverse events to the procedure, like this:
df <- data.frame(
adverse_event = c(
"Haemorrhage", "related", "likely related",
"Other", "related", "likely related", "Pain", "related", "likely related",
"Subcapsular hematoma", "related", "likely related", "Ascites",
"related", "likely related", "Hyperbilirubinemia", "related",
"likely related", "Liver abscess", "related", "likely related",
"Pleural effusion with drainage", "related", "likely related",
"Pneumothorax", "related", "likely related", "Biliary leakage / occlusion / fistula",
"related", "likely related", "Portal vein thrombosis", "related",
"likely related", "Sepsis", "related", "likely related"
),
grade_1 = c(
4L, 4L, 0L, 3L, 6L, 1L, 8L, 4L, 5L, 3L, 1L, 3L, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA
),
grade_2 = c(
2L, 3L, 0L, 11L, 3L, 7L, 2L, 4L, 2L, 1L, 2L, 0L, 1L, 1L, 0L,
1L, 0L, 2L, 1L, 1L, 0L, 1L, 2L, 1L, 1L, 1L, 0L, NA, NA, NA, NA,
NA, NA, NA, NA, NA
),
grade_3 = c(
1L, 4L, 1L, 5L, 3L, 2L, 2L, 5L, 1L, NA, NA, NA, NA, NA, NA,
NA, NA, NA, 4L, 5L, 1L, NA, NA, NA, 1L, 1L, 0L, 1L, 2L, 0L, 1L,
1L, 0L, 1L, 1L, 0L
),
grade_4 = c(
2L, 4L, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA
)
)
Now I'd like to sort the adverse events alphabetically but of course take the "related", "likely related" rows with the individual adverse event rows, so I'd like to somehow group them first.
In this example it's always 3 rows, but let's assume it could be sometimes 2, 4 or 5 rows too (all except the adverse event rows containing "related" in the string/name though e.g. 'unlikely related').
I know, I can get the indices of the adverse event rows by
grep('related', df$adverse_event, invert = T) but I'm unsure how to use this to group the rows together before sorting them.
Edit: Beginning of the left column of the desired output:
expected_output_left_column <- data.frame(adverse_event = c(
"Ascites", "related", "likely related",
"Biliary leakage / occlusion / fistula", "related", "likely related" ) )
Thank you!
Another solution using base r and lead function from dplyr
# where start each group
id <- grep('related', df$adverse_event, invert = T)
# size of each group
size <- lead(id) - id
size_of_last_group <- nrow(df) - id[length(id)] + 1
size[length(size)] <- size_of_last_group
# add col with id
df$id <- paste0(rep(df$adverse_event[id], times = size),
df$adverse_event)
# order
df <- df[order(df$id), ]
# remove id
df$id <- NULL
You can do the following:
library(dplyr)
left_join(
df,
df %>%
filter(!grepl('related',adverse_event)) %>%
select(adverse_event) %>%
arrange(adverse_event) %>%
mutate(o = row_number())
) %>%
mutate(o = data.table::nafill(o, "locf")) %>%
arrange(o) %>%
select(-o)
Output:
adverse_event grade_1 grade_2 grade_3 grade_4
1 Ascites NA 1 NA NA
2 related NA 1 NA NA
3 likely related NA 0 NA NA
4 Biliary leakage / occlusion / fistula NA NA 1 NA
5 related NA NA 2 NA
6 likely related NA NA 0 NA
7 Haemorrhage 4 2 1 2
8 related 4 3 4 4
9 likely related 0 0 1 1
10 Hyperbilirubinemia NA 1 NA NA
11 related NA 0 NA NA
12 likely related NA 2 NA NA
13 Liver abscess NA 1 4 NA
14 related NA 1 5 NA
15 likely related NA 0 1 NA
16 Other 3 11 5 NA
17 related 6 3 3 NA
18 likely related 1 7 2 NA
19 Pain 8 2 2 NA
20 related 4 4 5 NA
21 likely related 5 2 1 NA
22 Pleural effusion with drainage NA 1 NA NA
23 related NA 2 NA NA
24 likely related NA 1 NA NA
25 Pneumothorax NA 1 1 NA
26 related NA 1 1 NA
27 likely related NA 0 0 NA
28 Portal vein thrombosis NA NA 1 NA
29 related NA NA 1 NA
30 likely related NA NA 0 NA
31 Sepsis NA NA 1 NA
32 related NA NA 1 NA
33 likely related NA NA 0 NA
34 Subcapsular hematoma 3 1 NA NA
35 related 1 2 NA NA
36 likely related 3 0 NA NA
Note that this uses data.table::nafill().. A full data.table solution is as below:
library(data.table)
setDT(df)
data.table(adverse_event = sort(df[!grepl('related',adverse_event), adverse_event]))[, o:=.I][
df, on="adverse_event"][, o:=nafill(o, "locf")][order(o), !c("o")]
Add a "group" variable and sort
tmp=!grepl("related",df$adverse_event)
df$grp=cumsum(tmp)
df[order(match(df$grp,order(df$adverse_event[tmp]))),]
adverse_event grade_1 grade_2 grade_3 grade_4 grp
13 Ascites NA 1 NA NA 5
14 related NA 1 NA NA 5
15 likely related NA 0 NA NA 5
28 Biliary leakage / occlusion / fistula NA NA 1 NA 10
29 related NA NA 2 NA 10
30 likely related NA NA 0 NA 10
1 Haemorrhage 4 2 1 2 1
2 related 4 3 4 4 1
3 likely related 0 0 1 1 1
16 Hyperbilirubinemia NA 1 NA NA 6
17 related NA 0 NA NA 6
18 likely related NA 2 NA NA 6
19 Liver abscess NA 1 4 NA 7
20 related NA 1 5 NA 7
21 likely related NA 0 1 NA 7
4 Other 3 11 5 NA 2
5 related 6 3 3 NA 2
6 likely related 1 7 2 NA 2
7 Pain 8 2 2 NA 3
8 related 4 4 5 NA 3
9 likely related 5 2 1 NA 3
22 Pleural effusion with drainage NA 1 NA NA 8
23 related NA 2 NA NA 8
24 likely related NA 1 NA NA 8
25 Pneumothorax NA 1 1 NA 9
26 related NA 1 1 NA 9
27 likely related NA 0 0 NA 9
31 Portal vein thrombosis NA NA 1 NA 11
32 related NA NA 1 NA 11
33 likely related NA NA 0 NA 11
34 Sepsis NA NA 1 NA 12
35 related NA NA 1 NA 12
36 likely related NA NA 0 NA 12
10 Subcapsular hematoma 3 1 NA NA 4
11 related 1 2 NA NA 4
12 likely related 3 0 NA NA 4
Just to throw in another tidyverse solution:
library(tidyr)
library(dplyr)
df %>%
mutate(grp = if_else(grepl("related", adverse_event),
NA_character_,
adverse_event)) %>%
fill(grp) %>%
nest(data = -grp) %>%
arrange(grp) %>%
unnest(cols = data) %>%
select(-grp)
# # A tibble: 36 × 5
# adverse_event grade_1 grade_2 grade_3 grade_4
# <chr> <int> <int> <int> <int>
# 1 Ascites NA 1 NA NA
# 2 related NA 1 NA NA
# 3 likely related NA 0 NA NA
# 4 Biliary leakage / occlusion / fistula NA NA 1 NA
# 5 related NA NA 2 NA
# 6 likely related NA NA 0 NA
# 7 Haemorrhage 4 2 1 2
# 8 related 4 3 4 4
# 9 likely related 0 0 1 1
# 10 Hyperbilirubinemia NA 1 NA NA
# 11 related NA 0 NA NA
# 12 likely related NA 2 NA NA
# 13 Liver abscess NA 1 4 NA
# 14 related NA 1 5 NA
# 15 likely related NA 0 1 NA
# 16 Other 3 11 5 NA
# 17 related 6 3 3 NA
# 18 likely related 1 7 2 NA
# 19 Pain 8 2 2 NA
# 20 related 4 4 5 NA
# 21 likely related 5 2 1 NA
# 22 Pleural effusion with drainage NA 1 NA NA
# 23 related NA 2 NA NA
# 24 likely related NA 1 NA NA
# 25 Pneumothorax NA 1 1 NA
# 26 related NA 1 1 NA
# 27 likely related NA 0 0 NA
# 28 Portal vein thrombosis NA NA 1 NA
# 29 related NA NA 1 NA
# 30 likely related NA NA 0 NA
# 31 Sepsis NA NA 1 NA
# 32 related NA NA 1 NA
# 33 likely related NA NA 0 NA
# 34 Subcapsular hematoma 3 1 NA NA
# 35 related 1 2 NA NA
# 36 likely related 3 0 NA NA
Explanation
mutate + fill: Label each adverse_event with the stem, i.e. re-label all related records with the corresponding event above.
Nest all columns, but keep the newly created grp column, which bears the name of the stem adverse event.
Sort the adverse event stems.
Unnest the rows again.
Remove the grp column.
An approach using rank. Using an extended data set with 4 entries for "Ascites".
library(dplyr)
df %>%
mutate(ord = !grepl("related", adverse_event),
grp = cumsum(ord),
Rank = rank(adverse_event[ord])[grp]) %>%
arrange(Rank) %>%
select(-c(ord, grp, Rank))
adverse_event grade_1 grade_2 grade_3 grade_4
1 Ascites NA 1 NA NA
2 related NA 1 NA NA
3 related NA 1 NA NA
4 likely related NA 0 NA NA
5 Biliary leakage / occlusion / fistula NA NA 1 NA
6 related NA NA 2 NA
7 likely related NA NA 0 NA
8 Haemorrhage 4 2 1 2
9 related 4 3 4 4
10 likely related 0 0 1 1
11 Hyperbilirubinemia NA 1 NA NA
12 related NA 0 NA NA
13 likely related NA 2 NA NA
14 Liver abscess NA 1 4 NA
15 related NA 1 5 NA
16 likely related NA 0 1 NA
17 Other 3 11 5 NA
18 related 6 3 3 NA
19 likely related 1 7 2 NA
20 Pain 8 2 2 NA
21 related 4 4 5 NA
22 likely related 5 2 1 NA
23 Pleural effusion with drainage NA 1 NA NA
24 related NA 2 NA NA
25 likely related NA 1 NA NA
26 Pneumothorax NA 1 1 NA
27 related NA 1 1 NA
28 likely related NA 0 0 NA
29 Portal vein thrombosis NA NA 1 NA
30 related NA NA 1 NA
31 likely related NA NA 0 NA
32 Sepsis NA NA 1 NA
33 related NA NA 1 NA
34 likely related NA NA 0 NA
35 Subcapsular hematoma 3 1 NA NA
36 related 1 2 NA NA
37 likely related 3 0 NA NA
extended data
df <- structure(list(adverse_event = c("Haemorrhage", "related", "likely related",
"Other", "related", "likely related", "Pain", "related", "likely related",
"Subcapsular hematoma", "related", "likely related", "Ascites",
"related", "related", "likely related", "Hyperbilirubinemia",
"related", "likely related", "Liver abscess", "related", "likely related",
"Pleural effusion with drainage", "related", "likely related",
"Pneumothorax", "related", "likely related", "Biliary leakage / occlusion / fistula",
"related", "likely related", "Portal vein thrombosis", "related",
"likely related", "Sepsis", "related", "likely related"), grade_1 = c(4L,
4L, 0L, 3L, 6L, 1L, 8L, 4L, 5L, 3L, 1L, 3L, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA), grade_2 = c(2L, 3L, 0L, 11L, 3L, 7L, 2L, 4L,
2L, 1L, 2L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 2L, 1L, 1L, 0L, 1L, 2L,
1L, 1L, 1L, 0L, NA, NA, NA, NA, NA, NA, NA, NA, NA), grade_3 = c(1L,
4L, 1L, 5L, 3L, 2L, 2L, 5L, 1L, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, 4L, 5L, 1L, NA, NA, NA, 1L, 1L, 0L, 1L, 2L, 0L, 1L, 1L,
0L, 1L, 1L, 0L), grade_4 = c(2L, 4L, 1L, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), row.names = c(NA,
37L), class = "data.frame")
Here is a benchmark of the different suggestions if needed :
library(bench)
library(dplyr)
library(data.table)
library(tidyr)
df <- data.frame(
adverse_event = c(
"Haemorrhage", "related", "likely related",
"Other", "related", "likely related", "Pain", "related", "likely related",
"Subcapsular hematoma", "related", "likely related", "Ascites",
"related", "likely related", "Hyperbilirubinemia", "related",
"likely related", "Liver abscess", "related", "likely related",
"Pleural effusion with drainage", "related", "likely related",
"Pneumothorax", "related", "likely related", "Biliary leakage / occlusion / fistula",
"related", "likely related", "Portal vein thrombosis", "related",
"likely related", "Sepsis", "related", "likely related"
),
grade_1 = c(
4L, 4L, 0L, 3L, 6L, 1L, 8L, 4L, 5L, 3L, 1L, 3L, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA
),
grade_2 = c(
2L, 3L, 0L, 11L, 3L, 7L, 2L, 4L, 2L, 1L, 2L, 0L, 1L, 1L, 0L,
1L, 0L, 2L, 1L, 1L, 0L, 1L, 2L, 1L, 1L, 1L, 0L, NA, NA, NA, NA,
NA, NA, NA, NA, NA
),
grade_3 = c(
1L, 4L, 1L, 5L, 3L, 2L, 2L, 5L, 1L, NA, NA, NA, NA, NA, NA,
NA, NA, NA, 4L, 5L, 1L, NA, NA, NA, 1L, 1L, 0L, 1L, 2L, 0L, 1L,
1L, 0L, 1L, 1L, 0L
),
grade_4 = c(
2L, 4L, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA
)
)
paul_carteron <- function(df){
# where start each group
id <- grep('related', df$adverse_event, invert = T)
# size of each group
size <- lead(id) - id
size_of_last_group <- nrow(df) - id[length(id)] + 1
size[length(size)] <- size_of_last_group
# add col with id
df$id <- paste0(rep(df$adverse_event[id], times = size),
df$adverse_event)
# order
df <- df[order(df$id), ]
# remove id
df$id <- NULL
}
lang_tang_dplyr <- function(df){
left_join(
df,
df %>%
filter(!grepl('related', adverse_event)) %>%
select(adverse_event) %>%
arrange(adverse_event) %>%
mutate(o = row_number())
) %>%
mutate(o = data.table::nafill(o, "locf")) %>%
arrange(o) %>%
select(-o)
}
lang_tang_databable <- function(df) {
setDT(df)
data.table(adverse_event = sort(df[!grepl('related',adverse_event), adverse_event]))[, o:=.I][
df, on="adverse_event"][, o:=nafill(o, "locf")][order(o), !c("o")]
}
andre_wilberg <- function(df){
df %>%
mutate(ord = !grepl("related", adverse_event),
grp = cumsum(ord),
Rank = rank(adverse_event[ord])[grp]) %>%
arrange(Rank) %>%
select(-c(ord, grp, Rank))
}
thotal <- function(df){
df %>%
mutate(grp = if_else(grepl("related", adverse_event),
NA_character_,
adverse_event)) %>%
fill(grp) %>%
nest(data = -grp) %>%
arrange(grp) %>%
unnest(cols = data) %>%
select(-grp)
}
results = bench::mark(
iterations = 1000, check = FALSE, time_unit = "s", filter_gc = FALSE,
paul_carteron = paul_carteron(df),
lang_tang_dplyr = lang_tang_dplyr(df),
lang_tang_databable = lang_tang_databable(df),
andre_wilberg = andre_wilberg(df),
thotal = thotal(df)
)
plot(results)
I need to check if rows are partially duplicated and delete/overwrite those where 2 columns match a different row where 3 values are present. one problem is, that the "real" dataframe contains a couple of list columns which makes some operations unfeasible. Best case would be if any row where a match can be found would be checked independently of column numbers - meaning only the row with the most columns having non NA values (out of all which include matching column values) is kept.
o1 o2 o3
1 1 NA NA
2 2 NA NA
3 3 NA NA
4 4 NA NA
5 6 NA NA
6 7 NA NA
7 5 9 NA # this row has only 2 values which match values from row 11 but the last value is na
8 10 NA NA
9 12 NA NA
10 13 NA NA
11 5 9 14 # this row has values in all 3 columns
12 14 NA NA
13 8 11 15 # so does this row
14 16 NA NA
15 17 NA NA
16 18 NA NA
17 19 NA NA
18 20 NA NA
The result should be the same data frame - just without row 7 or where row 7 is overwritten by row 11.
This should be easy to do but for some reason i didn't manage it (except with a convoluted for loop that is hard to generalize should more columns be added at a later time). Is there a straight forward way to do this?
dput of above df:
structure(list(o1 = c(1L, 2L, 3L, 4L, 6L, 7L, 5L, 10L, 12L, 13L,
5L, 14L, 8L, 16L, 17L, 18L, 19L, 20L), o2 = c(NA, NA, NA, NA,
NA, NA, 9L, NA, NA, NA, 9L, NA, 11L, NA, NA, NA, NA, NA), o3 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, 14L, NA, 15L, NA, NA, NA,
NA, NA)), row.names = c(NA, -18L), class = "data.frame")
If there is already an answer for something like this, please let me know.
I thought of using dplyr:
library(dplyr)
df %>%
mutate(rn = row_number(),
count_na = rowSums(across(o1:o3, is.na))) %>%
group_by(o1, o2) %>%
slice_min(count_na) %>%
arrange(rn) %>%
ungroup() %>%
select(o1:o3)
This returns
# A tibble: 17 x 3
o1 o2 o3
<int> <int> <int>
1 1 NA NA
2 2 NA NA
3 3 NA NA
4 4 NA NA
5 6 NA NA
6 7 NA NA
7 10 NA NA
8 12 NA NA
9 13 NA NA
10 5 9 14
11 14 NA NA
12 8 11 15
13 16 NA NA
14 17 NA NA
15 18 NA NA
16 19 NA NA
17 20 NA NA
This solution is based on the following ideas:
For every row we count the number of NAs in this row.
We group for o1 and o2 to create groups of data that belong together. Here is a possible flaw: perhaps it is a better approach to group by o1 only or do some other grouping. This depends on the structure of your data: should 1, <NA>, <NA> be overwritten by 1, 2, <NA>?
After grouping, we select the row with the smallest number of NAs.
Finally we do some clean up: removing the auxiliary columns, arranging the data and ungrouping.
A partial solution to detect the duplicates, it remains to specify which rows to delete, ran out of time. I've went ahead and "duplicated" a couple more rows.
df=read.table(text="
o1 o2 o3
1 1 NA NA
2 2 NA NA
3 3 NA NA
4 4 NA NA
5 6 NA NA
6 7 NA NA
7 5 9 NA
8 10 NA NA
9 12 NA NA
10 13 NA NA
11 5 9 14
12 14 NA NA
13 8 11 15
14 16 NA NA
15 7 1 2
16 18 NA NA
17 7 1 3
18 20 NA NA",h=T)
The main trick is to calculate a distance matrix and check which rows have a distance of zero, since dist will automatically estimate a pairwise distance, removing missing values.
tmp=as.matrix(dist(df))
diag(tmp)=NA
tmp[lower.tri(tmp)]=NA
tod=data.frame(which(tmp==0,arr.ind=T))
resulting in
row col
X7 7 11
X6 6 15
X6.1 6 17
Here's another way which considers all columns, should work with any number of columns and regardless of their names or positions
library(dplyr)
mydf <- structure(list(o1 = c(1L, 2L, 3L, 4L, 6L, 7L, 5L, 10L, 12L, 13L,
5L, 14L, 8L, 16L, 17L, 18L, 19L, 20L),
o2 = c(NA, NA, NA, NA,
NA, NA, 9L, NA, NA, NA, 9L, NA, 11L, NA, NA, NA, NA, NA),
o3 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, 14L, NA, 15L, NA, NA, NA,
NA, NA)),
row.names = c(NA, -18L),
class = "data.frame")
columns <- names(mydf)
dummy_cols <- paste0(columns, "_dummy")
mydf %>%
# duplicate the dataframe
cbind(mydf %>% `names<-`(dummy_cols)) %>%
# arrange across all columns
arrange(across(columns)) %>%
# fill NAs downwards
tidyr::fill(dummy_cols, .direction = "down") %>%
# create a dummy ID
tidyr::unite(id_dummy, dummy_cols, sep = "") %>%
# group by the id
group_by(id_dummy) %>%
# get the first row of each
filter(row_number()==1) %>%
ungroup() %>%
select(columns)
P.S. also replaces 1 - NA - NA with 1 - 2 - NA and replaces 1 - NA - NA with 1 - NA - 3
I have encountered this problem in a project that I'm currently doing.
I have a sparse dataframe and I need to calculate the difference between the first and the last observation per row under some conditions:
Conditions:
If the row only contains NA's then the difference is 0.
If the row contains only 1 observation then the difference is 0.
If row elements (>= 2) are non-NA's then their difference is the difference between the first and the last (tail - head).
The dataframe that I have:
S1 S2 S3 S4 S5
1 NA NA NA NA NA
2 NA 3 NA 5 NA
3 1 NA NA NA 5
4 1 NA 2 NA 7
5 2 NA NA NA NA
6 NA NA 3 4 NA
7 NA NA 3 NA NA
The dataframe that I need:
S1 S2 S3 S4 S5 diff
1 NA NA NA NA NA 0
2 NA 3 NA 5 NA 2
3 1 NA NA NA 5 4
4 1 NA 2 NA 7 6
5 2 NA NA NA NA 0
6 NA NA 3 4 NA 1
7 NA NA 3 NA NA 0
What I've written up till now:
last_minus_first <- function(x, y = na.omit(x)) tail(y, 1) - y[1]
But it doesn't resolve for the fact if the row contains all NA's.
Any help would be much appreciated.
I would suggest using a defined function with apply(). Here the code:
#Data
df <- structure(list(S1 = c(NA, NA, 1L, 1L, 2L, NA, NA), S2 = c(NA,
3L, NA, NA, NA, NA, NA), S3 = c(NA, NA, NA, 2L, NA, 3L, 3L),
S4 = c(NA, 5L, NA, NA, NA, 4L, NA), S5 = c(NA, NA, 5L, 7L,
NA, NA, NA)), class = "data.frame", row.names = c("1", "2",
"3", "4", "5", "6", "7"))
Code:
#Function
myown <- function(x)
{
#Check NA
i <- sum(!is.na(x))
#Compute
if(i<=1)
{
y <- 0
} else
{
#Detect positions
j1 <- max(which(!is.na(x)))
j2 <- min(which(!is.na(x)))
#Diff
y <- x[j1]-x[j2]
}
return(y)
}
#Apply function by row
df$NewVar <- apply(df,1,myown)
Output:
S1 S2 S3 S4 S5 NewVar
1 NA NA NA NA NA 0
2 NA 3 NA 5 NA 2
3 1 NA NA NA 5 4
4 1 NA 2 NA 7 6
5 2 NA NA NA NA 0
6 NA NA 3 4 NA 1
7 NA NA 3 NA NA 0
Here's an easier (in my mind) way to handle this, using rowwise from the dplyr package to do calculations along rows.
df %>%
dplyr::rowwise() %>%
dplyr::mutate(max_pop = max(which(!is.na(dplyr::c_across(S1:S5)))),
min_pop = min(which(!is.na(dplyr::c_across(S1:S5)))),
diff = tidyr::replace_na(dplyr::c_across()[max_pop] - dplyr::c_across()[min_pop], 0))
I've broken that mutate call down into the various parts to show what we're doing, but essentially, it goes across all columns in a row to find the last populated column (max_pop), the first populated column (min_pop) and then uses those values to retrieve the values therein.
You have to specify columns for max_pop and min_pop above because creating new interim columns affects the column indexing. c_across() defaults to using all columns, though, so you can actually do this all in one mutate call without specifying any columns.
df %>%
rowwise() %>%
mutate(diff = replace_na(c_across()[max(which(!is.na(c_across())))] - c_across()[min(which(!is.na(c_across())))], 0))
A vectorized option in base R would be to extract the values based on row/column index and then subtract
df1$NewVar <- df1[cbind(seq_len(nrow(df1)), max.col(!is.na(df1), 'last'))] -
df1[cbind(seq_len(nrow(df1)), max.col(!is.na(df1), 'first'))]
df1$NewVar[is.na(df1$NewVar)] <- 0
df1
# S1 S2 S3 S4 S5 NewVar
#1 NA NA NA NA NA 0
#2 NA 3 NA 5 NA 2
#3 1 NA NA NA 5 4
#4 1 NA 2 NA 7 6
#5 2 NA NA NA NA 0
#6 NA NA 3 4 NA 1
#7 NA NA 3 NA NA 0
data
df1 <- structure(list(S1 = c(NA, NA, 1L, 1L, 2L, NA, NA), S2 = c(NA,
3L, NA, NA, NA, NA, NA), S3 = c(NA, NA, NA, 2L, NA, 3L, 3L),
S4 = c(NA, 5L, NA, NA, NA, 4L, NA), S5 = c(NA, NA, 5L, 7L,
NA, NA, NA)), class = "data.frame", row.names = c("1", "2",
"3", "4", "5", "6", "7"))
I am looking for a way to change my way in such a way that it sorts the data into quintiles instead of the top 5 and bottom 5. My current code looks like this:
CombData <- CombData %>%
group_by(Date) %>%
mutate(
R=min_rank(Value),
E_P = case_when(
R < 6 ~ "5w",
R > max(R, na.rm =TRUE) - 5 ~ "5b",
TRUE ~ NA_character_)
) %>%
ungroup() %>%
arrange(Date, E_P)
My dataset is quite large therefore I will just provide sample data. The data I use is more complex and the code should, therefore, allow for varying lengths of the column Date and also for multiple values that are missing (NAs):
df <- data.frame( Date = c(rep("2010-01-31",16), rep("2010-02-28", 14)), Value=c(rep(c(1,2,3,4,5,6,7,8,9,NA,NA,NA,NA,NA,15),2))
Afterward, I would also like to test the minimum size of quintiles i.e. how many data points are minimum in each quintile in the entire dataset.
The expected output would look like this:
structure(list(Date = structure(c(14640, 14640, 14640, 14640,
14640, 14640, 14640, 14640, 14640, 14640, 14640, 14640, 14640,
14640, 14640, 14640, 14668, 14668, 14668, 14668, 14668, 14668,
14668, 14668, 14668, 14668, 14668, 14668, 14668, 14668), class = "Date"),
Value = c(1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 15, NA, NA, NA, NA,
NA, 2, 3, 4, 5, 6, 7, 8, 9, 15, NA, NA, NA, NA, NA), R = c(1L,
1L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, NA, NA, NA, NA,
NA, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, NA, NA, NA, NA, NA
), S_P = c("Worst", "Worst", "Worst", NA, NA, NA, NA, "Best",
"Best", "Best", NA, NA, NA, NA, NA, NA, "Worst", "Worst", NA, NA,
NA, NA, NA, "Best", "Best", NA, NA, NA, NA, NA)), row.names = c(NA,
-30L), class = c("tbl_df", "tbl", "data.frame"))
Probably, you could use something like this with quantile :
library(dplyr)
out <- CombData %>%
group_by(Date) %>%
mutate(S_P = case_when(Value <= quantile(Value, 0.2, na.rm = TRUE) ~ 'Worst',
Value >= quantile(Value, 0.8, na.rm = TRUE) ~ 'Best'))
You could change the value of quantile according to your preference.
To get minimum number of "Best" and "Worst" we can do :
out %>%
count(Date, S_P) %>%
na.omit() %>%
ungroup() %>%
select(-Date) %>%
group_by(S_P) %>%
top_n(-1, n)
# S_P n
# <chr> <int>
#1 Best 2
#2 Worst 2
When I understand you correctly, you want to rank your column 'Value' and mark those with rank below the quantile 20% with "worst" and those above 80% with "best". After that you want a table.
You could use use ave for both, the ranking and the quantile identification. The quantile function yields three groups, that you can identify with findInterval, code as a factor variable and label them at will. I'm not sure, though, which ranks should be included in the quantiles, I therefore make the E_P coding in two separate columns for comparison purposes.
dat2 <- within(dat, {
R <- ave(Value, Date, FUN=function(x) rank(x, na.last="keep"))
E_P <- ave(R, Date, FUN=function(x) {
findInterval(x, quantile(R, c(.2, .8), na.rm=TRUE))
})
E_P.fac <- factor(E_P, labels=c("worst", NA, "best"))
})
dat2 <- dat2[order(dat2$Date, dat2$E_P), ] ## order by date and E_P
Yields:
dat2
# Date Value E_P.fac E_P R
# 1 2010-01-31 1 worst 0 1.5
# 16 2010-01-31 1 worst 0 1.5
# 2 2010-01-31 2 <NA> 1 3.0
# 3 2010-01-31 3 <NA> 1 4.0
# 4 2010-01-31 4 <NA> 1 5.0
# 5 2010-01-31 5 <NA> 1 6.0
# 6 2010-01-31 6 <NA> 1 7.0
# 7 2010-01-31 7 <NA> 1 8.0
# 8 2010-01-31 8 best 2 9.0
# 9 2010-01-31 9 best 2 10.0
# 15 2010-01-31 15 best 2 11.0
# 10 2010-01-31 NA <NA> NA NA
# 11 2010-01-31 NA <NA> NA NA
# 12 2010-01-31 NA <NA> NA NA
# 13 2010-01-31 NA <NA> NA NA
# 14 2010-01-31 NA <NA> NA NA
# 17 2010-02-28 2 worst 0 1.0
# 18 2010-02-28 3 worst 0 2.0
# 19 2010-02-28 4 <NA> 1 3.0
# 20 2010-02-28 5 <NA> 1 4.0
# 21 2010-02-28 6 <NA> 1 5.0
# 22 2010-02-28 7 <NA> 1 6.0
# 23 2010-02-28 8 <NA> 1 7.0
# 24 2010-02-28 9 <NA> 1 8.0
# 30 2010-02-28 15 best 2 9.0
# 25 2010-02-28 NA <NA> NA NA
# 26 2010-02-28 NA <NA> NA NA
# 27 2010-02-28 NA <NA> NA NA
# 28 2010-02-28 NA <NA> NA NA
# 29 2010-02-28 NA <NA> NA NA
When I check the quintiles of the Rank column, it appears to be right.
quantile(dat2$R, c(.2, .8), na.rm=TRUE)
# 20% 80%
# 2.8 8.2
After that you could just make a table to get the numbers of each category.
with(dat2, table(Date, E_P.fac))
# E_P.fac
# Date worst <NA> best
# 2010-01-31 2 6 3
# 2010-02-28 2 6 1
Data
dat <- structure(list(Date = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("2010-01-31", "2010-02-28"
), class = "factor"), Value = c(1, 2, 3, 4, 5, 6, 7, 8, 9, NA,
NA, NA, NA, NA, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, NA, NA, NA, NA,
NA, 15)), row.names = c(NA, -30L), class = "data.frame")