I have a huge dataset and wish to replace values in certain columns (VAR1, VAR2) with NA if they do not start with AA or DD.
Data:
DF<-tibble::tribble(
~ID, ~VAR1, ~VAR1DATE, ~VAR2, ~VAR2DATE,
1L, "AABB", "2001-01-01", "BBAA", "2001-01-01",
2L, "AACC", "2001-01-02", "AACC", "2001-01-02",
3L, "CCDD", "2001-01-03", "DDCC", "2001-01-03",
4L, "DDAA", "2001-01-04", "CCBB", "2001-01-04",
5L, "CCBB", "2001-01-05", "CCBB", "2001-01-05"
)
Desired output:
A tibble: 5 × 5
ID VAR1 VAR1DATE VAR2 VAR2DATE
<int> <chr> <chr> <chr> <chr>
1 1 AABB 2001-01-01 NA NA
2 2 AACC 2001-01-02 AACC 2001-01-02
3 3 NA NA DDCC 2001-01-03
4 4 DDAA 2001-01-04 NA NA
5 5 NA NA NA NA
Is there an elegant and smart way to this? mutate_all?
We may do this in two steps - loop across the columns that have 'VAR' followed by digits (\\d+) in column names, replace the values where the first two characters are not AA or DD to NA, then replace the corresponding DATE column to NA based on the NA in the 'VAR1', 'VAR2' columns
library(dplyr)
library(stringr)
DF %>%
mutate(across(matches("^VAR\\d+$"),
~ replace(., !substr(., 1, 2) %in% c("AA", "DD"), NA)),
across(ends_with("DATE"),
~ replace(., is.na(get(str_remove(cur_column(), "DATE"))), NA)))
-output
# A tibble: 5 × 5
ID VAR1 VAR1DATE VAR2 VAR2DATE
<int> <chr> <chr> <chr> <chr>
1 1 AABB 2001-01-01 <NA> <NA>
2 2 AACC 2001-01-02 AACC 2001-01-02
3 3 <NA> <NA> DDCC 2001-01-03
4 4 DDAA 2001-01-04 <NA> <NA>
5 5 <NA> <NA> <NA> <NA>
Use mutate/across with the assignment function is.na<-.
DF %>%
mutate(across(starts_with("VAR"), \(x) `is.na<-`(x, !grepl("^AA|^DD", x))))
## A tibble: 5 x 5
# ID VAR1 VAR1DATE VAR2 VAR2DATE
# <int> <chr> <chr> <chr> <chr>
#1 1 AABB NA NA NA
#2 2 AACC NA AACC NA
#3 3 NA NA DDCC NA
#4 4 DDAA NA NA NA
#5 5 NA NA NA NA
Or simpler:
DF %>%
mutate(across(starts_with("VAR"), ~`is.na<-`(., !grepl("^AA|^DD", .))))
Here is a tidyverse solution. Using across with str_replace_all and appending two ifelse statements.
library(dplyr)
library(stringr)
DF %>%
mutate(across(c(VAR1, VAR2), ~str_replace_all(., "^[^AA|DD]", NA_character_))) %>%
mutate(VAR1DATE = ifelse(is.na(VAR1), NA_character_, VAR1DATE),
VAR2DATE = ifelse(is.na(VAR2), NA_character_, VAR2DATE))
ID VAR1 VAR1DATE VAR2 VAR2DATE
<int> <chr> <chr> <chr> <chr>
1 1 AABB 2001-01-01 NA NA
2 2 AACC 2001-01-02 AACC 2001-01-02
3 3 NA NA DDCC 2001-01-03
4 4 DDAA 2001-01-04 NA NA
5 5 NA NA NA NA
Another possibility, using tidyr::unite and tiydr::separate:
library(tidyverse)
DF<-tibble::tribble(
~ID, ~VAR1, ~VAR1DATE, ~VAR2, ~VAR2DATE,
1L, "AABB", "2001-01-01", "BBAA", "2001-01-01",
2L, "AACC", "2001-01-02", "AACC", "2001-01-02",
3L, "CCDD", "2001-01-03", "DDCC", "2001-01-03",
4L, "DDAA", "2001-01-04", "CCBB", "2001-01-04",
5L, "CCBB", "2001-01-05", "CCBB", "2001-01-05"
)
DF %>%
unite(VAR1,VAR1,VAR1DATE) %>% unite(VAR2,VAR2,VAR2DATE) %>%
mutate(across(starts_with("VAR"),~if_else(str_detect(.x, "^AA|^DD"), .x, NA_character_))) %>%
separate(VAR1,into = c("VAR1", "VAR1DATE"), sep = "_") %>%
separate(VAR2,into = c("VAR2", "VAR2DATE"), sep = "_")
#> # A tibble: 5 × 5
#> ID VAR1 VAR1DATE VAR2 VAR2DATE
#> <int> <chr> <chr> <chr> <chr>
#> 1 1 AABB 2001-01-01 <NA> <NA>
#> 2 2 AACC 2001-01-02 AACC 2001-01-02
#> 3 3 <NA> <NA> DDCC 2001-01-03
#> 4 4 DDAA 2001-01-04 <NA> <NA>
#> 5 5 <NA> <NA> <NA> <NA>
Here is another tidyverse solution with using str_detect to determine where to convert to NA for the date columns. Then, we can use the same type of function on VAR1 and VAR2.
library(tidyverse)
DF %>%
rowwise %>%
mutate(
VAR1DATE = ifelse(str_detect(VAR1, '^BB|^CC') == TRUE, NA, VAR1DATE),
VAR2DATE = ifelse(str_detect(VAR2, '^BB|^CC') == TRUE, NA, VAR2DATE)
) %>%
mutate(across(c(VAR1, VAR2), function(x)
ifelse(str_detect(x, '^BB|^CC') == TRUE, NA, x)))
Output
# A tibble: 5 × 5
# Rowwise:
ID VAR1 VAR1DATE VAR2 VAR2DATE
<int> <chr> <chr> <chr> <chr>
1 1 AABB 2001-01-01 NA NA
2 2 AACC 2001-01-02 AACC 2001-01-02
3 3 NA NA DDCC 2001-01-03
4 4 DDAA 2001-01-04 NA NA
5 5 NA NA NA NA
Related
Any advice how to match strings, within a row, across multiple columns?
Adapted from
Remove rows where all variables are NA using dplyr where they are matching only NAs across columns, and filtering those - not creating a new variable.
Toy example:
library(dplyr)
df <- tibble(a = c('a', 'a', 'a', NA),
b1 = c('b', 'c', NA, NA),
b2 = c('d', NA, NA, NA),
b3 = c('e', NA, NA, NA),
b4 = c('f', NA, NA, NA))
df
# A tibble: 4 x 5
a b1 b2 b3 b4
<chr> <chr> <chr> <chr> <chr>
1 a b d e f
2 a c NA NA NA
3 a NA NA NA NA
4 NA NA NA NA NA
To create a new variable all_na if the whole row is NA:
df %>%
rowwise() %>%
mutate(all_na = all(is.na(across())))
# A tibble: 4 x 6
# Rowwise:
a b1 b2 b3 b4 all_na
<chr> <chr> <chr> <chr> <chr> <lgl>
1 a b d e f FALSE
2 a c NA NA NA FALSE
3 a NA NA NA NA FALSE
4 NA NA NA NA NA TRUE
To create a new variable if just a subset of the columns (starting with 'b') is NA b_is_na
df %>%
rowwise() %>%
mutate(b_is_na = all(is.na(across(starts_with('b'))))) %>%
ungroup()
# A tibble: 4 x 6
a b1 b2 b3 b4 b_is_na
<chr> <chr> <chr> <chr> <chr> <lgl>
1 a b d e f FALSE
2 a c NA NA NA FALSE
3 a NA NA NA NA TRUE
4 NA NA NA NA NA TRUE
Question:
However, I'm not sure how to create a variable if within a row, for a subset of columns is a string match OR NA, for example, 'c' or NA
Desired output:
# A tibble: 4 x 6
a b1 b2 b3 b4 b_is_na
<chr> <chr> <chr> <chr> <chr> <lgl>
1 a b d e f FALSE
2 a c NA NA NA TRUE
3 a NA NA NA NA TRUE
4 NA NA NA NA NA TRUE
A base R option and an efficient vectorized option would be rowSums on a logical matrix
nm1 <- startsWith(names(df), 'b')
df$b_is_na <- rowSums(df[nm1] == 'c'|is.na(df[nm1])) > 0
df$b_is_na
#[1] FALSE TRUE TRUE TRUE
It can be also used with the mutate
library(dplyr)
df %>%
mutate(b_is_na = rowSums(select(., starts_with('b')) ==
'c'|is.na(select(., starts_with('b')))) > 0)
# A tibble: 4 x 6
# a b1 b2 b3 b4 b_is_na
# <chr> <chr> <chr> <chr> <chr> <lgl>
#1 a b d e f FALSE
#2 a c <NA> <NA> <NA> TRUE
#3 a <NA> <NA> <NA> <NA> TRUE
#4 <NA> <NA> <NA> <NA> <NA> TRUE
NOTE: Using rowwise would be an inefficient way
Or with c_across, but it may not be that optimal
df %>%
rowwise %>%
mutate(b_is_na = {
tmp <- c_across(starts_with('b'))
any(is.na(tmp)|tmp == 'c') }) %>%
ungroup
# A tibble: 4 x 6
# a b1 b2 b3 b4 b_is_na
# <chr> <chr> <chr> <chr> <chr> <lgl>
#1 a b d e f FALSE
#2 a c <NA> <NA> <NA> TRUE
#3 a <NA> <NA> <NA> <NA> TRUE
#4 <NA> <NA> <NA> <NA> <NA> TRUE
This question already has answers here:
Calculate max value across multiple columns by multiple groups
(5 answers)
Closed 2 years ago.
I have data which looks basically like this:
id <- c(1:5)
VolumeA <- c(12, NA, NA, NA, NA)
VolumeB <- c(NA, 34, NA, NA, NA)
VolumeC <- c(NA, NA, 56, NA, NA)
VolumeD <- c(NA, NA, NA, 78, NA)
VolumeE <- c(NA, NA, NA, NA, 90)
df_now <- tibble(id, VolumeA, VolumeB, VolumeC, VolumeD, VolumeE)
df_now
# A tibble: 5 x 6
id VolumeA VolumeB VolumeC VolumeD VolumeE
<int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 12 NA NA NA NA
2 2 NA 34 NA NA NA
3 3 NA NA 56 NA NA
4 4 NA NA NA 78 NA
5 5 NA NA NA NA 90
In the IRL dataset, there are MANY more Volume[label] columns, but in each row I only need one of them: the largest one. So I want to create a new variable which has the largest value:
Volume <- c(12, 34, 56, 78, 90)
df_desired <- cbind(df_now, Volume)
df_desired
id VolumeA VolumeB VolumeC VolumeD VolumeE Volume
1 1 12 NA NA NA NA 12
2 2 NA 34 NA NA NA 34
3 3 NA NA 56 NA NA 56
4 4 NA NA NA 78 NA 78
5 5 NA NA NA NA 90 90
After looking at the dplyr documentation, I tried this...
library(tidyverse)
df_try <- df_now %>%
mutate(Volume = across(contains("Volume"), max, na.rm = TRUE))
...but got back a tibble of data, not a single column. Can someone tell me how to do this properly?
(Please assume, due to issues with my IRL data too complicated to explain here, that I cannot just gather and spread my data. I want to use a conditional mutate.)
Since you have "MANY more Volume[label] columns", any solution that works over each row (rowwise) or individually on each column (with reduce or Reduce) is going to be much slower than necessary.
df_now %>%
mutate(Volume = do.call(pmax, c(select(., starts_with('Volume')), na.rm = TRUE)))
# # A tibble: 5 x 7
# id VolumeA VolumeB VolumeC VolumeD VolumeE Volume
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 12 NA NA NA NA 12
# 2 2 NA 34 NA NA NA 34
# 3 3 NA NA 56 NA NA 56
# 4 4 NA NA NA 78 NA 78
# 5 5 NA NA NA NA 90 90
Proof of relative improvement:
Using Reduce or purrr::reduce or anything that will iterate per column (well, with nc columns, then it will iterate nc-1 times):
mypmax <- function(...) { message("mypmax"); pmax(...); }
df_now %>%
mutate(Volume = reduce(select(., starts_with('Volume')), mypmax, na.rm = TRUE))
# mypmax
# mypmax
# mypmax
# mypmax
# # A tibble: 5 x 7
# id VolumeA VolumeB VolumeC VolumeD VolumeE Volume
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 12 NA NA NA NA 12
# 2 2 NA 34 NA NA NA 34
# 3 3 NA NA 56 NA NA 56
# 4 4 NA NA NA 78 NA 78
# 5 5 NA NA NA NA 90 90
Anything rowwise is doing this once per row, perhaps even worse (assuming more rows than columns in your data:
mymax <- function(...) { message("mymax"); max(...); }
df_now %>%
rowwise %>%
mutate(Volume = mymax(c_across(starts_with('Volume')), na.rm = TRUE))
# mymax
# mymax
# mymax
# mymax
# mymax
# # A tibble: 5 x 7
# # Rowwise:
# id VolumeA VolumeB VolumeC VolumeD VolumeE Volume
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 12 NA NA NA NA 12
# 2 2 NA 34 NA NA NA 34
# 3 3 NA NA 56 NA NA 56
# 4 4 NA NA NA 78 NA 78
# 5 5 NA NA NA NA 90 90
Do it once across all columns, all rows:
mypmax <- function(...) { message("mypmax"); pmax(...); }
df_now %>%
mutate(Volume = do.call(mypmax, c(select(., starts_with('Volume')), na.rm = TRUE)))
# mypmax
# # A tibble: 5 x 7
# id VolumeA VolumeB VolumeC VolumeD VolumeE Volume
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 12 NA NA NA NA 12
# 2 2 NA 34 NA NA NA 34
# 3 3 NA NA 56 NA NA 56
# 4 4 NA NA NA 78 NA 78
# 5 5 NA NA NA NA 90 90
The benchmarking is minor at this scale, but will be more dramatic with larger data:
microbenchmark::microbenchmark(
red = df_now %>% mutate(Volume = reduce(select(., starts_with('Volume')), pmax, na.rm = TRUE)),
row = df_now %>% rowwise %>% mutate(Volume = max(c_across(starts_with('Volume')), na.rm = TRUE)),
sgl = df_now %>% mutate(Volume = do.call(pmax, c(select(., starts_with('Volume')), na.rm = TRUE)))
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# red 4.9736 5.36240 7.240561 5.68010 6.19915 70.7482 100
# row 4.5813 5.02020 6.082047 5.34460 5.70345 63.1166 100
# sgl 3.8270 4.18605 5.803043 4.43215 4.76030 65.7217 100
We can use pmax (first posted the pmax solution here). Note that the relative improvement is very small with do.call
library(dplyr)
library(purrr)
df_now %>%
mutate(Volume = reduce(select(., starts_with('Volume')), pmax, na.rm = TRUE))
# A tibble: 5 x 7
# id VolumeA VolumeB VolumeC VolumeD VolumeE Volume
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 1 12 NA NA NA NA 12
#2 2 NA 34 NA NA NA 34
#3 3 NA NA 56 NA NA 56
#4 4 NA NA NA 78 NA 78
#5 5 NA NA NA NA 90 90
Or with c_across and max (using only tidyverse approaches)
df_now %>%
rowwise %>%
mutate(Volume = max(c_across(starts_with('Volume')), na.rm = TRUE))
# A tibble: 5 x 7
# Rowwise:
# id VolumeA VolumeB VolumeC VolumeD VolumeE Volume
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 1 12 NA NA NA NA 12
#2 2 NA 34 NA NA NA 34
#3 3 NA NA 56 NA NA 56
#4 4 NA NA NA 78 NA 78
#5 5 NA NA NA NA 90 90
Benchmarks
system.time({df_now %>% mutate(Volume = reduce(select(., starts_with('Volume')), pmax, na.rm = TRUE))})
# user system elapsed
# 0.023 0.006 0.029
system.time({df_now %>% rowwise %>% mutate(Volume = max(c_across(starts_with('Volume')), na.rm = TRUE))})
# user system elapsed
# 0.012 0.002 0.015
system.time({df_now %>% mutate(Volume = do.call(pmax, c(select(., starts_with('Volume')), na.rm = TRUE)))})
# user system elapsed
# 0.011 0.001 0.011
NOTE: Not that much difference in timings
I have a dataframe which looks like this:
`Row Labels` Female Male
<chr> <chr> <chr>
1 London <NA> <NA>
2 42 <NA> 1
3 Paris <NA> <NA>
4 36 1 <NA>
5 Belgium <NA> <NA>
6 18 1
7 21 <NA> 1
8 Madrid <NA> <NA>
9 20 1 <NA>
10 Berlin <NA> <NA>
11 37 <NA> 1
12 23 1
13 25 1
14 44 1
The code I used to produce this dataframe looks like this:
structure(list(`Row Labels` = c("London", "42", "Paris","36", "Belgium","18" ,"21", "Madrid", "20", "Berlin", "37","23","25","44"),
Female = c(NA, NA, NA, "1", NA, NA,NA, NA, "1", NA, NA,"1","1","1"), Male = c(NA,"1", NA, NA, NA, "1", NA, NA, NA, "1",NA,NA,NA,NA)),
.Names = c("Row Labels","Female", "Male"), row.names = c(NA, -14L), class = c("tbl_df", "tbl", "data.frame"))
I would like to know how I can change multiple rows in this dataframe to become columns.
My ideal output looks like this:
'Row Labels' Female Male 42 36 21 20 37 18 23 25 44
London 1 1
Paris 1 1
Belgium 1 1 1 1
Madrid 1 1
Berlin 3 1 1 1 1 1
Seems very mechanical. Calling your data d:
d1 = d[seq(1, nrow(d), by = 2), ]
d2 = d[seq(2, nrow(d), by = 2), ]
d1[, c("Male", "Female")] = d2[, c("Male", "Female")]
d3 = matrix(nrow = nrow(d2), ncol = nrow(d2))
diag(d3) = 1
colnames(d3) = d2$`Row Labels`
cbind(d2, d3)
# Row Labels Female Male 42 36 21 20 37
# 1 42 <NA> 1 1 NA NA NA NA
# 2 36 1 <NA> NA 1 NA NA NA
# 3 21 <NA> 1 NA NA 1 NA NA
# 4 20 1 <NA> NA NA NA 1 NA
# 5 37 <NA> 1 NA NA NA NA 1
Using tidyverse.
library(dplyr)
library(tidyr)
#cumsum based on country names
df %>% group_by(gr=cumsum(grepl('\\D+',`Row Labels`))) %>%
#Sum Female and Male
mutate_at(vars('Female','Male'), list(~sum(as.numeric(.), na.rm = T))) %>%
#Create RL from country name and number where we are at numbers
mutate(RL=ifelse(row_number()>1, paste0(first(`Row Labels`),',',`Row Labels`), NA)) %>%
filter(!is.na(RL)) %>%
select(RL, gr, Male, Female) %>%
separate(RL, into = c('RL','Age')) %>% mutate(flag=1) %>% spread(Age, flag) %>%
ungroup() %>% select(-gr)
# A tibble: 5 x 12
RL Male Female `18` `20` `21` `23` `25` `36` `37` `42` `44`
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Belgium 1 0 1 NA 1 NA NA NA NA NA NA
2 Berlin 1 3 NA NA NA 1 1 NA 1 NA 1
3 London 1 0 NA NA NA NA NA NA NA 1 NA
4 Madrid 0 1 NA 1 NA NA NA NA NA NA NA
5 Paris 0 1 NA NA NA NA NA 1 NA NA NA
I have some data that looks like this:
samp
# A tibble: 5 x 2
ID Source
<dbl> <chr>
1 34221 75
2 33861 75
3 59741 126,123
4 56561 111,105
5 55836 36,34,34,36,22
Of any of the distinct values, I want to make a new column. If the value exists in a row I want to impute an "x" otherwise no value should be imputed.
Example (pseudo code) of the expected result:
ID 75 126 123 111 105 36 34 22
1 34221 x
2 33861 x
3 59741 x x
4 56561 x x
5 55836 x x x
I tried it by the separtate function of the tydr package. Like this for the start.
into = unique(unlist(strsplit(samp$Source, ",")))
samp %>% separate(col = "Source", into = into, sep = ",")
However, this doesn´t work, because if there are more then one value in a row the values will not be assigned to the respective column (e.g. for the ID 59741 the value 126 is in column 75 and not in the column 126).
A tibble: 5 x 9
ID `75` `126` `123` `111` `105` `36` `34` `22`
<dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 34221 75 NA NA NA NA NA NA NA
2 33861 75 NA NA NA NA NA NA NA
3 59741 126 123 NA NA NA NA NA NA
4 56561 111 105 NA NA NA NA NA NA
5 55836 36 34 34 36 22 NA NA NA
Here is a dput:
structure(list(ID = c(34221, 33861, 59741, 56561, 55836), Source = c("75",
"75", "126,123", "111,105", "36,34,34,36,22")), row.names = c(NA,
-5L), class = c("tbl_df", "tbl", "data.frame"))
Could also do:
library(tidyverse)
df %>%
mutate(Source = strsplit(Source, ","),
dummy = "x") %>%
unnest() %>% distinct() %>%
spread(Source, dummy)
Output:
ID `105` `111` `123` `126` `22` `34` `36` `75`
<dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 33861 NA NA NA NA NA NA NA x
2 34221 NA NA NA NA NA NA NA x
3 55836 NA NA NA NA x x x NA
4 56561 x x NA NA NA NA NA NA
5 59741 NA NA x x NA NA NA NA
The package splitstackshape is very handy for such operations, i.e.
library(splitstackshape)
cSplit_e(df, "Source", mode = "binary", type = "character", fill = 0, drop = TRUE)
which gives,
ID Source_105 Source_111 Source_123 Source_126 Source_22 Source_34 Source_36 Source_75
1 34221 0 0 0 0 0 0 0 1
2 33861 0 0 0 0 0 0 0 1
3 59741 0 0 1 1 0 0 0 0
4 56561 1 1 0 0 0 0 0 0
5 55836 0 0 0 0 1 1 1 0
Another option is using tidyr::separate_rows
library(dplyr)
library(tidyr)
df %>% separate_rows(Source,sep=',') %>% distinct() %>%
mutate(dummy='X') %>% spread(Source,dummy)
ID 105 111 123 126 22 34 36 75
1 33861 <NA> <NA> <NA> <NA> <NA> <NA> <NA> X
2 34221 <NA> <NA> <NA> <NA> <NA> <NA> <NA> X
3 55836 <NA> <NA> <NA> <NA> X X X <NA>
4 56561 X X <NA> <NA> <NA> <NA> <NA> <NA>
5 59741 <NA> <NA> X X <NA> <NA> <NA> <NA>
I have a tibble with information about diagnoses:
data <- tibble(
id = c(1:10),
diagnosis_1 = c("F32", "F431", "R58", "S32", "F11", NA, NA, "Y67", "F32", "Z032"),
diagnosis_2 = c(NA, NA, NA, NA, NA, NA, "G35", NA, NA, NA),
diagnosis_3 = c("F40", NA, "R67", "F431", NA, "F60", "S58", "R68", "F11", NA),
diagnosis_4 = c(NA, NA, "F65", NA, "F19", NA, NA, "F32", NA, NA)
)
As a part of the cleaning process, I have removed all diagnoses not fulfilling certain criteria (i.e. not starting with the letter F, G, or Z). With the following code:
data$diagnosis_1[str_sub(data$diagnosis_1, 1,1) %in% c("R", "S", "Y")] <- NA
data$diagnosis_2[str_sub(data$diagnosis_2, 1,1) %in% c("R", "S", "Y")] <- NA
data$diagnosis_3[str_sub(data$diagnosis_3, 1,1) %in% c("R", "S", "Y")] <- NA
data$diagnosis_4[str_sub(data$diagnosis_4, 1,1) %in% c("R", "S", "Y")] <- NA
Ending up with this tibble:
I now need to move the data to the left to fill the columns from left to right (i.e diagnosis_1 not being empty if diagnosis_2, diagnosis_3 or diagnosis_4 has data). I have tried using ifelse() as it is vectorized but I can`t seem to get it to work with several nested ifelse().
ifelse(is.na(data$diagnosis_1), data$diagnosis_2, data$diagnosis_1))
All suggestions are much appreciated.
Edit: adding expected output:
Using dplyr and tidyr. Reshape from wide to long, exclude "^RSY" and NA diagnosis, reshape long to wide.
library(dplyr)
library(tidyr)
gather(data, key = "k", value = "v", -id) %>%
filter(!(grepl("^[R|S|Y]", v) | is.na(v))) %>%
group_by(id) %>%
mutate(diagN = paste0("diagnosis_", row_number())) %>%
select(-k) %>%
spread(key = "diagN", value = "v") %>%
ungroup()
# # A tibble: 10 x 3
# id diagnosis_1 diagnosis_2
# <int> <chr> <chr>
# 1 1 F32 F40
# 2 2 F431 NA
# 3 3 F65 NA
# 4 4 F431 NA
# 5 5 F11 F19
# 6 6 F60 NA
# 7 7 G35 NA
# 8 8 F32 NA
# 9 9 F32 F11
# 10 10 Z032 NA
We first replace values which start with either "R", "S" or "Y" to NA and then left shift the non-NA values.
data[-1] <- lapply(data[-1], function(x) replace(x, grepl("^[R|S|Y]", x), NA))
data[] <- t(apply(data, 1, function(x) `length<-`(na.omit(x), length(x))))
data
# A tibble: 10 x 5
# id diagnosis_1 diagnosis_2 diagnosis_3 diagnosis_4
# <chr> <chr> <chr> <chr> <chr>
# 1 " 1" F32 F40 NA NA
# 2 " 2" F431 NA NA NA
# 3 " 3" F65 NA NA NA
# 4 " 4" F431 NA NA NA
# 5 " 5" F11 F19 NA NA
# 6 " 6" F60 NA NA NA
# 7 " 7" G35 NA NA NA
# 8 " 8" F32 NA NA NA
# 9 " 9" F32 F11 NA NA
#10 10 Z032 NA NA NA
Shifting the non-NA value to left has been taken from David's answer from here. You can try any other approach to shift values from the same question as well.
You can try a tidyverse
library(tidyverse)
data %>%
mutate_at(vars(starts_with("diagnosis")), funs(ifelse(str_sub(., 1, 1) %in% c("R", "S", "Y"), NA, .))) %>%
gather(k,v, -id) %>%
group_by(id) %>%
arrange(id) %>%
mutate(v=ifelse(k == "diagnosis_1", v[!is.na(v)][1], v)) %>%
spread(k, v)
# A tibble: 10 x 5
# Groups: id [10]
id diagnosis_1 diagnosis_2 diagnosis_3 diagnosis_4
<int> <chr> <chr> <chr> <chr>
1 1 F32 NA F40 NA
2 2 F431 NA NA NA
3 3 F65 NA NA F65
4 4 F431 NA F431 NA
5 5 F11 NA NA F19
6 6 F60 NA F60 NA
7 7 G35 G35 NA NA
8 8 F32 NA NA F32
9 9 F32 NA F11 NA
10 10 Z032 NA NA NA
As its unclear what OP wants (see discussion below) you can also try
data %>%
mutate_at(vars(starts_with("diagnosis")), funs(ifelse(str_sub(., 1, 1) %in% c("R", "S", "Y"), NA, .))) %>%
gather(k,v, -id) %>%
group_by(id) %>%
arrange(id) %>%
mutate(v=c(v[!is.na(v)], rep(NA, length(v) - length(v[!is.na(v)])))) %>%
spread(k, v)
# A tibble: 10 x 5
# Groups: id [10]
id diagnosis_1 diagnosis_2 diagnosis_3 diagnosis_4
<int> <chr> <chr> <chr> <chr>
1 1 F32 F40 NA NA
2 2 F431 NA NA NA
3 3 F65 NA NA NA
4 4 F431 NA NA NA
5 5 F11 F19 NA NA
6 6 F60 NA NA NA
7 7 G35 NA NA NA
8 8 F32 NA NA NA
9 9 F32 F11 NA NA
10 10 Z032 NA NA NA
You can use Reduce along with coalesce from dplyr, i.e.
df$diagnosis_1 <- Reduce(dplyr::coalesce, df[-1])
#id diagnosis_1 diagnosis_2 diagnosis_3 diagnosis_4
# <int> <chr> <chr> <chr> <chr>
# 1 1 F32 <NA> F40 <NA>
# 2 2 F431 <NA> <NA> <NA>
# 3 3 F65 <NA> <NA> F65
# 4 4 F431 <NA> F431 <NA>
# 5 5 F11 <NA> <NA> F19
# 6 6 F60 <NA> F60 <NA>
# 7 7 G35 G35 <NA> <NA>
# 8 8 F32 <NA> <NA> F32
# 9 9 F32 <NA> F11 <NA>
#10 10 Z032 <NA> <NA> <NA>
Below solution using function na_move from package dedupewider.
library(dedupewider)
na_move(data) # 'right' direction is by default
#> # A tibble: 10 x 5
#> id diagnosis_1 diagnosis_2 diagnosis_3 diagnosis_4
#> * <chr> <chr> <chr> <lgl> <lgl>
#> 1 1 F32 F40 NA NA
#> 2 2 F431 <NA> NA NA
#> 3 3 F65 <NA> NA NA
#> 4 4 F431 <NA> NA NA
#> 5 5 F11 F19 NA NA
#> 6 6 F60 <NA> NA NA
#> 7 7 G35 <NA> NA NA
#> 8 8 F32 <NA> NA NA
#> 9 9 F32 F11 NA NA
#> 10 10 Z032 <NA> NA NA
A tidyr update, using pivot_longer and unnest_wider.
dplyr 1.0.10 CRAN release: 2022-09-01
tidyr 1.2.1 CRAN release: 2022-09-08
Step 1: clean up data
library(dplyr)
library(tidyr)
data <- data %>%
mutate(across(starts_with("diag"), ~
replace(.x, grepl(paste0("^", c("R", "S", "Y"), collapse="|"), .x), NA)))
Step 2: left-compact data
data %>%
pivot_longer(starts_with("diag")) %>%
group_by(id) %>%
mutate(value = value[order(is.na(value))]) %>%
summarize(col = list(value)) %>%
unnest_wider(col, names_sep="_") %>%
setNames(colnames({{data}}))
# A tibble: 10 × 5
id diagnosis_1 diagnosis_2 diagnosis_3 diagnosis_4
<int> <chr> <chr> <chr> <chr>
1 1 F32 F40 NA NA
2 2 F431 NA NA NA
3 3 F65 NA NA NA
4 4 F431 NA NA NA
5 5 F11 F19 NA NA
6 6 F60 NA NA NA
7 7 G35 NA NA NA
8 8 F32 NA NA NA
9 9 F32 F11 NA NA
10 10 Z032 NA NA NA
data
data <- structure(list(id = 1:10, diagnosis_1 = c("F32", "F431", "R58",
"S32", "F11", NA, NA, "Y67", "F32", "Z032"), diagnosis_2 = c(NA,
NA, NA, NA, NA, NA, "G35", NA, NA, NA), diagnosis_3 = c("F40",
NA, "R67", "F431", NA, "F60", "S58", "R68", "F11", NA), diagnosis_4 = c(NA,
NA, "F65", NA, "F19", NA, NA, "F32", NA, NA)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -10L))