Add columns where identifiers fit multiple patterns - r

A data frame df has one row for every distinct value of its numeric vector id0, but trailing zeros in the id0 cells indicate important groups along which the file must be transformed. Here are 12 observations of df:
row id0 id0_ntz
a 111000 3
b 111010 1
c 112345 0
d 111974 0
e 112090 1
f 114000 3
g 114099 0
h 555001 0
i 555012 0
j 461000 3
k 461020 1
l 111090 0
Let's call every value of id0 with three trailing zeros (i.e., where id0_ntz == 3) a "big id" and every value that doesn't fit this pattern a "little id." The 12 obs above include three big ids (row values a, f, and j). For each big id, I need to:
Find every other value of id0 that matches the first three digits of this ith big id
Add the value of id0 for each match to one of j discrete vectors called idj, where j is a suffix ranging from 1 to j that effectively counts the number of matching little ids nested in the ith big id.
If df only included the 12 rows shown above, the correct result would look like this:
row id0 id0_ntz id1 id2 id3
a 111000 3 111010 111974 111090
b 111010 1 NA NA NA
c 112345 0 NA NA NA
d 111974 0 NA NA NA
e 112090 1 NA NA NA
f 114000 3 114099 NA NA
g 114099 0 NA NA NA
h 555001 0 NA NA NA
i 555012 0 NA NA NA
j 461000 3 461020 NA NA
k 461020 1 NA NA NA
l 111090 0 NA NA NA
I'm open to any solution that solves this problem dynamically (i.e., is agnostic to the number of big ids, little ids, and resulting idj vectors).
P.S.: I need to do the same thing again where id0_ntz == 2, then 1, but an acceptable answer to this posted question only requires a solution that solves the problem where id0_ntz == 3.

This will serve your purpose
# Example data from the question: one row per distinct id0, with id0_ntz
# holding the number of trailing zeros recorded for that id0.
df <- read.table(text = "row id0 id0_ntz
a 111000 3
b 111010 1
c 112345 0
d 111974 0
e 112090 1
f 114000 3
g 114099 0
h 555001 0
i 555012 0
j 461000 3
k 461020 1
l 111090 0", header = TRUE) # TRUE rather than T, which is an ordinary variable

# Keep id0 as character so its leading digits can be taken with substr().
df$id0 <- as.character(df$id0)
library(tidyverse)
# Big ids (three trailing zeros) together with their three-digit prefix.
big_id <- df %>%
  mutate(id0 = as.character(id0)) %>%
  filter(id0_ntz == 3) %>%
  mutate(big_id = substr(id0, 1, 3))

# Little ids that share a prefix with a big id, numbered within that big id
# and spread into id1, id2, ... columns. Matching is done through an explicit
# join on the prefix, so the result does not depend on the big id being the
# first row of its group.
little_wide <- df %>%
  mutate(id = as.character(id0), prefix = substr(id, 1, 3)) %>%
  filter(id0_ntz != 3) %>%
  inner_join(select(big_id, parent = id0, prefix = big_id), by = "prefix") %>%
  group_by(parent) %>%
  mutate(d2 = paste0("id", row_number())) %>%
  ungroup() %>%
  pivot_wider(id_cols = parent, names_from = d2, values_from = id)

# Attach the id1, id2, ... columns to the big-id rows; every other row gets NA.
df %>%
  mutate(id0 = as.character(id0)) %>%
  left_join(little_wide, by = c("id0" = "parent"))
#> row id0 id0_ntz id1 id2 id3
#> 1 a 111000 3 111010 111974 111090
#> 2 b 111010 1 <NA> <NA> <NA>
#> 3 c 112345 0 <NA> <NA> <NA>
#> 4 d 111974 0 <NA> <NA> <NA>
#> 5 e 112090 1 <NA> <NA> <NA>
#> 6 f 114000 3 114099 <NA> <NA>
#> 7 g 114099 0 <NA> <NA> <NA>
#> 8 h 555001 0 <NA> <NA> <NA>
#> 9 i 555012 0 <NA> <NA> <NA>
#> 10 j 461000 3 461020 <NA> <NA>
#> 11 k 461020 1 <NA> <NA> <NA>
#> 12 l 111090 0 <NA> <NA> <NA>
Created on 2021-05-28 by the reprex package (v2.0.0)

I'd use the approach below. It's quite short.
library(tidyverse)
# For every "big id" row, collect the other id0 values that share its first
# three digits and spread them into id1, id2, ... columns.
# The result is still grouped by big_id (no ungroup() is called).
df %>%
# Group rows by the first three characters of id0.
group_by(big_id = substr(id0, 1, 3)) %>%
# Rows whose characters 4-6 are "000" get a list entry holding the group's
# distinct id0 values minus the big id itself; the `no` branch is an empty list.
mutate(id = ifelse(substr(id0, 4, 6) == "000",
list(setdiff(unique(id0),
paste0(big_id, "000"))),
list())) %>%
# Widen the list column; with names_sep = "" the new columns are named by
# appending the element position to "id" (id1, id2, ...).
unnest_wider(col = id,
names_sep = "")
#> # A tibble: 12 x 7
#> # Groups: big_id [5]
#> row id0 id0_ntz big_id id1 id2 id3
#> <chr> <int> <int> <chr> <int> <int> <int>
#> 1 a 111000 3 111 111010 111974 111090
#> 2 b 111010 1 111 NA NA NA
#> 3 c 112345 0 112 NA NA NA
#> 4 d 111974 0 111 NA NA NA
#> 5 e 112090 1 112 NA NA NA
#> 6 f 114000 3 114 114099 NA NA
#> 7 g 114099 0 114 NA NA NA
#> 8 h 555001 0 555 NA NA NA
#> 9 i 555012 0 555 NA NA NA
#> 10 j 461000 3 461 461020 NA NA
#> 11 k 461020 1 461 NA NA NA
#> 12 l 111090 0 111 NA NA NA
Created on 2021-05-27 by the reprex package (v0.3.0)

library(tidyverse)
# Example data from the question; id0 is left numeric here because the
# pipeline below derives the big id with integer division.
df <- read.table(text = "row id0 id0_ntz
a 111000 3
b 111010 1
c 112345 0
d 111974 0
e 112090 1
f 114000 3
g 114099 0
h 555001 0
i 555012 0
j 461000 3
k 461020 1
l 111090 0", header = TRUE) # TRUE rather than T, which is an ordinary variable
df %>%
  # Candidate big id for each row: id0 with its last three digits zeroed.
  mutate(id = id0 %/% 1000 * 1000) %>%
  # Drop the big-id rows themselves explicitly instead of assuming they are
  # the first row of their group.
  filter(id0 != id) %>%
  group_by(id) %>%
  mutate(row_id = row_number()) %>%
  ungroup() %>%
  # id_cols is passed by name: in current tidyr the second positional
  # argument of pivot_wider() is `...`, not id_cols.
  pivot_wider(
    id_cols = id,
    names_from = row_id,
    values_from = id0,
    names_prefix = "id"
  ) %>%
  # Keep every original row; only ids that really exist in df (the big ids)
  # pick up id1, id2, ... values.
  right_join(df, by = c("id" = "id0")) %>%
  rename(id0 = id) %>%
  arrange(row)
#> # A tibble: 12 x 6
#> id0 id1 id2 id3 row id0_ntz
#> <dbl> <int> <int> <int> <chr> <int>
#> 1 111000 111010 111974 111090 a 3
#> 2 111010 NA NA NA b 1
#> 3 112345 NA NA NA c 0
#> 4 111974 NA NA NA d 0
#> 5 112090 NA NA NA e 1
#> 6 114000 114099 NA NA f 3
#> 7 114099 NA NA NA g 0
#> 8 555001 NA NA NA h 0
#> 9 555012 NA NA NA i 0
#> 10 461000 461020 NA NA j 3
#> 11 461020 NA NA NA k 1
#> 12 111090 NA NA NA l 0
Created on 2021-05-27 by the reprex package (v2.0.0)

Related

How to Filter by group and move all values to new column if any value in any of the affected columns is greater than 5 in R

I have a data frame like this:
dt <- tibble(
TRIAL = c("A", "A", "A", "B", "B", "B", "C", "C", "C","D","D","D"),
RL = c(1, NA, 3, 1, 6, 3, 2, 3, 1, 0, 1.5, NA),
SL = c(6, 1.5, 1, 0, 0, 1, 1, 2, 0, 1, 1.5, NA),
HC = c(0, 1, 5, 6,7, 8, 9, 3, 4, 5, 4, 2)
)
# A tibble: 12 x 4
TRIAL RL SL HC
<chr> <dbl> <dbl> <dbl>
1 A 1 6 0
2 A NA 1.5 1
3 A 3 1 5
4 B 1 0 6
5 B 6 0 7
6 B 3 1 8
7 C 2 1 9
8 C 3 2 3
9 C 1 0 4
10 D 0 1 5
11 D 1.5 1.5 4
12 D NA NA 2
I want to group the data frame by TRIAL and check the values of RL and SL within each group: if any value in either column is greater than 5, then all RL and SL values for that group should be moved to RLCT and SLCT respectively.
# A tibble: 12 x 6
TRIAL HC RLCT SLCT SL RL
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 0 1 6 NA NA
2 A 1 NA 1.5 NA NA
3 A 5 3 1 NA NA
4 B 6 1 0 NA NA
5 B 7 6 0 NA NA
6 B 8 3 1 NA NA
7 C 9 NA NA 1 3
8 C 3 NA NA 3 5
9 C 4 NA NA 1 1
10 D 5 NA NA 1 0
11 D 4 NA NA 1.5 1.5
12 D 2 NA NA NA NA
When I run the below code, I did not get the expected output
# dt0: groups in which no RL/SL value exceeds 5, with empty RLCT/SLCT columns.
# NOTE: any() is called without na.rm = TRUE. For a group whose comparisons
# are only FALSE and NA (group D in the data above) the condition is NA, and
# filter() drops rows whose condition is NA.
dt0 <- dt %>%
mutate(RLCT = NA,
SLCT = NA) %>%
group_by(TRIAL) %>%
filter(!any(RL > 5.0 | SL > 5.0))
# dt1: groups with at least one RL/SL value above 5; their values are copied
# to RLCT/SLCT, stacked under dt0, and RL/SL are blanked wherever the copy is
# not NA. The same NA condition as above also drops group D here.
dt1 <- dt %>%
group_by(TRIAL) %>%
filter(any(RL > 5.0 | SL > 5.0)) %>%
mutate(RLCT = RL,
SLCT = SL) %>%
rbind(dt0, .) %>%
mutate(RL = ifelse(!is.na(RLCT), NA, RL),
SL = ifelse(!is.na(SLCT), NA, SL)) %>% arrange(TRIAL)
This is what I get
# A tibble: 9 x 6
# Groups: TRIAL [3]
TRIAL RL SL HC RLCT SLCT
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A NA NA 0 1 6
2 A NA NA 1 NA 1.5
3 A NA NA 5 3 1
4 B NA NA 6 1 0
5 B NA NA 7 6 0
6 B NA NA 8 3 1
7 C 2 1 9 NA NA
8 C 3 2 3 NA NA
9 C 1 0 4 NA NA
You can define a column to storage the condition, and change RL and SL with ifelse inside across.
# Flag each TRIAL group in which any RL or SL value exceeds 5 (NAs ignored),
# then move RL/SL into RLCT/SLCT for flagged groups and blank the originals.
dt %>%
  group_by(TRIAL) %>%
  mutate(cond = any(RL > 5.0 | SL > 5.0, na.rm = TRUE)) %>%
  mutate(
    RLCT = ifelse(cond, RL, NA),
    SLCT = ifelse(cond, SL, NA),
    RL = ifelse(!cond, RL, NA),
    SL = ifelse(!cond, SL, NA)
  ) %>%
  # The helper flag is not part of the result.
  select(-cond)
Result:
# A tibble: 12 x 6
# Groups: TRIAL [4]
TRIAL RL SL HC RLCT SLCT
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A NA NA 0 1 6
2 A NA NA 1 NA 1.5
3 A NA NA 5 3 1
4 B NA NA 6 1 0
5 B NA NA 7 6 0
6 B NA NA 8 3 1
7 C 2 1 9 NA NA
8 C 3 2 3 NA NA
9 C 1 0 4 NA NA
10 D 0 1 5 NA NA
11 D 1.5 1.5 4 NA NA
12 D NA NA 2 NA NA
With dplyr, you could use group_modify():
library(dplyr)
# Process each TRIAL group separately.
dt %>%
group_by(TRIAL) %>%
group_modify(~ {
# If any RL or SL value in this group exceeds 5 (NAs ignored), rename the
# group's RL and SL columns by appending "CT"; otherwise return it unchanged.
if(any(select(.x, c(RL, SL)) > 5, na.rm = TRUE)) {
# The first .x is the group's data; inside the inner lambda .x is the
# vector of selected column names.
rename_with(.x, ~ paste0(.x, 'CT'), c(RL, SL))
} else {
.x
}
})
Output
# A tibble: 12 × 6
# Groups: TRIAL [4]
TRIAL RLCT SLCT HC RL SL
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 1 6 0 NA NA
2 A NA 1.5 1 NA NA
3 A 3 1 5 NA NA
4 B 1 0 6 NA NA
5 B 6 0 7 NA NA
6 B 3 1 8 NA NA
7 C NA NA 9 2 1
8 C NA NA 3 3 2
9 C NA NA 4 1 0
10 D NA NA 5 0 1
11 D NA NA 4 1.5 1.5
12 D NA NA 2 NA NA

Group by id and count the consecutive NAs, restarting the count when a new series of NAs is encountered

I have a dataframe like this:
# Example data: two ids with runs of NA in `value`.
# tibble() replaces data_frame(), which is deprecated.
df <- tibble(
  id = c(rep("A", 10), rep("B", 10)),
  value = c(
    1:3, rep(NA, 2), 1:2, rep(NA, 3),
    1, rep(NA, 4), 1:3, rep(NA, 2)
  )
)
I need to count the number of consecutive NAs in the value column. The count needs to be grouped by id, and it needs to restart at 1 every time a new NA or a new series of NAs is encountered. The expected output should look like this:
df$expected_output <- c(rep(NA, 3), 1:2, rep(NA, 2), 1:3, NA, 1:4, rep(NA, 3), 1:2)
If anyone can give me a dplyr solution that would also be great :)
I've tried a few things, but nothing is giving any sort of sensible result. Thanks in advance!
A solution using dplyr and data.table.
library(dplyr)
library(data.table)
# Number the rows inside every run of equal values (per id) and keep that
# number only where the run consists of NAs.
df2 <- df %>%
  group_by(id) %>%
  # Run id: increases whenever `value` changes within an id.
  mutate(info = rleid(value)) %>%
  group_by(id, info) %>%
  mutate(
    expected_output = ifelse(is.na(value), row_number(), NA_integer_)
  ) %>%
  ungroup() %>%
  select(-info)

df2
# # A tibble: 20 x 3
# id value expected_output
# <chr> <dbl> <int>
# 1 A 1 NA
# 2 A 2 NA
# 3 A 3 NA
# 4 A NA 1
# 5 A NA 2
# 6 A 1 NA
# 7 A 2 NA
# 8 A NA 1
# 9 A NA 2
# 10 A NA 3
# 11 B 1 NA
# 12 B NA 1
# 13 B NA 2
# 14 B NA 3
# 15 B NA 4
# 16 B 1 NA
# 17 B 2 NA
# 18 B 3 NA
# 19 B NA 1
# 20 B NA 2
We can use rle to get length of groups that are or are not na, and use purrr::map2 to apply seq if they are NA and get the growing count or just fill in with NA values using rep.
library(tidyverse)
#' Number the positions inside each run of consecutive NAs
#'
#' @param x A vector (any type for which `is.na()` is defined).
#' @return An integer vector the same length as `x`: for each NA, its
#'   1-based position within its run of consecutive NAs; `NA` for every
#'   non-missing element. Zero-length input yields `integer(0)`.
count_na <- function(x) {
  is_missing <- is.na(x)
  # Start from an all-NA integer vector so the result type is stable even
  # when `x` is empty or contains no NAs.
  out <- rep(NA_integer_, length(x))
  runs <- rle(is_missing)
  # sequence() restarts at 1 for each run; keep only the NA runs.
  out[is_missing] <- sequence(runs$lengths[runs$values])
  out
}
# Count within each id so that a run of NAs at the end of one id is never
# continued by NAs at the start of the next id.
df %>%
  group_by(id) %>%
  mutate(expected_output = count_na(value)) %>%
  ungroup()
#> # A tibble: 20 × 3
#> id value expected_output
#> <chr> <dbl> <int>
#> 1 A 1 NA
#> 2 A 2 NA
#> 3 A 3 NA
#> 4 A NA 1
#> 5 A NA 2
#> 6 A 1 NA
#> 7 A 2 NA
#> 8 A NA 1
#> 9 A NA 2
#> 10 A NA 3
#> 11 B 1 NA
#> 12 B NA 1
#> 13 B NA 2
#> 14 B NA 3
#> 15 B NA 4
#> 16 B 1 NA
#> 17 B 2 NA
#> 18 B 3 NA
#> 19 B NA 1
#> 20 B NA 2
Here is a solution using rle:
is_gap <- is.na(df$value)
# A run ends whenever the NA status or the id changes, so counts never span
# two ids.
x <- rle(paste(df$id, is_gap))
# Create the column first: assigning into a not-yet-existing column only
# works by accident when the last row happens to be NA.
df$new <- NA_integer_
# sequence() gives every row its position within its run; keep the NA rows.
df$new[is_gap] <- sequence(x$lengths)[is_gap]
# A tibble: 20 x 3
id value new
<chr> <dbl> <int>
1 A 1 NA
2 A 2 NA
3 A 3 NA
4 A NA 1
5 A NA 2
6 A 1 NA
7 A 2 NA
8 A NA 1
9 A NA 2
10 A NA 3
11 B 1 NA
12 B NA 1
13 B NA 2
14 B NA 3
15 B NA 4
16 B 1 NA
17 B 2 NA
18 B 3 NA
19 B NA 1
20 B NA 2
Yet another solution:
library(tidyverse)
# Label runs of equal values, then number the rows of each (id, run) group
# and keep the number only for NA rows.
df %>%
  mutate(aux = data.table::rleid(value)) %>%
  group_by(id, aux) %>%
  mutate(
    eout = if_else(is.na(value), as.numeric(row_number()), NA_real_)
  ) %>%
  ungroup() %>%
  select(-aux)
#> # A tibble: 20 × 4
#> id value expected_output eout
#> <chr> <dbl> <int> <dbl>
#> 1 A 1 NA NA
#> 2 A 2 NA NA
#> 3 A 3 NA NA
#> 4 A NA 1 1
#> 5 A NA 2 2
#> 6 A 1 NA NA
#> 7 A 2 NA NA
#> 8 A NA 1 1
#> 9 A NA 2 2
#> 10 A NA 3 3
#> 11 B 1 NA NA
#> 12 B NA 1 1
#> 13 B NA 2 2
#> 14 B NA 3 3
#> 15 B NA 4 4
#> 16 B 1 NA NA
#> 17 B 2 NA NA
#> 18 B 3 NA NA
#> 19 B NA 1 1
#> 20 B NA 2 2

Assign ID to column with NA's

This must be easy but my brain is blocked!
I have this dataframe:
col1
<chr>
1 A
2 B
3 NA
4 C
5 D
6 NA
7 NA
8 E
9 NA
10 F
df <- structure(list(col1 = c("A", "B", NA, "C", "D", NA, NA, "E",
NA, "F")), row.names = c(NA, -10L), class = c("tbl_df", "tbl",
"data.frame"))
I want to add a column with uniqueID only for values that are not NA with tidyverse.
Expected output:
col1 uniqueID
<chr> <dbl>
1 A 1
2 B 2
3 NA NA
4 C 3
5 D 4
6 NA NA
7 NA NA
8 E 5
9 NA NA
10 F 6
I have tried: n(), row_number(), cur_group_id ....
We could do this easily in data.table. Specify the condition in i i.e. non-NA elements in 'col1', create the column 'uniqueID' with the sequence of elements by assignment (:=)
library(data.table)
# Convert df to a data.table, then, for the rows where col1 is not NA, create
# uniqueID as 1..N over those rows; the remaining rows stay NA.
setDT(df)[!is.na(col1), uniqueID := seq_len(.N)]
-output
df
col1 uniqueID
1: A 1
2: B 2
3: <NA> NA
4: C 3
5: D 4
6: <NA> NA
7: <NA> NA
8: E 5
9: <NA> NA
10: F 6
In dplyr, we can use replace
library(dplyr)
# Overwrite the non-NA entries of a copy of col1 with 1, 2, ... in row order.
# Note: the replacement is made into the character vector col1, so uniqueID
# is a character column (see the <chr> type in the output below).
df %>%
mutate(uniqueID = replace(col1, !is.na(col1),
seq_len(sum(!is.na(col1)))))
-output
# A tibble: 10 x 2
col1 uniqueID
<chr> <chr>
1 A 1
2 B 2
3 <NA> <NA>
4 C 3
5 D 4
6 <NA> <NA>
7 <NA> <NA>
8 E 5
9 <NA> <NA>
10 F 6
Another approach:
library(dplyr)
# Running count of non-NA entries, blanked out again on the NA rows.
df %>%
  mutate(
    UniqueID = replace(cumsum(!is.na(col1)), is.na(col1), NA_integer_)
  )
# A tibble: 10 x 2
col1 UniqueID
<chr> <int>
1 A 1
2 B 2
3 NA NA
4 C 3
5 D 4
6 NA NA
7 NA NA
8 E 5
9 NA NA
10 F 6
A base R option using match + na.omit + unique
# Number each value by its position among the distinct non-NA values of col1.
# NA entries have nothing to match and get NA; a value that occurs more than
# once would receive the same uniqueID each time.
transform(
df,
uniqueID = match(col1, na.omit(unique(col1)))
)
gives
col1 uniqueID
1 A 1
2 B 2
3 <NA> NA
4 C 3
5 D 4
6 <NA> NA
7 <NA> NA
8 E 5
9 <NA> NA
10 F 6
A weird tidyverse solution:
library(dplyr)
# Count the non-NA entries cumulatively and show the count only on those rows.
df %>%
  mutate(
    present = !is.na(col1),
    id = cumsum(present),
    id = ifelse(present, id, NA),
    # The helper flag is not part of the result.
    present = NULL
  )
# A tibble: 10 x 2
col1 id
<chr> <int>
1 A 1
2 B 2
3 NA NA
4 C 3
5 D 4
6 NA NA
7 NA NA
8 E 5
9 NA NA
10 F 6

split row values into columns

I have some data that looks like this:
samp
# A tibble: 5 x 2
ID Source
<dbl> <chr>
1 34221 75
2 33861 75
3 59741 126,123
4 56561 111,105
5 55836 36,34,34,36,22
Of any of the distinct values, I want to make a new column. If the value exists in a row I want to impute an "x" otherwise no value should be imputed.
Example (pseudo code) of the expected result:
ID 75 126 123 111 105 36 34 22
1 34221 x
2 33861 x
3 59741 x x
4 56561 x x
5 55836 x x x
I tried it with the separate function of the tidyr package, like this for a start:
into = unique(unlist(strsplit(samp$Source, ",")))
samp %>% separate(col = "Source", into = into, sep = ",")
However, this doesn't work: if there is more than one value in a row, the values are not assigned to their respective columns (e.g. for the ID 59741 the value 126 ends up in column 75 and not in column 126).
A tibble: 5 x 9
ID `75` `126` `123` `111` `105` `36` `34` `22`
<dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 34221 75 NA NA NA NA NA NA NA
2 33861 75 NA NA NA NA NA NA NA
3 59741 126 123 NA NA NA NA NA NA
4 56561 111 105 NA NA NA NA NA NA
5 55836 36 34 34 36 22 NA NA NA
Here is a dput:
structure(list(ID = c(34221, 33861, 59741, 56561, 55836), Source = c("75",
"75", "126,123", "111,105", "36,34,34,36,22")), row.names = c(NA,
-5L), class = c("tbl_df", "tbl", "data.frame"))
Could also do:
library(tidyverse)
# One row per (ID, source value), de-duplicated, then one "x" column per
# distinct source value.
df %>%
  mutate(
    Source = strsplit(Source, ","),
    dummy = "x"
  ) %>%
  # Name the list column explicitly; unnest() without `cols` is deprecated.
  unnest(cols = Source) %>%
  distinct() %>%
  # pivot_wider() replaces the superseded spread(); names_sort and arrange()
  # reproduce spread()'s sorted column and row order.
  pivot_wider(names_from = Source, values_from = dummy, names_sort = TRUE) %>%
  arrange(ID)
Output:
ID `105` `111` `123` `126` `22` `34` `36` `75`
<dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 33861 NA NA NA NA NA NA NA x
2 34221 NA NA NA NA NA NA NA x
3 55836 NA NA NA NA x x x NA
4 56561 x x NA NA NA NA NA NA
5 59741 NA NA x x NA NA NA NA
The package splitstackshape is very handy for such operations, i.e.
library(splitstackshape)
# Expand the comma-separated Source column into one 0/1 indicator column per
# distinct value (Source_<value>), as shown in the output below.
# NOTE(review): drop = TRUE presumably removes the original Source column —
# it is absent from the output; confirm against the splitstackshape docs.
cSplit_e(df, "Source", mode = "binary", type = "character", fill = 0, drop = TRUE)
which gives,
ID Source_105 Source_111 Source_123 Source_126 Source_22 Source_34 Source_36 Source_75
1 34221 0 0 0 0 0 0 0 1
2 33861 0 0 0 0 0 0 0 1
3 59741 0 0 1 1 0 0 0 0
4 56561 1 1 0 0 0 0 0 0
5 55836 0 0 0 0 1 1 1 0
Another option is using tidyr::separate_rows
library(dplyr)
library(tidyr)
# One row per (ID, source value), de-duplicated, then one "X" column per
# distinct source value.
df %>%
  separate_rows(Source, sep = ",") %>%
  distinct() %>%
  mutate(dummy = "X") %>%
  # pivot_wider() replaces the superseded spread(); names_sort and arrange()
  # reproduce spread()'s sorted column and row order.
  pivot_wider(names_from = Source, values_from = dummy, names_sort = TRUE) %>%
  arrange(ID)
ID 105 111 123 126 22 34 36 75
1 33861 <NA> <NA> <NA> <NA> <NA> <NA> <NA> X
2 34221 <NA> <NA> <NA> <NA> <NA> <NA> <NA> X
3 55836 <NA> <NA> <NA> <NA> X X X <NA>
4 56561 X X <NA> <NA> <NA> <NA> <NA> <NA>
5 59741 <NA> <NA> X X <NA> <NA> <NA> <NA>

Filter or ifelse across multiple columns

I'm doing research on the communication lines to a patient when they get sick. For example: a person gets sick and goes to the doctor (A), then gets to the hospital (B), gets into contact with insurance (C), etc. The order is different for each patient. For instance, one patient will go directly to the hospital while another will first check the insurance, etc. We've followed patients through the whole process, and after they came into contact with a different authority, we let them fill out another survey. So after each authority ("step") we got the score for a survey. This gives me the following dataset set-up (in reality it is a very large dataset):
Patient<-c(1,1,1,1,1,1,1,2,2,2,2)
sample6<-c("A","A","A","A","A","A","A","A","A","A","A")
sample5<-c("Stop","B","B","B","B","B","B","Stop","C","C","C")
sample4<-c(NA,"Stop","C","C","C","C","C",NA, "Stop","F","F")
sample3<-c(NA,NA,"Stop","D","D","D","D",NA, NA,"Stop","G")
sample2<-c(NA,NA,NA,"Stop","E","E","E",NA, NA,NA,"Stop")
sample1<-c(NA,NA,NA,NA, "Stop","F","F",NA,NA,NA, NA)
sample0<-c(NA,NA,NA,NA, NA,"Stop","G",NA,NA,NA, NA)
sample00<-c(NA,NA,NA,NA, NA,NA,"Stop",NA,NA,NA, NA)
Score<-c(90,88,65,44,78,98,66,38,93,88,80)
Time<-c("01-01-2018", "02-01-2018", "03-01-2018", "04-01-2018", "05-01-2018", "06-01-2018", "07-01-2018","01-02-2018", "02-02-2018", "05-02-2018", "06-02-2018")
df<-data.frame("Patient"=Patient, "step0"=sample6, "step1"=sample5, "step2"=sample4, "step3"=sample3, "step4"=sample2,
"step5"=sample1,"step6"= sample0, "step7"=sample00, "Score"=Score, "Time"=Time)
> df
Patient step0 step1 step2 step3 step4 step5 step6 step7 Score Time
1 1 A Stop <NA> <NA> <NA> <NA> <NA> <NA> 90 01-01-2018
2 1 A B Stop <NA> <NA> <NA> <NA> <NA> 88 02-01-2018
3 1 A B C Stop <NA> <NA> <NA> <NA> 65 03-01-2018
4 1 A B C D Stop <NA> <NA> <NA> 44 04-01-2018
5 1 A B C D E Stop <NA> <NA> 78 05-01-2018
6 1 A B C D E F Stop <NA> 98 06-01-2018
7 1 A B C D E F G Stop 66 07-01-2018
8 2 A Stop <NA> <NA> <NA> <NA> <NA> <NA> 38 01-02-2018
9 2 A C Stop <NA> <NA> <NA> <NA> <NA> 93 02-02-2018
10 2 A C F Stop <NA> <NA> <NA> <NA> 88 05-02-2018
11 2 A C F G Stop <NA> <NA> <NA> 80 06-02-2018
So for example: row 1 has the survey score after authority A, row 2 is for the same patient and has the score of the survey after authority B etc.
Now I want to compare columns that have the same final process. I will take "F" as an example, but it could also be "C" for another analysis. So now I want to select all rows that indicate "F" as the final authority AND the row before, so that I can compare them.
So I want to create this dataset:
Patient step0 step1 step2 step3 step4 step5 step6 step7 Score Time Indicator
1 1 A Stop <NA> <NA> <NA> <NA> <NA> <NA> 90 01-01-2018 0
2 1 A B Stop <NA> <NA> <NA> <NA> <NA> 88 02-01-2018 0
3 1 A B C Stop <NA> <NA> <NA> <NA> 65 03-01-2018 0
4 1 A B C D Stop <NA> <NA> <NA> 44 04-01-2018 0
5 1 A B C D E Stop <NA> <NA> 78 05-01-2018 Before
6 1 A B C D E F Stop <NA> 98 06-01-2018 After
7 1 A B C D E F G Stop 66 07-01-2018 0
8 2 A Stop <NA> <NA> <NA> <NA> <NA> <NA> 38 01-02-2018 0
9 2 A C Stop <NA> <NA> <NA> <NA> <NA> 93 02-02-2018 Before
10 2 A C F Stop <NA> <NA> <NA> <NA> 88 05-02-2018 After
11 2 A C F G Stop <NA> <NA> <NA> 80 06-02-2018 0
I did manage to indicate the rows that contain "F" plus the previous:
# Columns 2 to 9 of df are step0 ... step7.
ProcessColumns <- 2:9
# TRUE where a step equals "F" (NA where the step is missing).
d <- df[,ProcessColumns] == "F"
# Number of "F" entries per row, ignoring NAs.
df$Indicator <- rowSums(d,na.rm=T)
# NOTE(review): the count was stored in df$Indicator above, but the two lines
# below read and write df$filter — presumably the same column was intended;
# confirm. As written, every row containing "F" anywhere counts as a hit, not
# only rows where "F" is the final step.
df$filter[which(df$filter %in% 1)-1] <- "Before"
df$filter[which(df$filter %in% 1)] <- "After"
But now it indicates ALL the rows containing "F" not just in the end.. anyone who can help me?
We can do something like
# Count the non-missing step entries in each row (columns 2 to 9).
df %>% mutate(sum=rowSums(!is.na(.[2:9]))) %>%
# Within each patient, compare each row's count with the patient's largest
# count: a difference of -2 is labelled "Before", -1 "After", anything else "0".
# NOTE(review): the step values themselves are never inspected, so rows are
# labelled purely by their length relative to the patient's longest row,
# whether or not the path actually ends in "F".
group_by(Patient) %>% mutate(max = sum-max(sum), Indicator = case_when(max == -2 ~ "Before", max == -1 ~ "After", TRUE ~ as.character(0)))
# A tibble: 11 x 14
# Groups: Patient [2]
Patient step0 step1 step2 step3 step4 step5 step6 step7 Score Time sum max Ind
<dbl> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <dbl> <fct> <dbl> <dbl> <chr>
1 1.00 A Stop NA NA NA NA NA NA 90.0 01-01-2018 2.00 -6.00 0
2 1.00 A B Stop NA NA NA NA NA 88.0 02-01-2018 3.00 -5.00 0
3 1.00 A B C Stop NA NA NA NA 65.0 03-01-2018 4.00 -4.00 0
4 1.00 A B C D Stop NA NA NA 44.0 04-01-2018 5.00 -3.00 0
5 1.00 A B C D E Stop NA NA 78.0 05-01-2018 6.00 -2.00 Before
6 1.00 A B C D E F Stop NA 98.0 06-01-2018 7.00 -1.00 After
7 1.00 A B C D E F G Stop 66.0 07-01-2018 8.00 0 0
8 2.00 A Stop NA NA NA NA NA NA 38.0 01-02-2018 2.00 -3.00 0
9 2.00 A C Stop NA NA NA NA NA 93.0 02-02-2018 3.00 -2.00 Before
10 2.00 A C F Stop NA NA NA NA 88.0 05-02-2018 4.00 -1.00 After
11 2.00 A C F G Stop NA NA NA 80.0 06-02-2018 5.00 0 0
Update: Inspired by @Andre Elrico's answer
# Paste all step columns into one string per row; a path that ends in "F"
# then contains "FStop" (missing steps are pasted as "NA" after it).
df %>%
  unite(All, matches("step"), sep = "", remove = FALSE) %>%
  # Look ahead only within the same patient so the last row of one patient is
  # never labelled "Before" because of the next patient's first row.
  group_by(Patient) %>%
  mutate(
    # "FStop" matches the question's target authority "F"; change the letter
    # to analyse a different final authority.
    Ind = str_detect(All, "FStop"),
    Indicator = case_when(
      lead(Ind, default = FALSE) ~ "Before",
      Ind ~ "After",
      TRUE ~ as.character(0)
    )
  ) %>%
  ungroup() %>%
  select(-All, -Ind)
Or you can:
library(dplyr)
# Paste only the step columns of each row together; a path ending in "F"
# contains the literal text "FStop". This avoids apply() over the whole data
# frame, which would first coerce every column to a character matrix.
step_cols <- unname(as.list(df[startsWith(names(df), "step")]))
After_IND <- grepl("FStop", do.call(paste0, step_cols), fixed = TRUE)
# The row before an "After" row is "Before", but only when it belongs to the
# same patient; the last row has no successor and defaults to FALSE.
Before_IND <- lead(After_IND, 1, FALSE) &
  df$Patient == lead(df$Patient, 1, df$Patient[nrow(df)])
# Character from the start, since the labels assigned below are strings.
df$Indicator <- "0"
df$Indicator[After_IND] <- "After"
df$Indicator[Before_IND] <- "Before"
# Patient step0 step1 step2 step3 step4 step5 step6 step7 Score Time Indicator
# 1 A Stop <NA> <NA> <NA> <NA> <NA> <NA> 90 01-01-2018 0
# 1 A B Stop <NA> <NA> <NA> <NA> <NA> 88 02-01-2018 0
# 1 A B C Stop <NA> <NA> <NA> <NA> 65 03-01-2018 0
# 1 A B C D Stop <NA> <NA> <NA> 44 04-01-2018 0
# 1 A B C D E Stop <NA> <NA> 78 05-01-2018 Before
# 1 A B C D E F Stop <NA> 98 06-01-2018 After
# 1 A B C D E F G Stop 66 07-01-2018 0
# 2 A Stop <NA> <NA> <NA> <NA> <NA> <NA> 38 01-02-2018 0
# 2 A C Stop <NA> <NA> <NA> <NA> <NA> 93 02-02-2018 Before
# 2 A C F Stop <NA> <NA> <NA> <NA> 88 05-02-2018 After
# 2 A C F G Stop <NA> <NA> <NA> 80 06-02-2018 0
Please note:
If you want to compare B for eg. you have to change:
... %>% grepl("BStop",.)
A tidyverse with lot of lines, but generally works.
library(tidyverse)
# Label the first row per patient that contains "F" as "After" and the row
# immediately before it as "Before"; all other rows get NA.
df %>%
# Keep the original row order as a `rowname` column.
rownames_to_column() %>%
# Reshape the step columns into long key/value pairs (k = column, v = value).
gather(k,v,-Patient,-rowname,-Score, -Time) %>%
group_by(rowname) %>%
# Mark every original row that has "F" in any step.
mutate(Indicator=ifelse(any(v %in%"F" ),"After",NA)) %>%
# Back to one row per original row, restored to the original order.
spread(k,v) %>%
arrange(as.numeric(rowname)) %>%
group_by(Patient) %>%
# Within a patient keep only the first "After"; later repeats become NA.
mutate(Indicator=ifelse(duplicated(Indicator), NA, Indicator)) %>%
# The row whose successor is "After" becomes "Before".
mutate(Indicator2=ifelse(lead(Indicator) == "After", "Before", NA)) %>%
mutate(Indicator=ifelse(!is.na(Indicator2), Indicator2, Indicator)) %>%
# Restore the original column layout and drop the helper columns.
select(Patient, starts_with("step"), Score, Time,Indicator, -Indicator2,-rowname) %>%
ungroup()
# A tibble: 11 x 12
Patient step0 step1 step2 step3 step4 step5 step6 step7 Score Time Indicator
<dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <fct> <chr>
1 1 A Stop NA NA NA NA NA NA 90 01-01-2018 NA
2 1 A B Stop NA NA NA NA NA 88 02-01-2018 NA
3 1 A B C Stop NA NA NA NA 65 03-01-2018 NA
4 1 A B C D Stop NA NA NA 44 04-01-2018 NA
5 1 A B C D E Stop NA NA 78 05-01-2018 Before
6 1 A B C D E F Stop NA 98 06-01-2018 After
7 1 A B C D E F G Stop 66 07-01-2018 NA
8 2 A Stop NA NA NA NA NA NA 38 01-02-2018 NA
9 2 A C Stop NA NA NA NA NA 93 02-02-2018 Before
10 2 A C F Stop NA NA NA NA 88 05-02-2018 After
11 2 A C F G Stop NA NA NA 80 06-02-2018 NA

Resources