Assign ID to column with NA's - r

This must be easy but my brain is blocked!
I have this dataframe:
col1
<chr>
1 A
2 B
3 NA
4 C
5 D
6 NA
7 NA
8 E
9 NA
10 F
df <- structure(list(col1 = c("A", "B", NA, "C", "D", NA, NA, "E",
NA, "F")), row.names = c(NA, -10L), class = c("tbl_df", "tbl",
"data.frame"))
I want to add a column with uniqueID only for values that are not NA with tidyverse.
Expected output:
col1 uniqueID
<chr> <dbl>
1 A 1
2 B 2
3 NA NA
4 C 3
5 D 4
6 NA NA
7 NA NA
8 E 5
9 NA NA
10 F 6
I have tried: n(), row_number(), cur_group_id ....

We could do this easily in data.table. Specify the condition in i i.e. non-NA elements in 'col1', create the column 'uniqueID' with the sequence of elements by assignment (:=)
library(data.table)
setDT(df)[!is.na(col1), uniqueID := seq_len(.N)]
-output
df
col1 uniqueID
1: A 1
2: B 2
3: <NA> NA
4: C 3
5: D 4
6: <NA> NA
7: <NA> NA
8: E 5
9: <NA> NA
10: F 6
In dplyr, we can use replace
library(dplyr)
# Number the non-NA entries of col1 in order of appearance; NA rows stay NA.
# NOTE: replace() starts from col1 itself, so the ids are coerced to character
# (see the <chr> column type in the output below). Convert the result with
# as.integer() if a numeric id is required.
df %>%
mutate(uniqueID = replace(col1, !is.na(col1),
seq_len(sum(!is.na(col1)))))
-output
# A tibble: 10 x 2
col1 uniqueID
<chr> <chr>
1 A 1
2 B 2
3 <NA> <NA>
4 C 3
5 D 4
6 <NA> <NA>
7 <NA> <NA>
8 E 5
9 <NA> <NA>
10 F 6

Another approach:
library(dplyr)
df %>%
mutate(UniqueID = cumsum(!is.na(col1)),
UniqueID = if_else(is.na(col1), NA_integer_, UniqueID))
# A tibble: 10 x 2
col1 UniqueID
<chr> <int>
1 A 1
2 B 2
3 NA NA
4 C 3
5 D 4
6 NA NA
7 NA NA
8 E 5
9 NA NA
10 F 6

A base R option using match + na.omit + unique
transform(
df,
uniqueID = match(col1, na.omit(unique(col1)))
)
gives
col1 uniqueID
1 A 1
2 B 2
3 <NA> NA
4 C 3
5 D 4
6 <NA> NA
7 <NA> NA
8 E 5
9 <NA> NA
10 F 6

A weird tidyverse solution:
library(dplyr)
df %>%
mutate(id = ifelse(is.na(col1), 0, 1),
id = cumsum(id == 1),
id = ifelse(is.na(col1), NA, id))
# A tibble: 10 x 2
col1 id
<chr> <int>
1 A 1
2 B 2
3 NA NA
4 C 3
5 D 4
6 NA NA
7 NA NA
8 E 5
9 NA NA
10 F 6

Related

How can I compare two columns of different lengths from two dataframes to check for matching values in R?

I have two dataframes that look something like this:
# Example data. df1 holds the reference values; df2 holds the values to look
# up, including one ("z") that has no counterpart in df1.
df1 <- data.frame(col1 = 1:7, col2 = c("a", "b", "c", "d", "e", "f", "g"))
df1
df2 <- data.frame(col1 = c(1, 2, 3, 4, 10), col2 = c("a", "c", "f", "g", "z"))
df2
df1
col1 col2
1 1 a
2 2 b
3 3 c
4 4 d
5 5 e
6 6 f
7 7 g
df2
col1 col2
1 1 a
2 2 c
3 3 f
4 4 g
5 10 z
I want to compare the values in col2 of each df and line up the columns of each df by the matches to get this:
col1 col2 col1.1 col2.1
1 1 a 1 a
2 2 b NA <NA>
3 3 c 2 c
4 4 d NA <NA>
5 5 e NA <NA>
6 6 f 3 f
7 7 g 4 g
Ideally, values that appear only in df2 are dropped, and rows of df1 that have no match in df2 are filled in with NAs. Ultimately, I want to calculate what percent of the values in col2 of df1 have a match in col2 of df2.
Use left_join from dplyr
library(dplyr)
df1 %>%
left_join(df2, by="col2", keep = TRUE)
output:
col1.x col2.x col1.y col2.y
1 1 a 1 a
2 2 b NA <NA>
3 3 c 2 c
4 4 d NA <NA>
5 5 e NA <NA>
6 6 f 3 f
7 7 g 4 g
To get the percentage of match
out <- df1 %>%
left_join(df2, by="col2", keep = TRUE)
out %>%
filter(col2.x == col2.y) %>%
nrow()/nrow(out)*100
Result:
[1] 57.14286
If you want the % of match as a new column:
df1 %>%
left_join(df2, by="col2", keep = TRUE) %>%
mutate(percentage_match = sum(col2.x == col2.y, na.rm = TRUE)/nrow(.)*100)
output:
col1.x col2.x col1.y col2.y percentage_match
1 1 a 1 a 57.14286
2 2 b NA <NA> 57.14286
3 3 c 2 c 57.14286
4 4 d NA <NA> 57.14286
5 5 e NA <NA> 57.14286
6 6 f 3 f 57.14286
7 7 g 4 g 57.14286

Splitting rows into multiple rows and adding NA when needed

I have the following dataframe:
A B C D E F G
1/2 3/4 4/5/6 7/8 9/10 11 12/13/14/15
And want to split to:
A B C D E F G
1 3 4 7 9 11 12
2 4 5 8 10 NA 13
NA NA 6 NA NA NA 14
NA NA NA NA NA NA 15
Is there any compact way to do it?
I've thought about separating each column into a list, using something such as
list_of_dfs
and for each df do:
modified_dfs %>% separate_rows(colnames(each_df), sep = "/")
then doing a merge of all dataframes created in the process...
merge(modified_dfs)
It is easier with cSplit
library(splitstackshape)
cSplit(df1, names(df1), sep = "/", "long")
-output
A B C D E F G
<int> <int> <int> <int> <int> <int> <int>
1: 1 3 4 7 9 11 12
2: 2 4 5 8 10 NA 13
3: NA NA 6 NA NA NA 14
4: NA NA NA NA NA NA 15
data
df1 <- structure(list(A = "1/2", B = "3/4", C = "4/5/6", D = "7/8",
E = "9/10", F = 11L, G = "12/13/14/15"), class = "data.frame",
row.names = c(NA,
-1L))

Group_by id and count the consecutive NA's and then restart counting when a new series of NA's is encountered

I have a dataframe like this:
df <- data_frame(id = c(rep('A', 10), rep('B', 10)),
value = c(1:3, rep(NA, 2), 1:2, rep(NA, 3), 1, rep(NA, 4), 1:3, rep(NA, 2)))
I need to count the number of consecutive NA's in the value column. The count needs to be grouped by ID, and it needs to restart at 1 every time a new NA or new series of NA's is encountered. The expected output should look like this:
df$expected_output <- c(rep(NA, 3), 1:2, rep(NA, 2), 1:3, NA, 1:4, rep(NA, 3), 1:2)
If anyone can give me a dplyr solution that would also be great :)
I've tried a few things but nothing gives a sensible result. Thanks in advance!
A solution using dplyr and data.table.
library(dplyr)
library(data.table)
df2 <- df %>%
group_by(id) %>%
mutate(info = rleid(value)) %>%
group_by(id, info) %>%
mutate(expected_output = row_number()) %>%
ungroup() %>%
mutate(expected_output = ifelse(!is.na(value), NA, expected_output)) %>%
select(-info)
df2
# # A tibble: 20 x 3
# id value expected_output
# <chr> <dbl> <int>
# 1 A 1 NA
# 2 A 2 NA
# 3 A 3 NA
# 4 A NA 1
# 5 A NA 2
# 6 A 1 NA
# 7 A 2 NA
# 8 A NA 1
# 9 A NA 2
# 10 A NA 3
# 11 B 1 NA
# 12 B NA 1
# 13 B NA 2
# 14 B NA 3
# 15 B NA 4
# 16 B 1 NA
# 17 B 2 NA
# 18 B 3 NA
# 19 B NA 1
# 20 B NA 2
We can use rle to get length of groups that are or are not na, and use purrr::map2 to apply seq if they are NA and get the growing count or just fill in with NA values using rep.
library(tidyverse)
# Number the elements inside each run of consecutive NAs.
#
# x: a vector that may contain NAs.
#
# Returns an integer vector of the same length as `x`. Positions inside a run
# of NAs hold 1, 2, ..., run-length (restarting at 1 for every new run);
# positions where `x` is not NA hold NA_integer_. Empty input yields
# integer(0), and input without any NA yields an all-NA integer vector, so the
# return type does not depend on the data.
count_na <- function(x) {
  is_missing <- is.na(x)
  runs <- rle(is_missing)
  # Start from an all-NA integer vector so the result is always integer.
  counts <- rep(NA_integer_, length(x))
  # sequence() concatenates 1..len for each NA run, in order of appearance,
  # which lines up one-to-one with the NA positions of `x`.
  counts[is_missing] <- sequence(runs$lengths[runs$values])
  counts
}
# Count within each id so that a run of NAs never carries over from the end of
# one id into the start of the next; ungroup afterwards so the result is a
# plain (ungrouped) tibble again.
df %>%
  group_by(id) %>%
  mutate(expected_output = count_na(value)) %>%
  ungroup()
#> # A tibble: 20 × 3
#> id value expected_output
#> <chr> <dbl> <int>
#> 1 A 1 NA
#> 2 A 2 NA
#> 3 A 3 NA
#> 4 A NA 1
#> 5 A NA 2
#> 6 A 1 NA
#> 7 A 2 NA
#> 8 A NA 1
#> 9 A NA 2
#> 10 A NA 3
#> 11 B 1 NA
#> 12 B NA 1
#> 13 B NA 2
#> 14 B NA 3
#> 15 B NA 4
#> 16 B 1 NA
#> 17 B 2 NA
#> 18 B 3 NA
#> 19 B NA 1
#> 20 B NA 2
Here is a solution using rle:
# Run-length encode the NA mask of the whole value column.
x <- rle(is.na(df$value))
# For every run of NAs, write 1..run-length into the NA positions of `new`;
# rows where value is not NA are left as NA.
# NOTE(review): the runs are computed over the entire column, not per id, so a
# run of NAs spanning two ids would keep counting. It works for this data
# because the first value of id B is not NA.
df$new[is.na(df$value)] <- sequence(x$lengths[x$values])
# A tibble: 20 x 3
id value new
<chr> <dbl> <int>
1 A 1 NA
2 A 2 NA
3 A 3 NA
4 A NA 1
5 A NA 2
6 A 1 NA
7 A 2 NA
8 A NA 1
9 A NA 2
10 A NA 3
11 B 1 NA
12 B NA 1
13 B NA 2
14 B NA 3
15 B NA 4
16 B 1 NA
17 B 2 NA
18 B 3 NA
19 B NA 1
20 B NA 2
Yet another solution:
library(tidyverse)
df %>%
mutate(aux =data.table::rleid(value)) %>%
group_by(id, aux) %>%
mutate(eout = ifelse(is.na(value), row_number(), NA_real_)) %>%
ungroup %>% select(-aux)
#> # A tibble: 20 × 4
#> id value expected_output eout
#> <chr> <dbl> <int> <dbl>
#> 1 A 1 NA NA
#> 2 A 2 NA NA
#> 3 A 3 NA NA
#> 4 A NA 1 1
#> 5 A NA 2 2
#> 6 A 1 NA NA
#> 7 A 2 NA NA
#> 8 A NA 1 1
#> 9 A NA 2 2
#> 10 A NA 3 3
#> 11 B 1 NA NA
#> 12 B NA 1 1
#> 13 B NA 2 2
#> 14 B NA 3 3
#> 15 B NA 4 4
#> 16 B 1 NA NA
#> 17 B 2 NA NA
#> 18 B 3 NA NA
#> 19 B NA 1 1
#> 20 B NA 2 2

Merging datasets by id and maintain one row for each id

I am attempting to merge 2 datasets belonging to a single id with a larger dataset.
However, I am having trouble merging the two single row datasets into a single row within the larger dataset.
Is there a simple way to merge with dplyr and only overwrite values if they are NA's?
My data:
df1 <- data.frame(id=1:5, b=6:10, c=c("a", "b", "c", "d", "e"), d=c(NA, 1,2,3, 4))
df2 <- data.frame(id=6, b=2, c="f", d=NA_real_)
df3 <- data.frame(id=6, b=NA_real_, c=NA_character_, d=5, e="a")
> df1
id b c d
1 1 6 a NA
2 2 7 b 1
3 3 8 c 2
4 4 9 d 3
5 5 10 e 4
> df2
id b c d
1 6 2 f NA
> df3
id b c d e
1 6 NA <NA> 5 a
My attempt:
merge1 <- dplyr::full_join(df1, df2) %>% full_join(df3)
Desired output:
output <- data.frame(id=1:6, b=c(6:10,2), c=c("a", "b", "c", "d", "e", "f"), d=c(NA, 1,2,3, 4, 5), e=c(NA,NA, NA, NA, NA, "a"))
> output
id b c d e
1 1 6 a NA <NA>
2 2 7 b 1 <NA>
3 3 8 c 2 <NA>
4 4 9 d 3 <NA>
5 5 10 e 4 <NA>
6 6 2 f 5 a
As opposed to:
id b c d e
1 1 6 a NA <NA>
2 2 7 b 1 <NA>
3 3 8 c 2 <NA>
4 4 9 d 3 <NA>
5 5 10 e 4 <NA>
6 6 2 f NA <NA>
7 6 NA <NA> 5 a
Thank you
You can try:
list(df1, df2, df3) %>%
bind_rows() %>%
group_by(id) %>%
summarise_all(~ first(na.omit(.)))
id b c d e
<dbl> <dbl> <chr> <dbl> <fct>
1 1 6 a NA <NA>
2 2 7 b 1 <NA>
3 3 8 c 2 <NA>
4 4 9 d 3 <NA>
5 5 10 e 4 <NA>
6 6 2 f 5 a
you can try
library(tidyverse)
df1 %>%
mutate_if(is.factor, as.character) %>%
bind_rows(mutate_if(df2, is.factor, as.character)) %>%
left_join(select(df3, id, d, e), by = "id") %>%
mutate(d= ifelse(is.na(d.x), d.y, d.x)) %>%
select(-d.x, -d.y)

Filtering data relative to first and last occurrence of an event

I have a dataframe of an experiment, where stimulus is shown to participants, and time is measured continuously.
# reprex
df <-
tibble(stim = c(NA, NA, NA, NA, "a", "b", NA, "c", NA, "d", NA, NA, NA),
time = 0:12)
# A tibble: 13 x 2
stim time
<chr> <int>
1 NA 0
2 NA 1
3 NA 2
4 NA 3
5 a 4
6 b 5
7 NA 6
8 c 7
9 NA 8
10 d 9
11 NA 10
12 NA 11
13 NA 12
I want to create a generalized solution, using tidyverse functions to drop the data 1 second before and 2 seconds after the first and last marker, respectively. Using tidyverse, I thought this will work, but it throws an uninformative error.
df %>%
# store times for first and last stim
mutate(first_stim = drop_na(stim) %>% pull(time) %>% first(),
last_stim = drop_na(stim) %>% pull(time) %>% last()) %>%
# filter df based on new variables
filter(time >= first(first_stim) - 1 &
time <= first(last_stim) + 2)
Error in mutate_impl(.data, dots) : bad value
So I made a pretty ugly base r code to overcome this issue by changing the mutate:
df2 <- df %>%
mutate(first_stim = .[!is.na(.$stim), "time"][1,1],
last_stim = .[!is.na(.$stim), "time"][nrow(.[!is.na(.$stim), "time"]), 1])
# A tibble: 13 x 4
stim time first_stim last_stim
<chr> <int> <tibble> <tibble>
1 NA 0 4 9
2 NA 1 4 9
3 NA 2 4 9
4 NA 3 4 9
5 a 4 4 9
6 b 5 4 9
7 NA 6 4 9
8 c 7 4 9
9 NA 8 4 9
10 d 9 4 9
11 NA 10 4 9
12 NA 11 4 9
13 NA 12 4 9
Now I would only need to filter based on the new variables first_stim - 1 and last_stim + 2. But filter fails too:
df2 %>%
filter(time >= first(first_stim) - 1 &
time <= first(last_stim) + 2)
Error in filter_impl(.data, quo) :
Not compatible with STRSXP: [type=NULL].
I was able to do it in base R, but it is really ugly:
df2[(df2$time >= (df2[[1, "first_stim"]] - 1)) &
(df2$time <= (df2[[1, "last_stim"]] + 2))
,]
The desired output should look like this:
# A tibble: 13 x 2
stim time
<chr> <int>
4 NA 3
5 a 4
6 b 5
7 NA 6
8 c 7
9 NA 8
10 d 9
11 NA 10
12 NA 11
I believe that the errors are related to dplyr::nth() and related functions. And I've found some old issues that are related to this behavior, but should no longer exist https://github.com/tidyverse/dplyr/issues/1980
I would really appreciate if someone could highlight what is the problem, and how to do this in a tidy way.
You could use a combination of is.na and which...
library(dplyr)
df <-
tibble(stim = c(NA, NA, NA, NA, "a", "b", NA, "c", NA, "d", NA, NA, NA),
time = 0:12)
df %>%
filter(row_number() >= first(which(!is.na(stim))) - 1 &
row_number() <= last(which(!is.na(stim))) + 2)
# # A tibble: 9 x 2
# stim time
# <chr> <int>
# 1 NA 3
# 2 a 4
# 3 b 5
# 4 NA 6
# 5 c 7
# 6 NA 8
# 7 d 9
# 8 NA 10
# 9 NA 11
you could also make your first attempt work with a little modification...
df %>%
mutate(first_stim = first(drop_na(., stim) %>% pull(time)),
last_stim = last(drop_na(., stim) %>% pull(time))) %>%
filter(time >= first(first_stim) - 1 &
time <= first(last_stim) + 2)
We can create a cumulative sum of non-NA values and then find the row indices where we encounter the first non-NA value and the last one. We then select rows based on the requirement (-1 from the start and +2 from the end).
library(tidyverse)
df %>%
mutate(count_cumsum = cumsum(!is.na(stim))) %>%
slice((which.max(count_cumsum == 1) -1):(which.max(count_cumsum) + 2)) %>%
select(-count_cumsum)
# stim time
# <chr> <int>
#1 NA 3
#2 a 4
#3 b 5
#4 NA 6
#5 c 7
#6 NA 8
#7 d 9
#8 NA 10
#9 NA 11
Just to give an idea how count_cumsum looks:
df %>%
mutate(count_cumsum = cumsum(!is.na(stim)))
# A tibble: 13 x 3
# stim time count_cumsum
# <chr> <int> <int>
#1 NA 0 0
#2 NA 1 0
#3 NA 2 0
#4 NA 3 0
#5 a 4 1
#6 b 5 2
#7 NA 6 2
#8 c 7 3
#9 NA 8 3
#10 d 9 4
#11 NA 10 4
#12 NA 11 4
#13 NA 12 4

Resources