Replace duplicated values with NA by group - r

I have a data frame like so:
subject <- c(1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5)
day <- c(20, 20, 20 , 20, 20, 40 , 40 , 40 , 40 , 50, 50, 50, 40, 40, 40, 40, 20, 20)
ex <- data.frame(subject, day)
Within each subject, I want to change duplicate 'day' to NA:
subject day
1 1 20
2 1 NA
3 1 NA
4 1 NA
5 1 NA
6 2 40
7 2 NA
8 2 NA
9 2 NA
10 3 50
11 3 NA
12 3 NA
13 4 40
14 4 NA
15 4 NA
16 4 NA
17 5 20
18 5 NA

library(dplyr)
ex %>%
group_by(subject) %>%
mutate(day = ifelse(duplicated(day), NA, day)) %>%
ungroup()
# # A tibble: 18 × 2
# subject day
# <dbl> <dbl>
# 1 1 20
# 2 1 NA
# 3 1 NA
# 4 1 NA
# 5 1 NA
# 6 2 40
# 7 2 NA
# 8 2 NA
# 9 2 NA
# 10 3 50
# 11 3 NA
# 12 3 NA
# 13 4 40
# 14 4 NA
# 15 4 NA
# 16 4 NA
# 17 5 20
# 18 5 NA

library(dplyr)
ex %>%
group_by(subject) %>%
mutate(day = ifelse(row_number()==1, day, NA_real_)) %>%
ungroup()
subject day
<dbl> <dbl>
1 1 20
2 1 NA
3 1 NA
4 1 NA
5 1 NA
6 2 40
7 2 NA
8 2 NA
9 2 NA
10 3 50
11 3 NA
12 3 NA
13 4 40
14 4 NA
15 4 NA
16 4 NA
17 5 20
18 5 NA

We may use
# Blank out `day` on every row that repeats an earlier (subject, day) pair.
# duplicated() on a data frame compares whole rows, so the first occurrence
# within each subject is kept; assigning NA in place preserves the column type.
ex$day[duplicated(ex)] <- NA
-output
> ex
subject day
1 1 20
2 1 NA
3 1 NA
4 1 NA
5 1 NA
6 2 40
7 2 NA
8 2 NA
9 2 NA
10 3 50
11 3 NA
12 3 NA
13 4 40
14 4 NA
15 4 NA
16 4 NA
17 5 20
18 5 NA

Related

Fill missing values (NA) before the first non-NA value by group

I have a data frame grouped by 'id' and a variable 'age' which contains missing values, NA.
Within each 'id', I want to replace missing values of 'age', but only "fill up" before the first non-NA value.
data <- data.frame(id=c(1,1,1,1,1,1,2,2,2,2,2,3,3,3,3,3),
age=c(NA,6,NA,8,NA,NA,NA,NA,3,8,NA,NA,NA,7,NA,9))
id age
1 1 NA
2 1 6 # first non-NA in id = 1. Fill up from here
3 1 NA
4 1 8
5 1 NA
6 1 NA
7 2 NA
8 2 NA
9 2 3 # first non-NA in id = 2. Fill up from here
10 2 8
11 2 NA
12 3 NA
13 3 NA
14 3 7 # first non-NA in id = 3. Fill up from here
15 3 NA
16 3 9
Expected output:
1 1 6
2 1 6
3 1 NA
4 1 8
5 1 NA
6 1 NA
7 2 3
8 2 3
9 2 3
10 2 8
11 2 NA
12 3 7
13 3 7
14 3 7
15 3 NA
16 3 9
I tried using fill with .direction = "up" like this:
library(dplyr)
library(tidyr)
data1 <- data %>% group_by(id) %>%
fill(!is.na(age[1]), .direction = "up")
You could use cumall(is.na(age)) to find the positions before the first non-NA value.
library(dplyr)
data %>%
group_by(id) %>%
mutate(age2 = replace(age, cumall(is.na(age)), age[!is.na(age)][1])) %>%
ungroup()
# A tibble: 16 × 3
id age age2
<dbl> <dbl> <dbl>
1 1 NA 6
2 1 6 6
3 1 NA NA
4 1 8 8
5 1 NA NA
6 1 NA NA
7 2 NA 3
8 2 NA 3
9 2 3 3
10 2 8 8
11 2 NA NA
12 3 NA 7
13 3 NA 7
14 3 7 7
15 3 NA NA
16 3 9 9
Another option (agnostic about where the missing and non-missing values start) could be:
data %>%
group_by(id) %>%
mutate(rleid = with(rle(is.na(age)), rep(seq_along(lengths), lengths)),
age2 = ifelse(rleid == min(rleid[is.na(age)]),
age[rleid == (min(rleid[is.na(age)]) + 1)][1],
age))
id age rleid age2
<dbl> <dbl> <int> <dbl>
1 1 NA 1 6
2 1 6 2 6
3 1 NA 3 NA
4 1 8 4 8
5 1 NA 5 NA
6 1 NA 5 NA
7 2 NA 1 3
8 2 NA 1 3
9 2 3 2 3
10 2 8 2 8
11 2 NA 3 NA
12 3 NA 1 7
13 3 NA 1 7
14 3 7 2 7
15 3 NA 3 NA
16 3 9 4 9

Create run-length ID for subset of values

In this type of dataframe:
df <- data.frame(
x = c(3,3,1,12,2,2,10,10,10,1,5,5,2,2,17,17)
)
how can I create a new column recording the run-length ID of only a subset of x values, say, 3-20?
My own attempt only succeeds at inserting NA where the run-length count should be interrupted; but internally it seems the count is uninterrupted:
library(data.table)
df %>%
mutate(rle = ifelse(x %in% 3:20, rleid(x), NA))
x rle
1 3 1
2 3 1
3 1 NA
4 12 3
5 2 NA
6 2 NA
7 10 5
8 10 5
9 10 5
10 1 NA
11 5 7
12 5 7
13 2 NA
14 2 NA
15 17 9
16 17 9
The expected result:
x rle
1 3 1
2 3 1
3 1 NA
4 12 2
5 2 NA
6 2 NA
7 10 3
8 10 3
9 10 3
10 1 NA
11 5 4
12 5 4
13 2 NA
14 2 NA
15 17 5
16 17 5
In base R:
# Number the runs among the in-range values only, then write the IDs back to
# those rows; rows whose x is outside 3-20 are left as NA in the new column.
in_range <- df$x %in% 3:20
runs <- rle(df$x[in_range])
df[in_range, "rle"] <- rep(seq_along(runs$lengths), runs$lengths)
x rle
1 3 1
2 3 1
3 1 NA
4 12 2
5 2 NA
6 2 NA
7 10 3
8 10 3
9 10 3
10 1 NA
11 5 4
12 5 4
13 2 NA
14 2 NA
15 17 5
16 17 5
With left_join:
# Look-up-table approach: give each distinct in-range value an ID in order of
# first appearance, then join the IDs back onto df; out-of-range rows get NA.
# NOTE(review): distinct() yields one ID per value, so this equals a
# run-length ID only if no in-range value recurs in a later, separate run.
left_join(df, df %>%
filter(x %in% 3:20) %>%
distinct() %>%
mutate(rle = row_number()))
Joining, by = "x"
x rle
1 3 1
2 3 1
3 1 NA
4 12 2
5 2 NA
6 2 NA
7 10 3
8 10 3
9 10 3
10 1 NA
11 5 4
12 5 4
13 2 NA
14 2 NA
15 17 5
16 17 5
With data.table:
library(data.table)
setDT(df)
df[x %between% c(3,20),rle:=rleid(x)][]
x rle
<num> <int>
1: 3 1
2: 3 1
3: 1 NA
4: 12 2
5: 2 NA
6: 2 NA
7: 10 3
8: 10 3
9: 10 3
10: 1 NA
11: 5 4
12: 5 4
13: 2 NA
14: 2 NA
15: 17 5
16: 17 5

R apply function to groups within data frame adding result as additional column

Here is the code for my example dataset.
df = data.frame("group" =c(rep(1,5),rep(1,6),rep(2,4),rep(2,3)), "time" = c(rep(NA,5),seq(1,6),rep(NA,4),seq(1,3)), "p" = seq(1,18) )
group time p
1 1 NA 1
2 1 NA 2
3 1 NA 3
4 1 NA 4
5 1 NA 5
6 1 1 6
7 1 2 7
8 1 3 8
9 1 4 9
10 1 5 10
11 1 6 11
12 2 NA 12
13 2 NA 13
14 2 NA 14
15 2 NA 15
16 2 1 16
17 2 2 17
18 2 3 18
I would like to figure out how to apply a function by group to only the values that have time then append the result as a new column in the data frame. Here is my example function I would like to apply.
# Shift every element of `p` upwards by a constant 5 (works on whole vectors).
pfunc <- function(p) {
  shifted <- p + 5
  shifted
}
The output I am hoping to obtain would look as follows.
group time p new_p
1 1 NA 1 NA
2 1 NA 2 NA
3 1 NA 3 NA
4 1 NA 4 NA
5 1 NA 5 NA
6 1 1 6 11
7 1 2 7 12
8 1 3 8 13
9 1 4 9 14
10 1 5 10 15
11 1 6 11 16
12 2 NA 12 NA
13 2 NA 13 NA
14 2 NA 14 NA
15 2 NA 15 NA
16 2 1 16 21
17 2 2 17 22
18 2 3 18 23
You can try this:
library(dplyr)
# Keep NA where `time` is missing, otherwise add 5.
# NOTE(review): the offset is applied to `time`, not to `p`, so `pnew` is
# time + 5 rather than the p + 5 shown in the expected output.
df %>% group_by(group) %>%
mutate(pnew=ifelse(is.na(time),time,time+5))
# A tibble: 18 x 4
# Groups: group [2]
group time p pnew
<dbl> <int> <int> <dbl>
1 1 NA 1 NA
2 1 NA 2 NA
3 1 NA 3 NA
4 1 NA 4 NA
5 1 NA 5 NA
6 1 1 6 6
7 1 2 7 7
8 1 3 8 8
9 1 4 9 9
10 1 5 10 10
11 1 6 11 11
12 2 NA 12 NA
13 2 NA 13 NA
14 2 NA 14 NA
15 2 NA 15 NA
16 2 1 16 6
17 2 2 17 7
18 2 3 18 8
Update
You can use this function:
# Add a column `pnew` to `data`: `time + n` where `time` is present, NA where
# it is missing. The result is returned still grouped by `group`.
increase <- function(data, n) {
  grouped <- group_by(data, group)
  mutate(grouped, pnew = ifelse(is.na(time), time, time + n))
}
increase(df,n = 10)
# A tibble: 18 x 4
# Groups: group [2]
group time p pnew
<dbl> <int> <int> <dbl>
1 1 NA 1 NA
2 1 NA 2 NA
3 1 NA 3 NA
4 1 NA 4 NA
5 1 NA 5 NA
6 1 1 6 11
7 1 2 7 12
8 1 3 8 13
9 1 4 9 14
10 1 5 10 15
11 1 6 11 16
12 2 NA 12 NA
13 2 NA 13 NA
14 2 NA 14 NA
15 2 NA 15 NA
16 2 1 16 11
17 2 2 17 12
18 2 3 18 13
Update 2
I hope this helps:
# Apply pfunc() to `time` one row at a time, leaving NA where `time` is missing.
# NOTE(review): pfunc() is given `time`, not `p`, so `pnew` is time + 5; and as
# pfunc() is already vectorised (p + 5), rowwise() does not look necessary here.
df %>% group_by(group) %>% rowwise() %>% mutate(pnew=ifelse(is.na(time),NA,pfunc(time)))
# A tibble: 18 x 4
# Rowwise: group
group time p pnew
<dbl> <int> <int> <dbl>
1 1 NA 1 NA
2 1 NA 2 NA
3 1 NA 3 NA
4 1 NA 4 NA
5 1 NA 5 NA
6 1 1 6 6
7 1 2 7 7
8 1 3 8 8
9 1 4 9 9
10 1 5 10 10
11 1 6 11 11
12 2 NA 12 NA
13 2 NA 13 NA
14 2 NA 14 NA
15 2 NA 15 NA
16 2 1 16 6
17 2 2 17 7
18 2 3 18 8

Fill subset of rows with values from row above

I have a long format dataset with longitudinal data and for one variable I want to fill in the missings in timepoint 0 with the values in timepoint 1, but I do not want to fill in the missings from timepoint 1 with values from timepoint 2 and so on.
My dataset is ordered by id and timepoint.
I have used the fill function successfully in cases where I just needed to fill missing values at all timepoints for a specific id.
Example dataframe:
df <- data.frame(id=c(1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4),
timepoint=c(0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3),
var1=c(NA,9,8,10, NA, 10, NA, 12, NA, NA, 12, 11, NA, 12, 12, NA))
> df
id timepoint var1
1 1 0 NA
2 1 1 9
3 1 2 8
4 1 3 10
5 2 0 NA
6 2 1 10
7 2 2 NA
8 2 3 12
9 3 0 NA
10 3 1 NA
11 3 2 12
12 3 3 11
13 4 0 NA
14 4 1 12
15 4 2 12
16 4 3 NA
This is what works when I just need to fill any missing no matter the timepoint:
library(dplyr)
library(tidyr)
df <- df %>%
group_by(id) %>%
fill(`var9`:`var12`, .direction = "up") %>%
as.data.frame
But now I have trouble specifying to only fill in the missings in rows at timepoint 0. Any help is appreciated.
My expected output:
> df
id timepoint var1
1 1 0 9
2 1 1 9
3 1 2 8
4 1 3 10
5 2 0 10
6 2 1 10
7 2 2 NA
8 2 3 12
9 3 0 NA
10 3 1 NA
11 3 2 12
12 3 3 11
13 4 0 12
14 4 1 12
15 4 2 12
16 4 3 NA
This might be an oversimplification, but you can just call the fill function again, but this time with direction down. Then your entire data frame will be complete.
df <- data.frame(id=c(1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4),
timepoint=c(0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3),
var1=c(NA,9,8,10, NA, 10, NA, 12, NA, NA, 12, 11, NA, 12, 12, NA))
In this case I will use an ifelse statement followed by the lead function.
library(dplyr); library(tidyr);
df %>%
group_by(id) %>%
mutate(var1 = ifelse(is.na(var1) & timepoint == 0,
lead(var1, 1), var1))
Yields:
# A tibble: 16 x 3
# Groups: id [4]
id timepoint var1
<dbl> <dbl> <dbl>
1 1 0 9
2 1 1 9
3 1 2 8
4 1 3 10
5 2 0 10
6 2 1 10
7 2 2 NA
8 2 3 12
9 3 0 NA
10 3 1 NA
11 3 2 12
12 3 3 11
13 4 0 12
14 4 1 12
15 4 2 12
16 4 3 NA
We can group_by id and use replace to fill the values where timepoint == 0 and var1 is NA with the value of var1 at timepoint == 1 in the same group.
library(dplyr)
df %>%
group_by(id) %>%
mutate(var2 = replace(var1, timepoint == 0 & is.na(var1), var1[timepoint == 1]))
# id timepoint var1 var2
# <dbl> <dbl> <dbl> <dbl>
# 1 1 0 NA 9
# 2 1 1 9 9
# 3 1 2 8 8
# 4 1 3 10 10
# 5 2 0 NA 10
# 6 2 1 10 10
# 7 2 2 NA NA
# 8 2 3 12 12
# 9 3 0 NA NA
#10 3 1 NA NA
#11 3 2 12 12
#12 3 3 11 11
#13 4 0 NA 12
#14 4 1 12 12
#15 4 2 12 12
#16 4 3 NA NA

R merge() follow up conditional replacement of non-unique entries

I would greatly appreciate a solution to defined problem below; I think it's a very difficult one.
I join 2 data.frames t1 and t2 using merge(). In the resulting data.frame, which I named "testing", I want to replace the entries of the non-unique rows originating from t1 with "NA" so that only unique rows remain that have the closest distance to t2. The condition is:
min(sqrt((xCor.y - xCor.x)^2 + (yCor.y - yCor.x)^2))
# scroll to end for result I am looking for
This is meant for ~1GB data set, so I have to avoid looping through all data.
# Objects tracked at time 1. Coordinates are random draws and no seed is set,
# so the values differ between runs.
t1 <- data.frame(trackLabel = c(1, 2, 3, 4, 4, 5, 5, 7, 7, 7),
                 objNumber = 1:10,
                 parentObjNumber = rep(0, 10),
                 time = rep(1, 10),
                 xCor = runif(10),
                 yCor = runif(10))
# Objects tracked at time 2; parentObjNumber is matched against t1's objNumber
# in the merge below.
t2 <- data.frame(trackLabel = c(1, 2, 2, 4, 4, 4, 6, 7, 7, 7, 7),
                 objNumber = 11:21,
                 parentObjNumber = c(1, 2, 2, 4, 4, 4, 7, 8, 9, 9, 9),
                 time = rep(2, 11),
                 xCor = runif(11),
                 yCor = runif(11))
# Full outer join of every t2 object onto its parent object in t1.
# `incomparables` is intended for merging on a single column only (see ?merge)
# and the keys here contain no NA, so it is left at its default.
testing <- merge(t1, t2,
                 by.x = c("trackLabel", "objNumber"),
                 by.y = c("trackLabel", "parentObjNumber"),
                 all = TRUE)
# Output seen when this was originally run (newer R versions may instead
# suffix the clashing column rather than warn -- TODO confirm):
#Warning message:
#In merge.data.frame(t1, t2, by.x = c("trackLabel", "objNumber"), :
# column name ‘objNumber’ is duplicated in the result
# Rename the key column (the first one called "objNumber") so it cannot be
# confused with the object number coming from t2.
ind <- colnames(testing) == "objNumber"
colnames(testing)[which(ind)[1L]] <- "objNumber1"
> t1
trackLabel objNumber parentObjNumber time xCor yCor
1 1 1 0 1 0.25852366 0.360631607
2 2 2 0 1 0.69987607 0.048360258
3 3 3 0 1 0.23047883 0.414221880
4 4 4 0 1 0.58169548 0.718223111
5 4 5 0 1 0.61419336 0.435153774
6 5 6 0 1 0.50028765 0.735970291
7 5 7 0 1 0.41380332 0.097256739
8 7 8 0 1 0.57563080 0.828142024
9 7 9 0 1 0.39512092 0.728903233
10 7 10 0 1 0.16675690 0.284307824
> t2
trackLabel objNumber parentObjNumber time xCor yCor
1 1 11 1 2 0.473735625 0.454637752
2 2 12 2 2 0.623971860 0.517089522
3 2 13 2 2 0.470885840 0.703872484
4 4 14 4 2 0.188280842 0.678683831
5 4 15 4 2 0.198772198 0.160836676
6 4 16 4 2 0.251950005 0.958747183
7 6 17 7 2 0.545521560 0.005505346
8 7 18 8 2 0.477450908 0.819060935
9 7 19 9 2 0.509430458 0.997968108
10 7 20 9 2 0.027918865 0.138014769
11 7 21 9 2 0.568532497 0.911921770
> testing
trackLabel objNumber1 parentObjNumber time.x xCor.x yCor.x objNumber time.y xCor.y yCor.y
1 1 1 0 1 0.25852366 0.360631607 11 2 0.473735625 0.454637752
2 2 2 0 1 0.69987607 0.048360258 12 2 0.623971860 0.517089522
3 2 2 0 1 0.69987607 0.048360258 13 2 0.470885840 0.703872484
4 3 3 0 1 0.23047883 0.414221880 NA NA NA NA
5 4 4 0 1 0.58169548 0.718223111 14 2 0.188280842 0.678683831
6 4 4 0 1 0.58169548 0.718223111 15 2 0.198772198 0.160836676
7 4 4 0 1 0.58169548 0.718223111 16 2 0.251950005 0.958747183
8 4 5 0 1 0.61419336 0.435153774 NA NA NA NA
9 5 6 0 1 0.50028765 0.735970291 NA NA NA NA
10 5 7 0 1 0.41380332 0.097256739 NA NA NA NA
11 6 7 NA NA NA NA 17 2 0.545521560 0.005505346
12 7 8 0 1 0.57563080 0.828142024 18 2 0.477450908 0.819060935
13 7 9 0 1 0.39512092 0.728903233 19 2 0.509430458 0.997968108
14 7 9 0 1 0.39512092 0.728903233 20 2 0.027918865 0.138014769
15 7 9 0 1 0.39512092 0.728903233 21 2 0.568532497 0.911921770
16 7 10 0 1 0.16675690 0.284307824 NA NA NA NA
# and here is what I want to achieve:
> testing[c(3, 6, 7, 13, 14 ), 1:6] <-NA
> testing
trackLabel objNumber1 parentObjNumber time.x xCor.x yCor.x objNumber time.y xCor.y yCor.y
1 1 1 0 1 0.25852366 0.360631607 11 2 0.473735625 0.454637752
2 2 2 0 1 0.69987607 0.048360258 12 2 0.623971860 0.517089522
3 NA NA NA NA NA NA 13 2 0.470885840 0.703872484
4 3 3 0 1 0.23047883 0.414221880 NA NA NA NA
5 4 4 0 1 0.58169548 0.718223111 14 2 0.188280842 0.678683831
6 NA NA NA NA NA NA 15 2 0.198772198 0.160836676
7 NA NA NA NA NA NA 16 2 0.251950005 0.958747183
8 4 5 0 1 0.61419336 0.435153774 NA NA NA NA
9 5 6 0 1 0.50028765 0.735970291 NA NA NA NA
10 5 7 0 1 0.41380332 0.097256739 NA NA NA NA
11 6 7 NA NA NA NA 17 2 0.545521560 0.005505346
12 7 8 0 1 0.57563080 0.828142024 18 2 0.477450908 0.819060935
13 NA NA NA NA NA NA 19 2 0.509430458 0.997968108
14 NA NA NA NA NA NA 20 2 0.027918865 0.138014769
15 7 9 0 1 0.39512092 0.728903233 21 2 0.568532497 0.911921770
16 7 10 0 1 0.16675690 0.284307824 NA NA NA NA

Resources