Randomly select rows in R using sample_n - r

# Example data: 12 patients (id), the day on which each reached the endpoint,
# and the number of patients to be selected for that day (endpoint).
df <- data.frame(
  id = 1:12,
  day = rep(c(1, 2, 3), each = 4),
  endpoint = rep(c(1, 2, 1), each = 4)
)
df
#> id day endpoint
#> 1 1 1 1
#> 2 2 1 1
#> 3 3 1 1
#> 4 4 1 1
#> 5 5 2 2
#> 6 6 2 2
#> 7 7 2 2
#> 8 8 2 2
#> 9 9 3 1
#> 10 10 3 1
#> 11 11 3 1
#> 12 12 3 1
In the above data, some patients (id) reached the endpoint each day. I am trying to randomly select the endpoint number of patients and mark them with s = 1. For each day, ids from that day and from previous days are eligible as long as they were not previously selected. The following code gets what I expected, but I have to manually enter the day and endpoint values. Any suggestions on how to pick those values directly from the data would be appreciated.
library(dplyr)
# s flags selected patients: 1 once a patient has been drawn, 0 otherwise.
df$s <- 0
# Day 1: draw 1 patient among those with day <= 1 that are still unselected.
df$s[
  df$id %in% sample_n(df[df$day <= 1 & df$s == 0, ], 1)$id
] <- 1
# Day 2: draw 2 patients among those with day <= 2 that are still unselected.
df$s[
  df$id %in% sample_n(df[df$day <= 2 & df$s == 0, ], 2)$id
] <- 1
# Day 3: draw 1 patient among those with day <= 3 that are still unselected.
df$s[
  df$id %in% sample_n(df[df$day <= 3 & df$s == 0, ], 1)$id
] <- 1
df
#> id day endpoint s pick_day
#> 1 1 1 1 0 0
#> 2 2 1 1 1 2
#> 3 3 1 1 1 1
#> 4 4 1 1 1 3
#> 5 5 2 2 1 2
#> 6 6 2 2 0 0
#> 7 7 2 2 0 0
#> 8 8 2 2 0 0
#> 9 9 3 1 0 0
#> 10 10 3 1 0 0
#> 11 11 3 1 0 0
#> 12 12 3 1 0 0
EDIT
Is it possible to add a variable to show the day for which a row was picked, like the above variable pick_day? Thanks.

A way in base R using for loop :
# Draw patients day by day. On each day, every patient with day <= the current
# day who has not been picked yet is eligible, and the number to draw is read
# from that day's `endpoint` value instead of being typed in by hand.
df$s <- 0
set.seed(123) # fixed seed so the draw is reproducible
for (i in unique(df$day)) {
  eligible <- df$id[df$day <= i & df$s == 0]
  # Never ask for more patients than are still eligible.
  n_pick <- min(df$endpoint[df$day == i][1], length(eligible))
  # Index with sample.int() rather than calling sample(eligible, n_pick):
  # when a single numeric id is left, sample() would draw from 1:id and could
  # return an id that is not eligible.
  ids <- eligible[sample.int(length(eligible), n_pick)]
  df$s[df$id %in% ids] <- 1
}
df
# id day endpoint s
#1 1 1 1 0
#2 2 1 1 0
#3 3 1 1 1
#4 4 1 1 1
#5 5 2 2 1
#6 6 2 2 0
#7 7 2 2 0
#8 8 2 2 1
#9 9 3 1 0
#10 10 3 1 0
#11 11 3 1 0
#12 12 3 1 0

Related

R Count Unique By Group in DPLYR

# Example data; WANT1 and WANT2 hold the desired results per TRIMESTER.
HAVE <- data.frame(
  TRIMESTER = rep(c(1, 2, 3, 4), times = c(5, 5, 4, 6)),
  STUDENT = c(1, 2, 3, 3, 4, 2, 5, 6, 7, 1, 2, 2, 2, 2, 2, 1, 2, 3, 4, 5)
)
HAVE$WANT1 <- rep(c(4, 5, 1, 5), times = c(5, 5, 4, 6))
HAVE$WANT2 <- rep(c(0, 1, 0, 1), times = c(5, 5, 4, 6))
I have HAVE and wish to append two columns. WANT1 counts the unique values of STUDENT within every TRIMESTER. WANT2 is the number of times STUDENT == 5 appears within every TRIMESTER: STUDENT == 5 appears zero times in TRIMESTER == 1, so the value for all rows with TRIMESTER == 1 is 0, but student 5 appears once in TRIMESTER == 4, so the value there is 1.
After grouping by 'TRIMESTER', get the count of distinct elements of 'STUDENT' with n_distinct and the count of STUDENT 5 with sum on a logical expression
library(dplyr)
# Per TRIMESTER: the number of distinct students, and how many rows belong to
# student 5. mutate() keeps every original row.
by_trimester <- group_by(HAVE, TRIMESTER)
ungroup(
  mutate(
    by_trimester,
    WANT1new = n_distinct(STUDENT),
    WANT2NEW = sum(STUDENT == 5)
  )
)
-output
# A tibble: 20 × 6
TRIMESTER STUDENT WANT1 WANT2 WANT1new WANT2NEW
<dbl> <dbl> <dbl> <dbl> <int> <int>
1 1 1 4 0 4 0
2 1 2 4 0 4 0
3 1 3 4 0 4 0
4 1 3 4 0 4 0
5 1 4 4 0 4 0
6 2 2 5 1 5 1
7 2 5 5 1 5 1
8 2 6 5 1 5 1
9 2 7 5 1 5 1
10 2 1 5 1 5 1
11 3 2 1 0 1 0
12 3 2 1 0 1 0
13 3 2 1 0 1 0
14 3 2 1 0 1 0
15 4 2 5 1 5 1
16 4 1 5 1 5 1
17 4 2 5 1 5 1
18 4 3 5 1 5 1
19 4 4 5 1 5 1
20 4 5 5 1 5 1
The code below should produce the desired result.
library(dplyr)
# Overwrite WANT1/WANT2 with values computed per TRIMESTER.
HAVE %>%
  group_by(TRIMESTER) %>%
  mutate(
    WANT1 = length(unique(STUDENT)),
    # Count the rows for student 5; any() would only give a 0/1 indicator and
    # under-report a student who appears more than once in a trimester.
    WANT2 = sum(STUDENT == 5)
  ) %>%
  ungroup() # drop the grouping so later steps are not silently grouped

Use R to find values for which a condition is first met

Consider the following sample dataset. Id is an individual identifier.
set.seed(1)
n <- 100
X <- rbinom(n, 1, 0.5) # binary covariate
j <- rep(1:n)
dat <- data.frame(id = 1:n, X)
ntp <- rep(4, n)

# Build one block of (time1, time2, id) rows per individual and bind them all
# once at the end, instead of growing a matrix with rbind() inside a loop.
blocks <- lapply(seq_along(ntp), function(m) {
  l <- ntp[m]
  ft <- seq(from = 2, to = 8, length.out = l)
  # ft <- seq(from = 1, to = 9, length.out = l)
  # Bounds 0, ft[1], ft[1], ..., ft[l], ft[l], 10: filled row-wise into two
  # columns, each matrix row is one consecutive (time1, time2) interval.
  bounds <- c(0, rep(ft, each = 2), 10)
  cbind(matrix(bounds, ncol = 2, nrow = l + 1, byrow = TRUE), m)
})
d <- data.frame(do.call(rbind, blocks))
colnames(d) <- c("time1", "time2", "id")
D <- round(merge(d, dat, by = "id"), 2) # merging dataset
nr <- nrow(D)
D$Survival_time <- round(rexp(nr, 0.1) + 1, 3)
head(D, 15)
id time1 time2 X Survival_time
1 1 0 2 0 21.341
2 1 2 4 0 18.987
3 1 4 6 0 4.740
4 1 6 8 0 13.296
5 1 8 10 0 6.397
6 2 0 2 0 10.566
7 2 2 4 0 2.470
8 2 4 6 0 14.907
9 2 6 8 0 8.620
10 2 8 10 0 13.376
11 3 0 2 1 45.239
12 3 2 4 1 11.545
13 3 4 6 1 11.352
14 3 6 8 1 19.760
15 3 8 10 1 7.547
How can I obtain the value at which Survival_time is less than time2 for the very first time per individual? I should end up with the following values
id Survival_time
1 4.740
2 2.470
3 7.547
Also, how can I subset the data to stop individualwise when this condition occurs. i.e obtain
id time1 time2 X Survival_time
1 1 0 2 0 21.341
2 1 2 4 0 18.987
3 1 4 6 0 4.740
6 2 0 2 0 10.566
7 2 2 4 0 2.470
11 3 0 2 1 45.239
12 3 2 4 1 11.545
13 3 4 6 1 11.352
14 3 6 8 1 19.760
15 3 8 10 1 7.547
Using data.table
library(data.table)
# For each id, keep the rows up to and including the first one where
# Survival_time < time2.
setDT(D)[,
  .SD[seq_len(.N) <= match(TRUE, Survival_time < time2)],
  by = id
]
-output
id time1 time2 X Survival_time
1: 1 0 2 0 21.341
2: 1 2 4 0 18.987
3: 1 4 6 0 4.740
4: 2 0 2 0 10.566
5: 2 2 4 0 2.470
6: 3 0 2 1 45.239
7: 3 2 4 1 11.545
8: 3 4 6 1 11.352
9: 3 6 8 1 19.760
10: 3 8 10 1 7.547
Slight variation:
library(dplyr)
# Per id, keep only the rows with Survival_time < time2 and then take the
# first of them.
D %>%
  group_by(id) %>%
  filter(Survival_time < time2) %>%
  filter(row_number() == 1) %>%
  ungroup()
You can use -
library(dplyr)
# One row per id: the first Survival_time that is below time2
# (NA when no row of that id satisfies the condition).
D %>%
  group_by(id) %>%
  summarise(
    Survival_time = Survival_time[which(Survival_time < time2)[1]]
  )
# Also using which.max
# summarise(Survival_time = Survival_time[which.max(Survival_time < time2)])
# id Survival_time
# <int> <dbl>
#1 1 4.74
#2 2 2.47
#3 3 7.55
To select the rows up to that point, you may use -
# Per id, keep every row up to and including the first one where
# Survival_time < time2.
D %>%
  group_by(id) %>%
  filter(seq_len(n()) <= which(Survival_time < time2)[1]) %>%
  ungroup()
# id time1 time2 X Survival_time
# <int> <int> <int> <int> <dbl>
# 1 1 0 2 0 21.3
# 2 1 2 4 0 19.0
# 3 1 4 6 0 4.74
# 4 2 0 2 0 10.6
# 5 2 2 4 0 2.47
# 6 3 0 2 1 45.2
# 7 3 2 4 1 11.5
# 8 3 4 6 1 11.4
# 9 3 6 8 1 19.8
#10 3 8 10 1 7.55

R define a new variable as count starting when condition is met

So I'm trying to add two new variables to my dataframe: a variable named start, which is supposed to be a running count from 0 to whatever number of rows there are for one group, and a second variable named stop, which is practically the same but starts at 1. The count should start once the value of a second variable scores >0. It is further important that the count continues until the last value of the group (so it shouldn't stop if Var1=0 again) and that NAs are ignored in the sense that counting continues.
Consider the following dataset as an example
ID Var1 start stop
1 0
1 1 0 1
1 4 1 2
1 2 2 3
1 NA 3 4
1 4 4 5
2 0
2 0
2 3 0 1
2 5 1 2
2 9 2 3
2 0 3 4
I don´t really care for the values start and stop take on before Var1>0 first, so whether it´s 0 or NA is not important
Thanks very much for the good answers in advance!!
Dirty solution to the problem, will probably work just take out the extra columns that I made as steps with select
library(tidyverse)
df_example <- read_table("ID Var1 start stop
1 0
1 1 0 1
1 4 1 2
1 2 2 3
1 NA 3 4
1 4 4 5
2 0
2 0
2 3 0 1
2 5 1 2
2 9 2 3
2 0 3 4")
df_example %>%
  group_by(ID) %>%
  mutate(
    # 1 where Var1 is above 0 (a missing Var1 counts as above 0), else 0
    greater_1 = as.numeric(replace_na(Var1, 1) > 0),
    # running number of such rows within the ID
    run_sum = cumsum(greater_1),
    # 1 on the rows where that running number equals 1, NA elsewhere
    to_fill = if_else(run_sum == 1, 1, NA_real_)
  ) %>%
  # carry the 1 down to all later rows of the group
  fill(to_fill) %>%
  mutate(
    end2 = cumsum(replace_na(to_fill, 0)),
    star2 = pmax(end2 - 1, 0)
  )
#> # A tibble: 12 x 9
#> # Groups: ID [2]
#> ID Var1 start stop greater_1 run_sum to_fill end2 star2
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 0 NA NA 0 0 NA 0 0
#> 2 1 1 0 1 1 1 1 1 0
#> 3 1 4 1 2 1 2 1 2 1
#> 4 1 2 2 3 1 3 1 3 2
#> 5 1 NA 3 4 1 4 1 4 3
#> 6 1 4 4 5 1 5 1 5 4
#> 7 2 0 NA NA 0 0 NA 0 0
#> 8 2 0 NA NA 0 0 NA 0 0
#> 9 2 3 0 1 1 1 1 1 0
#> 10 2 5 1 2 1 2 1 2 1
#> 11 2 9 2 3 1 3 1 3 2
#> 12 2 0 3 4 0 3 1 4 3
Created on 2020-08-04 by the reprex package (v0.3.0)

Add a count column and count twice if a certain condition is met

I am wondering if there is a way to make a conditional column-count by a group, adding 1 to a row_number or rowid if a certain value is met (in this case 0). For example:
# Example data; `want` is the desired counter per group.
df <- data.frame(
  group = rep(c(1, 2, 3), times = c(4, 5, 5)),
  condition = c(1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1),
  want = c(1, 3, 4, 5, 1, 3, 5, 6, 7, 2, 3, 4, 6, 7)
)
group condition want
1 1 1 1
2 1 0 3
3 1 1 4
4 1 1 5
5 2 1 1
6 2 0 3
7 2 0 5
8 2 1 6
9 2 1 7
10 3 0 2
11 3 1 3
12 3 1 4
13 3 0 6
14 3 1 7
I think this might involve making a row_number per group and then making a customized row_number but I am open to suggestions. It is kind of a work-around method to "break up" my data when a 0 appears.
Using dplyr, for each group of data (group_by(group)) we can add a column which has a counter from 1 to the length of each group (i.e. n()). By adding a cumulative sum of condition == 0, that counter will jump by one more whenever your desired condition is met.
library(dplyr)
# Row position within the group plus the number of zeros seen so far, so the
# counter skips one extra step at every condition == 0.
df1 %>%
  group_by(group) %>%
  mutate(desired = row_number() + cumsum(condition == 0))
Output:
#> # A tibble: 14 x 3
#> # Groups: group [3]
#> group condition desired
#> <dbl> <dbl> <int>
#> 1 1 1 1
#> 2 1 0 3
#> 3 1 1 4
#> 4 1 1 5
#> 5 2 1 1
#> 6 2 0 3
#> 7 2 0 5
#> 8 2 1 6
#> 9 2 1 7
#> 10 3 0 2
#> 11 3 1 3
#> 12 3 1 4
#> 13 3 0 6
#> 14 3 1 7
Data:
df1 <- data.frame(
  group = rep(c(1, 2, 3), times = c(4, 5, 5)),
  condition = c(1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1)
)
You can do:
# Within each group, accumulate 2 for every 0 and the value itself otherwise.
transform(
  df,
  want = ave(condition, group, FUN = function(v) cumsum(ifelse(v == 0, 2, v)))
)
group condition want
1 1 1 1
2 1 0 3
3 1 1 4
4 1 1 5
5 2 1 1
6 2 0 3
7 2 0 5
8 2 1 6
9 2 1 7
10 3 0 2
11 3 1 3
12 3 1 4
13 3 0 6
14 3 1 7

If a value appears in the row, all subsequent rows should take this value (with dplyr)

I'm just starting to learn R and I'm already facing the first bigger problem.
Let's take the following panel dataset as an example:
# Small balanced panel: n_ids cross sections, each observed in n_times periods.
# The sizes are named n_ids/n_times rather than N/T so that the built-in T
# alias for TRUE is not masked.
n_ids <- 5
n_times <- 3
time <- rep(1:n_times, times = n_ids)
id <- rep(1:n_ids, each = n_times)
dummy <- c(0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0)
# cbind() first, so that all three columns end up as doubles.
df <- as.data.frame(cbind(id, time, dummy))
id time dummy
1 1 1 0
2 1 2 0
3 1 3 1
4 2 1 1
5 2 2 0
6 2 3 0
7 3 1 0
8 3 2 1
9 3 3 0
10 4 1 0
11 4 2 0
12 4 3 1
13 5 1 0
14 5 2 1
15 5 3 0
I now want the dummy variable for all rows of a cross section to take the value 1 after the 1 for this cross section appears for the first time. So, what I want is:
id time dummy
1 1 1 0
2 1 2 0
3 1 3 1
4 2 1 1
5 2 2 1
6 2 3 1
7 3 1 0
8 3 2 1
9 3 3 1
10 4 1 0
11 4 2 0
12 4 3 1
13 5 1 0
14 5 2 1
15 5 3 1
So I guess I need something like:
df_new<-df %>%
group_by(id) %>%
???
I already tried to set all zeros to NA and use the na.locf function, but it didn't really work.
Anybody got an idea?
Thanks!
Use cummax
# Running maximum of dummy within each id, so a 1 is carried to all later rows.
mutate(group_by(df, id), dummy = cummax(dummy))
# A tibble: 15 x 3
# Groups: id [5]
# id time dummy
# <dbl> <dbl> <dbl>
# 1 1 1 0
# 2 1 2 0
# 3 1 3 1
# 4 2 1 1
# 5 2 2 1
# 6 2 3 1
# 7 3 1 0
# 8 3 2 1
# 9 3 3 1
#10 4 1 0
#11 4 2 0
#12 4 3 1
#13 5 1 0
#14 5 2 1
#15 5 3 1
Without additional packages you could do
# Base R: replace dummy by its running maximum within each id.
within(df, dummy <- ave(dummy, id, FUN = cummax))

Resources