I'm fairly used to adding in missing cases for data but this use case escapes me.
I have a number of dataframes (which differ slightly), an example would be:
> t1
3 4 5
2 1 0 0
3 0 2 2
4 2 6 4
5 1 2 1
structure(list(`3` = c(1L, 0L, 2L, 1L), `4` = c(0L, 2L, 6L, 2L
), `5` = c(0L, 2L, 4L, 1L)), .Names = c("3", "4", "5"), row.names = c("2",
"3", "4", "5"), class = "data.frame")
Row names & Column names should be from 1:5 and, obviously, where these were missing the cell value set to NA. For the example above this would give:
> t1
1 2 3 4 5
1 NA NA NA NA NA
2 NA NA 1 0 0
3 NA NA 0 2 2
4 NA NA 2 6 4
5 NA NA 1 2 1
In each case ANY one or more rows AND/OR columns might be missing.
I can readily get the missing columns using the method described by Josh O'Brien here but am missing the row method.
Can anyone help?
We can do this in a much easier way with base R by creating a matrix of NAs of the required dimensions and then assign the values of 't1' based on the row names and column names of 't1'
# Build an all-NA 5 x 5 grid labelled 1..5 on both axes, then write the known
# values of `t1` into the cells addressed by its own row and column labels.
m1 <- matrix(NA, nrow = 5, ncol = 5, dimnames = list(1:5, 1:5))
m1[rownames(t1), names(t1)] <- as.matrix(t1)
m1
# 1 2 3 4 5
#1 NA NA NA NA NA
#2 NA NA 1 0 0
#3 NA NA 0 2 2
#4 NA NA 2 6 4
#5 NA NA 1 2 1
Or using tidyverse
library(tidyverse)
# Reshape to long form, complete the full 1..5 x 1..5 grid of row/column labels
# (combinations that were absent become NA), then reshape back to wide form.
# The grid is fixed at 1:5 rather than derived from the labels present, so a
# missing last row or column is restored as well.
rownames_to_column(t1, "rn") %>%
  pivot_longer(-rn, names_to = "Var", values_to = "Val") %>%
  mutate(across(c(rn, Var), as.integer)) %>%
  complete(rn = 1:5, Var = 1:5) %>%
  pivot_wider(names_from = Var, values_from = Val)
# A tibble: 5 × 6
# rn `1` `2` `3` `4` `5`
#* <int> <int> <int> <int> <int> <int>
#1 1 NA NA NA NA NA
#2 2 NA NA 1 0 0
#3 3 NA NA 0 2 2
#4 4 NA NA 2 6 4
#5 5 NA NA 1 2 1
Based on the solution you mentioned by Josh O'Brien, you can do the same but use rownames instead of names. Take a look at the code below..
# Toy input: columns a and e, and four of the five wanted rows.
df <- data.frame(a = 1:4, e = 4:1, row.names = c("1", "3", "4", "5"))
colnms <- c("a", "b", "d", "e")
rownms <- c("1", "2", "3", "4", "5")
## add every absent column as zeros, then put the columns in target order
Missing <- colnms[!(colnms %in% names(df))]
df[Missing] <- 0
df <- df[colnms]
df
## same treatment for the rows
MissingR <- rownms[!(rownms %in% rownames(df))]
df[MissingR, ] <- 0
df <- df[rownms, ]
df
# > df
# a b d e
#1 1 0 0 4
#2 0 0 0 0
#3 2 0 0 3
#4 3 0 0 2
#5 4 0 0 1
Related
I have a dataframe "df" like this:
col1 col2 col3 col4 col5 col6
1 2 2 3 5 7
2 4 6 4 8 2
5 9 7 3 2 5
3 4 5 6 8 1
and I would like to create a new dataframe "new_df" in which there is 1 blank column (called "empty") every 2 columns, like this:
empty col1 col2 empty col3 col4 empty col5 col6
NA 1 2 NA 2 3 NA 5 7
NA 2 4 NA 6 4 NA 8 2
NA 5 9 NA 7 3 NA 2 5
NA 3 4 NA 5 6 NA 8 1
How can I add the blank column in this way?
I have tried using:
# Attempt from the question: insert an `empty` column at every second column
# position from 1 up to n - 2.
n = length(df)
empty <- NA
for (i in seq(1,n-2,2))
{
# Each pass starts again from the untouched `df` and overwrites `new_df`,
# so only the column inserted on the final pass is kept.
new_df <- add_column(df, empty, .before=i)
}
but it memorizes only the last step, giving me this result:
col1 col2 col3 col4 empty col5 col6
1 2 2 3 NA 5 7
2 4 6 4 NA 8 2
5 9 7 3 NA 2 5
3 4 5 6 NA 8 1
Another base R solution
# Widened all-NA frame: one extra blank column per pair of original columns.
df2 <- data.frame(matrix(NA, nrow(df), ncol(df) + ncol(df) / 2))
# Blank columns sit at positions 1, 4, 7, ... of the *widened* frame; every
# other position receives an original column.
tmp1 <- seq(1, ncol(df2), 3)
tmp2 <- !(seq_len(ncol(df2)) %in% tmp1)
df2[tmp2] <- df
colnames(df2)[tmp1] <- paste0("empty", seq_along(tmp1))
colnames(df2)[tmp2] <- colnames(df)
df2
empty1 col1 col2 empty2 col3 col4 empty3 col5 col6
1 NA 1 2 NA 2 3 NA 5 7
2 NA 2 4 NA 6 4 NA 8 2
3 NA 5 9 NA 7 3 NA 2 5
4 NA 3 4 NA 5 6 NA 8 1
The base R solution would be:
# Visit the first column of every pair, put an all-NA `empty` column in front
# of that pair, and column-bind the resulting pieces back together.
do.call(
  cbind,
  lapply(seq(1, ncol(df), by = 2), function(first) {
    pair <- df[, c(first, first + 1)]
    cbind(empty = rep(NA, nrow(df)), pair)
  })
)
# empty col1 col2 empty col3 col4 empty col5 col6
#1 NA 1 2 NA 2 3 NA 5 7
#2 NA 2 4 NA 6 4 NA 8 2
#3 NA 5 9 NA 7 3 NA 2 5
#4 NA 3 4 NA 5 6 NA 8 1
A "tidy" solution could be:
library(tidyverse)
# For each odd column position, build a three-column piece (a new all-NA
# `empty` column followed by the columns at that position and the next one)
# and bind the pieces side by side. The `empty` name is repeated across the
# pieces, which is why the names get repaired in the output below.
map_dfc(seq(from = 1, to = ncol(df), by = 2),
~df %>%
mutate(empty = NA) %>%
select(empty, .x, .x+1))
#New names:
#* empty -> empty...1
#* empty -> empty...4
#* empty -> empty...7
#empty...1 col1 col2 empty...4 col3 col4 empty...7 col5 col6
#1 NA 1 2 NA 2 3 NA 5 7
#2 NA 2 4 NA 6 4 NA 8 2
#3 NA 5 9 NA 7 3 NA 2 5
#4 NA 3 4 NA 5 6 NA 8 1
Using append().
# Insert an all-NA `emp` column in front of every pair of original columns.
# After k insertions the next pair starts 3 * k positions in, so the insertion
# points are 0, 3, 6, ... with one point per pair, whatever the column count.
for (i in seq(0, by = 3, length.out = ceiling(ncol(dat) / 2))) {
  dat <- as.data.frame(append(dat, list(emp = NA), i))
}
dat
# emp col1 col2 emp.1 col3 col4 emp.2 col5 col6
# 1 NA 1 2 NA 2 3 NA 5 7
# 2 NA 2 4 NA 6 4 NA 8 2
# 3 NA 5 9 NA 7 3 NA 2 5
# 4 NA 3 4 NA 5 6 NA 8 1
Data:
dat <- structure(list(col1 = c(1L, 2L, 5L, 3L), col2 = c(2L, 4L, 9L,
4L), col3 = c(2L, 6L, 7L, 5L), col4 = c(3L, 4L, 3L, 6L), col5 = c(5L,
8L, 2L, 8L), col6 = c(7L, 2L, 5L, 1L)), class = "data.frame", row.names = c(NA,
-4L))
And here comes the ...
Microbenchmark
# Unit: microseconds
# expr min lq mean median uq max neval cld
# ronak() 969.707 990.9945 1001.4807 1012.282 1017.368 1022.453 3 d
# user() 349.937 358.0145 364.3877 366.092 371.613 377.134 3 a
# jay() 2098.003 2100.8540 2115.7640 2103.705 2124.644 2145.584 3 e
# groth1() 2164.896 2262.5745 2363.6133 2360.253 2462.972 2565.691 3 f
# groth2() 424.546 438.0185 455.0820 451.491 470.350 489.209 3 ab
# groth3() 722.551 728.0910 733.1910 733.631 738.511 743.391 3 c
# r.user() 612.432 619.6570 636.9573 626.882 649.220 671.558 3 bc
## and with the usual expanded data frame:
set.seed(42)
# Resample the rows (with replacement) up to one million rows. `TRUE` is
# spelled out because `T` is an ordinary variable that can be reassigned.
dat <- dat[sample(nrow(dat), 1e6, replace = TRUE), ]
microbenchmark::microbenchmark(
  ronak(), user(), jay(), groth1(), groth2(), groth3(), r.user(),
  times = 3L
)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# ronak() 1375.139030 1456.858743 1564.509886 1538.578457 1659.19531 1779.81217 3 c
# user() 89.017416 200.845539 251.548652 312.673662 332.81427 352.95488 3 a
# jay() 7.655812 8.382333 9.941684 9.108855 11.08462 13.06039 3 a
# groth1() 501.263785 514.097103 621.755474 526.930421 682.00132 837.07222 3 b
# groth2() 143.438836 147.783741 189.033391 152.128645 211.83067 271.53269 3 a
# groth3() 1387.314877 1406.898863 1469.493158 1426.482849 1510.58230 1594.68175 3 c
# r.user() 1469.543881 1472.770464 1483.834022 1475.997046 1490.97909 1505.96114 3 c
Code:
# Benchmark candidate: split the global `dat` into two-column pieces and put a
# uniquely named NA column (empty1, empty2, ...) in front of each piece.
ronak <- \() {
  # Tag the columns 1 1 2 2 3 3 ... and cut the frame into one piece per tag.
  pair_id <- rep(seq_along(dat), each = 2, length.out = ncol(dat))
  pieces <- split.default(dat, pair_id)
  # A one-cell NA frame named empty<k> is recycled by cbind to the piece's rows.
  with_gap <- lapply(seq_along(pieces), function(k) {
    gap <- data.frame(NA)
    names(gap) <- paste0('empty', k)
    cbind(gap, pieces[[k]])
  })
  do.call(cbind, with_gap)
}
# Benchmark candidate: allocate an all-NA frame of the final width and copy the
# columns of the global `dat` into the non-blank slots.
# The blank slots (1, 4, 7, ...) are derived from `ncol(dat)` so the function
# works for any number of columns, not only the six of the example.
user <- \() {
  n_gap <- ceiling(ncol(dat) / 2)            # one blank per pair of columns
  n_out <- ncol(dat) + n_gap                 # width of the result
  tmp1 <- seq(1, by = 3, length.out = n_gap) # positions of the blank columns
  tmp2 <- !(seq_len(n_out) %in% tmp1)        # positions of the data columns
  dat2 <- data.frame(matrix(NA, nrow(dat), n_out))
  dat2[tmp2] <- dat
  colnames(dat2)[tmp1] <- paste0("empty", seq_along(tmp1))
  colnames(dat2)[tmp2] <- colnames(dat)
  dat2
}
# Benchmark candidate: grow a copy of the global `dat` by inserting an NA `emp`
# column in front of every pair of columns with append().
# After k insertions the next pair starts 3 * k positions in, so the insertion
# points are 0, 3, 6, ... with one point per pair, whatever the column count.
jay <- \() {
  out <- dat
  for (pos in seq(0, by = 3, length.out = ceiling(ncol(dat) / 2))) {
    out <- as.data.frame(append(out, list(emp = NA), pos))
  }
  out
}
# Benchmark candidate (dplyr/purrr): split the global `dat` into two-column
# pieces, bind an NA `empty` column in front of each piece, and bind the pieces
# back together. Messages (package start-up, name repair) are suppressed.
groth1 <- \() suppressMessages({
  # Attach the two packages as separate statements; library() fails loudly if
  # a package is missing instead of returning FALSE.
  library(dplyr)
  library(purrr)
  dat %>%
    split.default(as.numeric(gl(ncol(.), 2, ncol(.)))) %>%
    map(~ bind_cols(empty = NA, .)) %>%
    bind_cols()
})
# Benchmark candidate: compute where each column of the global `dat` lands in
# the widened frame, then copy the columns and their names into those slots.
groth2 <- \() {
  # An odd-numbered column jumps two slots (leaving room for the blank), an
  # even-numbered one moves a single slot: 2 3 5 6 8 9 ...
  ix <- cumsum(ifelse(seq_along(dat) %% 2 == 1, 2, 1))
  dat2 <- data.frame(matrix(NA, nrow(dat), max(ix)))
  dat2[ix] <- dat
  out_names <- rep("empty", ncol(dat2))
  out_names[ix] <- names(dat)
  names(dat2) <- out_names
  dat2
}
# Benchmark candidate: base R version of the split / prepend-NA / re-bind idea,
# applied to the global `dat`.
groth3 <- \() {
  ix <- as.numeric(gl(ncol(dat), 2, ncol(dat))) # 1 1 2 2 3 3
  # Names are dropped from the split so cbind does not prefix the column names.
  pieces <- unname(split.default(dat, ix))
  do.call("cbind", lapply(pieces, function(piece) cbind(empty = NA, piece)))
}
# Benchmark candidate: for the first column of every pair in the global `dat`,
# put an all-NA `empty` column in front of the pair and cbind the pieces.
r.user <- \() {
  pieces <- lapply(seq(1, ncol(dat), by = 2), function(first) {
    pair <- dat[, c(first, first + 1)]
    cbind(empty = rep(NA, nrow(dat)), pair)
  })
  do.call(cbind, pieces)
}
1) dplyr/purrr Split the data frame, DF, bind an NA column before each component and bind the resulting components back together. Giving several columns the same name, as in the sample output in the question, makes it impossible to identify those columns by name, so this solution uses unique names instead.
library(dplyr)
library(purrr)
# Group the columns two at a time (1 1 2 2 3 3 ...), put an NA `empty` column
# in front of each group, then bind the groups back into one frame.
DF %>%
  split.default(ceiling(seq_along(.) / 2)) %>%
  map(function(piece) bind_cols(empty = NA, piece)) %>%
  bind_cols()
giving:
empty...1 col1 col2 empty...4 col3 col4 empty...7 col5 col6
1 NA 1 2 NA 2 3 NA 5 7
2 NA 2 4 NA 6 4 NA 8 2
3 NA 5 9 NA 7 3 NA 2 5
4 NA 3 4 NA 5 6 NA 8 1
2) Base R Create a vector ix which gives the indexes of the original data frame in the result data frame and then create an empty result and copy DF and its names into it.
# Destination slot of every original column: an odd-numbered column jumps two
# slots (leaving room for the blank), an even-numbered one moves one slot.
ix <- cumsum(ifelse(seq_along(DF) %% 2 == 1, 2, 1)) # 2 3 5 6 8 9
# All-NA frame wide enough for the last slot; copy DF into its slots.
DF2 <- data.frame(matrix(NA, nrow = nrow(DF), ncol = max(ix)))
DF2[ix] <- DF
# Every column is called "empty" except the slots holding original columns.
names(DF2) <- rep("empty", ncol(DF2))
names(DF2)[ix] <- names(DF)
DF2
giving:
empty col1 col2 empty col3 col4 empty col5 col6
1 NA 1 2 NA 2 3 NA 5 7
2 NA 2 4 NA 6 4 NA 8 2
3 NA 5 9 NA 7 3 NA 2 5
4 NA 3 4 NA 5 6 NA 8 1
3) Base R This is another Base R solution. It roughly translates (1) into Base R. It gives the same result as (2).
# Pair index of every column.
ix <- as.numeric(gl(n = ncol(DF), k = 2, length = ncol(DF))) # 1 1 2 2 3 3
# Prepend an NA `empty` column to each pair and bind the pairs together; the
# split is unnamed so cbind does not prefix the column names.
do.call("cbind", lapply(unname(split.default(DF, ix)), function(piece) {
  cbind(empty = NA, piece)
}))
4) eList The eList package can be used for a particularly short solution.
library(eList)
# One list per pair of columns: an NA `empty` entry plus the two columns
# starting at column i (`len` partially matches seq()'s `length.out`).
# NOTE(review): the outer `DF(...)` is presumably eList's data-frame
# comprehension rather than the data frame `DF` indexed inside it - confirm
# against the eList documentation.
DF(for(i in seq(1, ncol(DF), 2)) list(empty = NA, DF[seq(i, len = 2)]))
giving:
empty col1 col2 empty.1 col3 col4 empty.2 col5 col6
1 NA 1 2 NA 2 3 NA 5 7
2 NA 2 4 NA 6 4 NA 8 2
3 NA 5 9 NA 7 3 NA 2 5
4 NA 3 4 NA 5 6 NA 8 1
Note
The input in reproducible form.
# Whitespace-delimited copy of the question's table; the first line holds the
# column names.
Lines <- "col1 col2 col3 col4 col5 col6
1 2 2 3 5 7
2 4 6 4 8 2
5 9 7 3 2 5
3 4 5 6 8 1"
# Parse the text into a data frame, treating the first line as the header.
DF <- utils::read.table(header = TRUE, text = Lines)
Here's a base R option -
We can split the data every 2 columns into list of dataframe and use Map to add a new column with NA in each dataframe.
# Tag the columns 1 1 2 2 3 3 ... and cut the frame into one piece per tag.
split_data <- split.default(df, rep(seq_along(df), each = 2, length.out = ncol(df)))
# Put a one-cell NA frame named empty<k> in front of piece k (cbind recycles it
# to the piece's rows), then bind all pieces side by side.
result <- do.call(cbind, lapply(seq_along(split_data), function(k) {
  cbind(setNames(data.frame(NA), paste0('empty', k)), split_data[[k]])
}))
result
# empty1 col1 col2 empty2 col3 col4 empty3 col5 col6
#1 NA 1 2 NA 2 3 NA 5 7
#2 NA 2 4 NA 6 4 NA 8 2
#3 NA 5 9 NA 7 3 NA 2 5
#4 NA 3 4 NA 5 6 NA 8 1
It is not a good practice to have duplicate column names in a dataframe hence I name them as empty1, empty2 etc.
data
df <- structure(list(col1 = c(1L, 2L, 5L, 3L), col2 = c(2L, 4L, 9L,
4L), col3 = c(2L, 6L, 7L, 5L), col4 = c(3L, 4L, 3L, 6L), col5 = c(5L,
8L, 2L, 8L), col6 = c(7L, 2L, 5L, 1L)),
class = "data.frame", row.names = c(NA, -4L))
I have a dataframe df where:
Days Treatment A Treatment B Treatment C
0 5 1 1
1 0 2 3
2 1 1 0
For example, there were 5 individuals receiving Treatment A that survived 0 days and 1 who survived 2, etc. However, I would like it where those 5 individuals now become a unique row, with that cell representing the days they survived:
Patient # A B C
1 0
2 0
3 0
4 0
5 0
6 2
7 0
8 1
9 1
10 2
11 0
12 1
13 1
14 1
Let Patient # = an arbitrary value.
I am sorry if this is not descriptive enough, but I appreciate any and all help you have to offer! I have the dataset in Excel at the moment, but I can place it into R if that's easier.
We can replicate the 'Days' values according to the counts in each treatment column and store the results in a list, then create a matching list of running patient numbers, use Map to construct one data.frame per treatment, and finally combine them with bind_rows
library(dplyr)
# For every treatment column, repeat each Days value as many times as its count.
lst1 <- lapply(df[-1], function(n) rep(df$Days, times = n))
# Pair each treatment's days with a running patient number (shaped like lst1),
# name the value column after the treatment letter, and stack the frames.
bind_rows(
  Map(
    function(id, days, arm) setNames(data.frame(id, days), c("Patient", arm)),
    relist(seq_along(unlist(lst1)), skeleton = lst1),
    lst1,
    sub("Treatment\\s+", "", names(lst1))
  )
)
-output
# Patient A B C
#1 1 0 NA NA
#2 2 0 NA NA
#3 3 0 NA NA
#4 4 0 NA NA
#5 5 0 NA NA
#6 6 2 NA NA
#7 7 NA 0 NA
#8 8 NA 1 NA
#9 9 NA 1 NA
#10 10 NA 2 NA
#11 11 NA NA 0
#12 12 NA NA 1
#13 13 NA NA 1
#14 14 NA NA 1
Or another option with reshaping into 'long' and then to 'wide'
library(tidyr)
# Long form: one row per (Days, treatment) count. Split "Treatment A" into its
# two words, then expand each count into that many rows of Days per treatment.
# reframe() is used for the expansion because it may return any number of rows
# per group and always returns an ungrouped result.
df %>%
  pivot_longer(cols = -Days) %>%
  separate(name, into = c('name1', 'name2')) %>%
  group_by(name2) %>%
  reframe(value = rep(Days, value)) %>%
  mutate(Patient = row_number()) %>%
  pivot_wider(names_from = name2, values_from = value)
-output
# A tibble: 14 x 4
# Patient A B C
# <int> <int> <int> <int>
# 1 1 0 NA NA
# 2 2 0 NA NA
# 3 3 0 NA NA
# 4 4 0 NA NA
# 5 5 0 NA NA
# 6 6 2 NA NA
# 7 7 NA 0 NA
# 8 8 NA 1 NA
# 9 9 NA 1 NA
#10 10 NA 2 NA
#11 11 NA NA 0
#12 12 NA NA 1
#13 13 NA NA 1
#14 14 NA NA 1
data
df <- structure(list(Days = 0:2, `Treatment A` = c(5L, 0L, 1L),
`Treatment B` = c(1L,
2L, 1L), `Treatment C` = c(1L, 3L, 0L)), class = "data.frame", row.names = c(NA,
-3L))
I'm a beginner in R and I'm facing an issue.
Problem: I need to sort a dataframe by 2 columns (ID, i'th column) and then take lagged difference of the i'th column and record it. Then resort the data with the ID and the i+1 column and so on and so forth.
What I have written up till now:
# Attempt from the question: for each column index in 4:ncol, sort by ID and
# that column, then try to add a lagged-difference column.
for (val in (4:length(colnames(df)))){
df <- df[with(df, order(ID, df[val])), ]
d2_df <- df %>%
# `df[val]` is a one-column data frame, so `c(df[val])` is a list rather than
# a column name or position; this is the argument mutate_at() rejects.
mutate_at(c(df[val]), list(lagged = ~ . - lag(.)))
}
The above code is messing somehow because the mutate_at function is throwing the error below:
Error: `.vars` must be a character/numeric vector or a `vars()` object, not a list.
Original dataset:
ID S1 S2
1 1 3 1
2 1 5 2
3 1 1 3
4 2 2 7
5 3 4 9
6 3 2 11
After Sort on ID and S1
ID S1 S2
1 1 1 3
2 1 3 1
3 1 5 2
4 2 2 7
5 3 2 11
6 3 4 9
Now what I need? S1.1 (which is the lagged difference of the sorted dataframe respective to each ID)
ID S1 S2 S1.1
1 1 1 3 NA
2 1 3 1 2
3 1 5 2 2
4 2 2 7 NA
5 3 2 11 NA
6 3 4 9 2
Similar logic applies for S2 where a new S2.2 will be generated.
Any help would be immensely appreciated.
Additionally what is required (below); where sum.S1 is the sum of the lagged differences and count.S1 is the count of observations at S1 for respective ID:
ID sum.S1 sum.S2 count.S1 count.S2
1 1 4 2 3 3
2 2 NA NA 1 1
3 3 2 2 2 2
Here's a way using non-standard evaluation (NSE) :
library(dplyr)
library(purrr)
library(rlang)
# Columns for which a lagged difference is wanted.
cols <- c('S1', 'S2')
# For each column: sort by ID and that column, compute value - lag(value)
# within each ID, keep only the new `<col>.1` column, and bind the results
# onto `df`.
# NOTE(review): `df` itself is not re-sorted here, so the bound columns are in
# the sorted order of each pass rather than the row order of `df`.
bind_cols(df, map_dfc(cols, ~{
# Turn the column name into a symbol so it can be injected with `!!`.
col <- sym(.x)
df %>%
arrange(ID, !!col) %>%
group_by(ID) %>%
transmute(!!paste0(.x, '.1') := !!col - lag(!!col)) %>%
ungroup %>%
select(-ID)
}))
# ID S1 S2 S1.1 S2.1
#1 1 3 1 NA NA
#2 1 5 2 2 1
#3 1 1 3 2 1
#4 2 2 7 NA NA
#5 3 4 9 NA NA
#6 3 2 11 2 2
data
df <- structure(list(ID = c(1L, 1L, 1L, 2L, 3L, 3L), S1 = c(3L, 5L,
1L, 2L, 4L, 2L), S2 = c(1L, 2L, 3L, 7L, 9L, 11L)),
class = "data.frame", row.names = c(NA, -6L))
I have a large data.table in the format below
Name Value 1 2 3 4 5
A 58 1 NA NA NA NA
B 47 NA 1 NA NA NA
C 89 NA NA 1 NA NA
D 68 NA NA NA 1 NA
E 75 NA NA NA NA 1
I would like to forward rows of the data table to achieve below results. I know how to forward fill columns.
Name Value 1 2 3 4 5
A 58 1 1 1 1 1
B 47 NA 1 1 1 1
C 89 NA NA 1 1 1
D 68 NA NA NA 1 1
E 75 NA NA NA NA 1
Help!
data.table has its own nafill function.
library(data.table) #v>=1.12.8
library(magrittr)
# Go long (one row per Name/column pair), carry the last observed value forward
# within each Name, then cast back to the original wide layout.
dt %>%
  melt(id.vars = 1:2) %>%
  .[, value := nafill(value, type = "locf"), by = Name] %>%
  dcast(... ~ variable)
# Name Value 1 2 3 4 5
# 1: A 58 1 1 1 1 1
# 2: B 47 NA 1 1 1 1
# 3: C 89 NA NA 1 1 1
# 4: D 68 NA NA NA 1 1
# 5: E 75 NA NA NA NA 1
Data
dt <- fread("Name Value 1 2 3 4 5
A 58 1 NA NA NA NA
B 47 NA 1 NA NA NA
C 89 NA NA 1 NA NA
D 68 NA NA NA 1 NA
E 75 NA NA NA NA 1")
Use fill in tidyr to fill in missing values with previous value.
library(dplyr)
library(tidyr)
# Stack columns 3 to 7 into name/value pairs, fill each Name's values downwards
# (i.e. left to right in the original layout), then spread them out again.
df %>%
  pivot_longer(cols = 3:7, names_to = "name", values_to = "value") %>%
  group_by(Name) %>%
  fill(value, .direction = "down") %>%
  ungroup() %>%
  pivot_wider(names_from = name, values_from = value)
# # A tibble: 5 x 7
# Name Value `1` `2` `3` `4` `5`
# <fct> <int> <int> <int> <int> <int> <int>
# 1 A 58 1 1 1 1 1
# 2 B 47 NA 1 1 1 1
# 3 C 89 NA NA 1 1 1
# 4 D 68 NA NA NA 1 1
# 5 E 75 NA NA NA NA 1
Note: The output above is the same as
# Fill columns 3 to 7 upwards within each column (no reshaping involved).
fill(df, 3:7, .direction = "up")
but the logic is different. The former belongs to "filling rows forward" and the latter is "filling columns backward". They will differ in other cases.
Data
df <- structure(list(Name = structure(1:5, .Label = c("A", "B", "C",
"D", "E"), class = "factor"), Value = c(58L, 47L, 89L, 68L, 75L
), `1` = c(1L, NA, NA, NA, NA), `2` = c(NA, 1L, NA, NA, NA),
`3` = c(NA, NA, 1L, NA, NA), `4` = c(NA, NA, NA, 1L, NA),
`5` = c(NA, NA, NA, NA, 1L)), class = "data.frame", row.names = c(NA, -5L))
So I have a sequence dataset that looks like this
id epnum clockst
1 1 1 0
2 1 2 1
3 1 3 2
4 2 1 4
5 2 2 5
6 2 3 6
7 3 1 4
8 3 2 5
9 3 3 6
What I want is to create a vector of clockst based on epnum == 1.
So, I want basically this
id epnum clockst ep_start
1 1 1 0 0
2 1 2 1 0
3 1 3 2 0
4 2 1 4 4
5 2 2 5 4
6 2 3 6 4
7 3 1 4 4
8 3 2 5 4
9 3 3 6 4
However, I struggle to do so.
I came up with this, but it doesn't fully work.
# Attempt from the question: only rows with epnum == 1 and a clockst of 0 or 4
# receive a start value; every other row is set to the placeholder -9.
dt$ep_start = ifelse(dt$epnum == 1 & dt$clockst == 0, 0,
ifelse(dt$epnum == 1 & dt$clockst == 4, 4, -9))
Any idea?
Data
dt = structure(list(id = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L,
3L), .Label = c("1", "2", "3"), class = "factor"), epnum = structure(c(1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), .Label = c("1", "2", "3"), class = "factor"),
clockst = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 4L, 5L, 6L), .Label = c("0",
"1", "2", "4", "5", "6"), class = "factor")), .Names = c("id",
"epnum", "clockst"), row.names = c(NA, -9L), class = "data.frame")
Here is a solution using tidyverse:
First check the condition epnum == 1 and if TRUE, use clockst value if not NA. Then just fill NA with previous values.
Since clockst is a factor, it has to be converted to numeric while keeping the same values, so as.numeric(as.character(...)) needs to be used.
library(tidyverse)
# Keep the clock value (converted from factor via character) only on rows where
# epnum is 1, leave NA elsewhere, then carry each kept value down over the NAs.
fill(
  mutate(
    dt,
    ep_start = ifelse(epnum == 1, as.numeric(as.character(clockst)), NA)
  ),
  ep_start,
  .direction = "down"
)
#output:
id epnum clockst ep_start
1 1 1 0 0
2 1 2 1 0
3 1 3 2 0
4 2 1 4 4
5 2 2 5 4
6 2 3 6 4
7 3 1 4 4
8 3 2 5 4
9 3 3 6 4
Here is a quick comparison of the available answers. I chose to use a 90 k row data set:
# Enlarge the data by repeating every row index 10,000 times; seq_len() is used
# so the index stays empty (rather than c(1, 0)) if `df` has no rows.
df <- df[rep(seq_len(nrow(df)), times = 10000),] #where df = dt
dt <- data.table(df)
library(microbenchmark)
# One named expression per answer, all run against the enlarged data.
bench <- microbenchmark(SunBee = dt[, ep_start := .SD[1]$clockst, by = "id"],
missuse = df %>%
mutate(ep_start = ifelse(epnum == 1, as.numeric(as.character(clockst)), NA)) %>%
fill(ep_start, .direction = "down"),
d.b. = df$clockst[rep(which(df$epnum == 1), rle(cumsum(df$epnum == 1))$lengths)],
www = df %>%
arrange(id, epnum) %>%
group_by(id) %>%
mutate(ep_start = first(clockst)) %>%
ungroup())
plot(bench)
with a 900 k row data set:
oh man I really need to learn DT.
Another tidyverse solution. arrange is not required if you are certain that the rows are in the right order.
library(dplyr)
# Order the rows within each id by episode number, then give every row of an id
# the clock value found on that id's first row.
dt2 <- dt %>%
  arrange(id, epnum) %>%
  group_by(id) %>%
  mutate(ep_start = clockst[1L]) %>%
  ungroup()
dt2
# # A tibble: 9 x 4
# id epnum clockst ep_start
# <fctr> <fctr> <fctr> <fctr>
# 1 1 1 0 0
# 2 1 2 1 0
# 3 1 3 2 0
# 4 2 1 4 4
# 5 2 2 5 4
# 6 2 3 6 4
# 7 3 1 4 4
# 8 3 2 5 4
# 9 3 3 6 4
You can do this with library(data.table) as follows
# Convert the question's data frame `dt` to a data.table. The table is not
# called `T`, because that would mask the built-in shorthand for TRUE.
dt <- data.table(dt)
# Within each id, take clockst from the first row and assign it by reference
# to every row of that id.
dt[, ep_start := .SD[1]$clockst, by = "id"]
This gives:
id epnum clockst ep_start
1: 1 1 0 0
2: 1 2 1 0
3: 1 3 2 0
4: 2 1 4 4
5: 2 2 5 4
6: 2 3 6 4
7: 3 1 4 4
8: 3 2 5 4
9: 3 3 6 4
# Repeat the row index of each episode start (epnum == 1) over the length of
# its episode, and look up clockst at those repeated indices.
dt$ep_start <- local({
  is_start <- dt$epnum == 1
  run_len <- rle(cumsum(is_start))$lengths
  dt$clockst[rep(which(is_start), run_len)]
})
dt
# id epnum clockst ep_start
#1 1 1 0 0
#2 1 2 1 0
#3 1 3 2 0
#4 2 1 4 4
#5 2 2 5 4
#6 2 3 6 4
#7 3 1 4 4
#8 3 2 5 4
#9 3 3 6 4
Using match
# Lookup table: the rows where an episode starts (epnum == 1).
clock <- dt[dt$epnum == 1, ]
# For every row, find its id in the lookup table and take that row's clockst.
dt[["ep_start"]] <- clock[["clockst"]][match(dt[["id"]], clock[["id"]])]