How to replace single occurrences with the previous status - r

I have a data table like below :
table=data.table(x=c(1:15),y=c(1,1,1,3,1,1,2,1,2,2,3,3,3,3,3),z=c(1:15)*3)
I have to clean this data table where there are single occurrences, like a 3 in between the 1s and a 1 in between the 2s. It doesn't have to be a 3: any number that occurs only once should be replaced by the previous number.
table=data.table(x=c(1:15),y=c(1,1,1,1,1,1,2,2,2,2,3,3,3,3,3),z=c(1:15)*3)
This is the expected data table.
Any help is appreciated.

Here's one way :
library(data.table)
# Label each consecutive run of identical y values and record its length
table[, N := .N, by = rleid(y)]
# Blank out every value that forms a run of length one (a single occurrence)
table[N == 1, y := NA_real_]
# Fill each blank with the last preceding non-NA value, then drop the helper
table[, y := zoo::na.locf(y)][, N := NULL]
table
# x y z
# 1: 1 1 3
# 2: 2 1 6
# 3: 3 1 9
# 4: 4 1 12
# 5: 5 1 15
# 6: 6 1 18
# 7: 7 2 21
# 8: 8 2 24
# 9: 9 2 27
#10: 10 2 30
#11: 11 3 33
#12: 12 3 36
#13: 13 3 39
#14: 14 3 42
#15: 15 3 45

Here is a base R option
# A single occurrence is a point where y jumps and immediately jumps back,
# i.e. the two successive differences around it have opposite signs, so
# their product is negative. Padding with the first/last element keeps both
# diff vectors aligned with y (and makes the endpoints never match).
inds <- which(diff(c(head(table$y,1),table$y))*diff(c(table$y,tail(table$y,1)))<0)
# Overwrite each such singleton with the value immediately before it
table$y <- replace(table$y,inds,table$y[inds-1])
such that
> table
x y z
1: 1 1 3
2: 2 1 6
3: 3 1 9
4: 4 1 12
5: 5 1 15
6: 6 1 18
7: 7 2 21
8: 8 2 24
9: 9 2 27
10: 10 2 30
11: 11 3 33
12: 12 3 36
13: 13 3 39
14: 14 3 42
15: 15 3 45

Related

Generate a new group ID every time a variable changes its value [duplicate]

This question already has answers here:
Increase counter by 1 for each unique group of values
(2 answers)
Closed 4 years ago.
Have been searching for a while, but haven't found what I wanted. For the sake of exposition, suppose one has a data set like the following:
library(data.table)
set.seed(666)
# 20 rows whose value is randomly +1 or -1; grp should change at every sign flip
foo = data.table(id = 1:20, value = sample(c(1, -1), 20, replace = T))
id value
1: 1 -1
2: 2 1
3: 3 -1
4: 4 1
5: 5 1
6: 6 -1
7: 7 -1
8: 8 1
9: 9 1
10: 10 1
11: 11 -1
12: 12 1
13: 13 1
14: 14 1
15: 15 1
16: 16 -1
17: 17 1
18: 18 -1
19: 19 1
20: 20 1
I want to create a unique group id every time value changes, resulting in
id value grp
1: 1 -1 1
2: 2 1 2
3: 3 -1 3
4: 4 1 4
5: 5 1 4
6: 6 -1 5
7: 7 -1 5
8: 8 1 6
9: 9 1 6
10: 10 1 6
11: 11 -1 7
12: 12 1 8
13: 13 1 8
14: 14 1 8
15: 15 1 8
16: 16 -1 9
17: 17 1 10
18: 18 -1 11
19: 19 1 12
20: 20 1 12
I can do it in a loop
# Flag rows whose value repeats the previous row's value (first row -> FALSE)
foo[, cc := value == shift(value)][is.na(cc), cc := FALSE]
# Walk the rows: open a new group at every non-repeat; pp holds the row index
# where the current run started, so every row in a run gets the same label.
# NOTE(review): row-by-row assignment like this is an anti-pattern in
# data.table (and `!= T` should be `!isTRUE(...)`); kept as the asker wrote it.
for(i in 1:nrow(foo)){
if(foo[i]$cc != T){
pp = i
foo[i, grp := pp]} else {
foo[i, grp := pp]}
}
# Relabel the run-start row indices as consecutive group ids 1, 2, 3, ...
foo[, grp := as.numeric(as.factor(grp))]
Is there a smarter way of doing it?
We can use rleid
# rleid() assigns a new run-length id each time `value` changes
foo[, grp := rleid(value)]
foo
# id value grp
# 1: 1 -1 1
# 2: 2 1 2
# 3: 3 -1 3
# 4: 4 1 4
# 5: 5 1 4
# 6: 6 -1 5
# 7: 7 -1 5
# 8: 8 1 6
# 9: 9 1 6
#10: 10 1 6
#11: 11 -1 7
#12: 12 1 8
#13: 13 1 8
#14: 14 1 8
#15: 15 1 8
#16: 16 -1 9
#17: 17 1 10
#18: 18 -1 11
#19: 19 1 12
#20: 20 1 12

data.table manipulation and merging

I have data
# Observations: three groups of rows, each with its own values
dat1 <- data.table(id=1:8,
group=c(1,1,2,2,2,3,3,3),
value=c(5,6,10,11,12,20,21,22))
# One baseline value per group, to be subtracted from dat1$value
dat2 <- data.table(group=c(1,2,3),
value=c(3,6,13))
and I would like to subtract dat2$value from each of the dat1$value, based on group.
Is this possible using data.table or does it require additional packages?
With data.table, you could do:
library(data.table)
# Join dat2 onto dat1 by group (i.value refers to dat2's value column),
# then take the per-row difference; the trailing [] makes the := result print
dat1[dat2, on = "group"][, new.value := value - i.value, by = "group"][]
Which returns:
id group value i.value new.value
1: 1 1 5 3 2
2: 2 1 6 3 3
3: 3 2 10 6 4
4: 4 2 11 6 5
5: 5 2 12 6 6
6: 6 3 20 13 7
7: 7 3 21 13 8
8: 8 3 22 13 9
Alternatively, you can do this in one step as akrun mentions:
# Update join: adds newvalue to dat1 by reference, without keeping i.value
dat1[dat2, newvalue := value - i.value, on = .(group)]
id group value newvalue
1: 1 1 5 2
2: 2 1 6 3
3: 3 2 10 4
4: 4 2 11 5
5: 5 2 12 6
6: 6 3 20 7
7: 7 3 21 8
8: 8 3 22 9

applying cumsum() from different starting points

I have data
library(data.table)
set.seed(42)
# NOTE(review): runif(100) is recycled to fill the 1000 rows, and the 5
# sampled cutoffs are recycled across the 10 ids -- this matches the
# duplicated rows in the expected output below, so it appears intentional.
t <- data.table(time=1:1000, value=runif(100,0,1))
p <- data.table(id=1:10, cut=sample(1:100,5))
vals <- 1:5
> head(t)
time value
1: 1 0.9148060
2: 2 0.9370754
3: 3 0.2861395
4: 4 0.8304476
5: 5 0.6417455
6: 6 0.5190959
> head(p)
id cut
1: 1 63
2: 2 22
3: 3 99
4: 4 38
5: 5 91
6: 6 63
> vals
[1] 1 2 3 4 5
where t gives some vector of values associated with time points, and p gives for each person a cutoff in time.
I would like to get for each person the time units it takes to accumulate each of the values in vals.
My approach now is to use a for-loop that computes for each person a temporary vector of cumulative sums, starting at its specific cutoff in time. Next, I use findInterval() to obtain the positions at which cumsum reaches each of the levels in vals.
# One row per person, one column per target level in vals
out <- matrix(NA, nrow=nrow(p), ncol=length(vals)); colnames(out) <- vals
for(i in 1:nrow(p)){
# Running total of value, starting just after this person's time cutoff
temp <- cumsum(t$value[t$time > p$cut[i]]); temp <- temp[!is.na(temp)]
# First position at which the running total reaches each target level
out[i,] <- findInterval(vals,temp)
}
which should yield
1 2 3 4 5
[1,] 1 4 5 9 12
[2,] 1 2 5 6 7
[3,] 1 2 4 5 7
[4,] 1 3 5 7 8
[5,] 2 3 5 7 8
[6,] 1 4 5 9 12
[7,] 1 2 5 6 7
[8,] 1 2 4 5 7
[9,] 1 3 5 7 8
[10,] 2 3 5 7 8
This is of course heavily inefficient and doesn't do justice to the powers of R. Is there a way of speeding this up?
I'd do
# precompute cumsum on full table
# precompute cumsum on full table
t[, cs := cumsum(value)]
# compute one time per unique cut value, not per id
cuts = unique(p[, .(t_cut = cut)])
# look up value at cut time (v_cut = cumulative sum at the cutoff)
cuts[t, on=.(t_cut = time), v_cut := i.cs]
# look up time at every cut value combo: pt is the cumsum level that must be
# reached for each target in vals; the rolling join maps each level back to a
# time in t, and subtracting t_cut gives the elapsed time units
cutres = cuts[, .(pt = vals + v_cut), by=t_cut][, .(
t_cut,
v = vals,
t_plus = t[.SD, on=.(cs = pt), roll=TRUE, x.time] - t_cut
)]
which gives
t_cut v t_plus
1: 63 1 1
2: 63 2 4
3: 63 3 5
4: 63 4 9
5: 63 5 12
6: 22 1 1
7: 22 2 2
8: 22 3 5
9: 22 4 6
10: 22 5 7
11: 99 1 1
12: 99 2 2
13: 99 3 4
14: 99 4 5
15: 99 5 7
16: 38 1 1
17: 38 2 3
18: 38 3 5
19: 38 4 7
20: 38 5 8
21: 91 1 2
22: 91 2 3
23: 91 3 5
24: 91 4 7
25: 91 5 8
t_cut v t_plus
If you want to map this back to id and get it in a id x vals table...
# Join the per-cut results back to p (several ids can share one cut value,
# hence allow.cartesian), then reshape to one row per id, one column per v
cutres[p, on=.(t_cut = cut), allow.cartesian=TRUE,
dcast(.SD, id ~ v, value.var = "t_plus")]
id 1 2 3 4 5
1: 1 1 4 5 9 12
2: 2 1 2 5 6 7
3: 3 1 2 4 5 7
4: 4 1 3 5 7 8
5: 5 2 3 5 7 8
6: 6 1 4 5 9 12
7: 7 1 2 5 6 7
8: 8 1 2 4 5 7
9: 9 1 3 5 7 8
10: 10 2 3 5 7 8
(Alternately, the key part can be done like t_plus = t[.SD, on=.(cs = pt), roll=TRUE, which=TRUE] - t_cut since t$time is the row number.)

How to add the cluster ID to each component (igraph)

I know how to use igraph package in R to obtain connected components in two columns in data set.
data set
library(data.table)
# Edge list; rn is just a row label and is dropped immediately after reading
df = fread(
"rn A B
1: 11 6
2: 12 6
3: 11 7
4: 13 2
5: 12 7
6: 12 8
7: 17 2
8: 13 1")[, rn := NULL][]
library(igraph)
# Treat each row of df as an edge A -> B and find connected components.
# NOTE(review): clusters() is a deprecated alias of components() in
# igraph >= 2.0 -- prefer components(g) in new code.
g = graph_from_data_frame(df)
cluster = clusters(g)
list = groups(cluster)
What I want to do next is to assign the cluster ID to each connected component.
A B ID
1: 11 6 1
2: 12 6 1
3: 11 7 1
4: 13 2 2
5: 12 7 1
6: 12 8 1
7: 17 2 2
8: 13 1 2
I hope this makes sense. Thank you
You can extract the membership by doing either:
# membership is a vector named by vertex; index it with A's values as names
df$ID <- cluster$membership[as.character(df$A)]
Or
# Equivalent lookup via column B, since A and B of a row share a component
df$ID <- cluster$membership[as.character(df$B)]
Both should give:
df
# A B ID
#1: 11 6 1
#2: 12 6 1
#3: 11 7 1
#4: 13 2 2
#5: 12 7 1
#6: 12 8 1
#7: 17 2 2
#8: 13 1 2

R data table combining lapply with other j arguments

I want to combine the result of lapply using .SD in j with further output columns in j. How can I do that in the same data table?
So far I'm creating two data tables (example_summary1, example_summary2) and merging them, but surely there is a better way?
Maybe I don't fully understand the concept of .SD/.SDcols.
example <- data.table(id=rep(1:5,3),numbers=rep(1:5,3),sample1=sample(20,15,replace=TRUE),sample2=sample(20,15,replace=TRUE))
id numbers sample1 sample2
1: 1 1 17 18
2: 2 2 8 1
3: 3 3 17 12
4: 4 4 15 2
5: 5 5 14 18
6: 1 1 11 14
7: 2 2 12 12
8: 3 3 11 7
9: 4 4 16 13
10: 5 5 17 1
11: 1 1 10 3
12: 2 2 14 15
13: 3 3 13 3
14: 4 4 17 6
15: 5 5 1 5
# Per-id means of the two sample columns only (.SDcols restricts .SD)
example_summary1<-example[,lapply(.SD,mean),by=id,.SDcols=c("sample1","sample2")]
> example_summary1
id sample1 sample2
1: 1 12.66667 11.666667
2: 2 11.33333 9.333333
3: 3 13.66667 7.333333
4: 4 16.00000 7.000000
5: 5 10.66667 8.000000
# Per-id sum of numbers; a bare third argument is shorthand for by=id
example_summary2<-example[,.(example.sum=sum(numbers)),id]
> example_summary2
id example.sum
1: 1 3
2: 2 6
3: 3 9
4: 4 12
5: 5 15
This is the best you can do if you are using .SDcols:
# Include numbers in .SDcols so sum(numbers) is visible in j, then drop the
# unwanted mean-of-numbers column afterwards; the trailing [] prints the result
example_summary1 <- example[, c(lapply(.SD, mean), .(example.sum = sum(numbers))),
by = id, .SDcols = c("sample1", "sample2", "numbers")][, numbers := NULL][]
If you don't include numbers in .SDcols it's not available in j.
Without .SDcols you can do this:
# Build the list of columns by hand instead of via .SD/.SDcols, then append
# the extra summary column in the same j expression
example_summary1 <- example[, c(lapply(.(sample1 = sample1, sample2 = sample2), mean),
.(example.sum = sum(numbers))),
by=id]
Or if you have a vector of column names:
cols <- c("sample1","sample2")
# mget() fetches the columns named in cols from the j evaluation environment
example_summary1 <- example[, c(lapply(mget(cols), mean),
.(example.sum = sum(numbers))),
by=id]
But I suspect that you don't get the same data.table optimizations then.
Finally, a data.table join is so fast that I would use your approach.

Resources