How to format describeBy table in R? - r

I have this data set:
Defects.I Defects.D Treatment
1 2 A
1 3 B
And I'm trying to do a descriptive statistics for defects detected and isolated, grouped per treatment.
After searching for a while I found a nice function on the psych library called describeBy().
With the following code:
describeBy(myData[1:2],myData$Treatment)
I got this output:
Treatment A
Mean. Median. Trimed.
Defects.I x x x
Defects.D x x x
Treatment B
Mean. Median. Trimed.
Defects.I x x x
Defects.D x x x
But in reality I was looking for something like
Mean. Median. Trimed.
A B A B A B
Defects.I x x x x x x
Defects.D x x x x x x
Data
myData <- structure(list(Defects.I = c(1L, 1L), Defects.D = 2:3, Treatment = c("A",
"B")), .Names = c("Defects.I", "Defects.D", "Treatment"), class = "data.frame", row.names = c(NA,
-2L))

Since describeBy returns a lists of data frames, we could just cbind them all, but that doesn't get the right order. Instead we can interleave the columns
myData <- structure(list(Defects.I = c(1L, 1L), Defects.D = 2:3,
Treatment = c("A", "B")),
.Names = c("Defects.I", "Defects.D", "Treatment"),
class = "data.frame", row.names = c(NA, -2L))
l <- psych::describeBy(myData[1:2], myData$Treatment)
So interleave using this order
order(sequence(c(ncol(l$A), ncol(l$B))))
# [1] 1 14 2 15 3 16 4 17 5 18 6 19 7 20 8 21 9 22 10 23 11 24 12 25 13 26
rather than what cbind alone would do
c(1:13, 1:13)
# [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 1 2 3 4 5 6 7 8 9 10 11 12 13
so this
do.call('cbind', l)[, order(sequence(lengths(l)))]
# A.vars B.vars A.n B.n A.mean B.mean A.sd B.sd A.median B.median A.trimmed B.trimmed A.mad B.mad
# Defects.I 1 1 1 1 1 1 NA NA 1 1 1 1 0 0
# Defects.D 2 2 1 1 2 3 NA NA 2 3 2 3 0 0
# A.min B.min A.max B.max A.range B.range A.skew B.skew A.kurtosis B.kurtosis A.se B.se
# Defects.I 1 1 1 1 0 0 NA NA NA NA NA NA
# Defects.D 2 3 2 3 0 0 NA NA NA NA NA NA
or as a function
interleave <- function(l, how = c('cbind', 'rbind')) {
how <- match.arg(how)
if (how %in% 'rbind')
do.call(how, l)[order(sequence(sapply(l, nrow))), ]
else do.call(how, l)[, order(sequence(sapply(l, ncol))), ]
}
interleave(l)
# A.vars B.vars A.n B.n
# Defects.I 1 1 1 1
# Defects.D 2 2 1 1 ...
# ...
interleave(l, 'r')
# vars n mean sd median trimmed mad min max range skew kurtosis se
# A.Defects.I 1 1 1 NA 1 1 0 1 1 0 NA NA NA
# B.Defects.I 1 1 1 NA 1 1 0 1 1 0 NA NA NA
# A.Defects.D 2 1 2 NA 2 2 0 2 2 0 NA NA NA
# B.Defects.D 2 1 3 NA 3 3 0 3 3 0 NA NA NA

You can try the mat = TRUE argument. It's not exactly what you're looking for, but it's closer:
library(psych)
mydata = data.frame(Defects.I = c(1,1), Defects.D = c(2,3), Treatment = c('A','B'))
describeBy(mydata[1:2], mydata$Treatment, mat = TRUE)
gives
item group1 vars n mean sd median trimmed mad min max range skew kurtosis se
Defects.I1 1 A 1 1 1 NA 1 1 0 1 1 0 NA NA NA
Defects.I2 2 B 1 1 1 NA 1 1 0 1 1 0 NA NA NA
Defects.D1 3 A 2 1 2 NA 2 2 0 2 2 0 NA NA NA
Defects.D2 4 B 2 1 3 NA 3 3 0 3 3 0 NA NA NA

Related

New variable that indicates the first occurrence of a specific value

I want to create a new variable that indicates the first specific observation of a value for a variable.
In the following example dataset I want to have a new variable "firstna" that is "1" for the first observation of "NA" for this player.
game_data <- data.frame(player = c(1,1,1,1,2,2,2,2), level = c(1,2,3,4,1,2,3,4), points = c(20,NA,NA,NA,20,40,NA,NA))
game_data
player level points
1 1 1 20
2 1 2 NA
3 1 3 NA
4 1 4 NA
5 2 1 20
6 2 2 40
7 2 3 NA
8 2 4 NA
The resulting dataframe should look like this:
game_data_new <- data.frame(player = c(1,1,1,1,2,2,2,2), level = c(1,2,3,4,1,2,3,4), points = c(20,NA,NA,NA,20,40,NA,NA), firstna = c(0,1,0,0,0,0,1,0))
game_data_new
player level points firstna
1 1 1 20 0
2 1 2 NA 1
3 1 3 NA 0
4 1 4 NA 0
5 2 1 20 0
6 2 2 40 0
7 2 3 NA 1
8 2 4 NA 0
To be honest i don't know how to do this. It would be perfect if there is a dplyr option to do so.
A base R solution:
ave(game_data$points, game_data$player,
FUN = function(x) seq_along(x) == match(NA, x, nomatch = 0))
Another ave option to find out first NA by group (player).
game_data$firstna <- ave(game_data$points, game_data$player,
FUN = function(x) cumsum(is.na(x)) == 1)
game_data
# player level points firstna
#1 1 1 20 0
#2 1 2 NA 1
#3 1 3 NA 0
#4 1 4 NA 0
#5 2 1 20 0
#6 2 2 40 0
#7 2 3 NA 1
#8 2 4 NA 0
Here is a solution with data.table:
library("data.table")
game_data <- data.table(player = c(1,1,1,1,2,2,2,2), level = c(1,2,3,4,1,2,3,4), points = c(20,NA,NA,NA,20,40,NA,NA))
game_data[, firstna:=is.na(points) & !is.na(shift(points)), player][]
# > game_data[, firstna:=is.na(points) & !is.na(shift(points)), player][]
# player level points firstna
# 1: 1 1 20 FALSE
# 2: 1 2 NA TRUE
# 3: 1 3 NA FALSE
# 4: 1 4 NA FALSE
# 5: 2 1 20 FALSE
# 6: 2 2 40 FALSE
# 7: 2 3 NA TRUE
# 8: 2 4 NA FALSE
You can do this by grouping by player and then mutating to check if a row has an NA value and the previous row doesn't
game_data %>%
group_by(player) %>%
mutate(firstna = ifelse(is.na(points) & lag(!is.na(points)),1,0)) %>%
ungroup()
Result:
# A tibble: 8 x 4
# Groups: player [2]
player level points firstna
<dbl> <dbl> <dbl> <dbl>
1 1 1 20 0
2 1 2 NA 1
3 1 3 NA 0
4 1 4 NA 0
5 2 1 20 0
6 2 2 40 0
7 2 3 NA 1
8 2 4 NA 0
library(tidyverse)
library(data.table)
data.frame(
player = c(1,1,1,1,2,2,2,2),
level = c(1,2,3,4,1,2,3,4),
points = c(20,NA,NA,NA,20,40,NA,NA)
) -> game_data
game_data_base1 <- game_data
game_data_dt <- data.table(game_data)
microbenchmark::microbenchmark(
better_base = game_data$first_na <- ave(
game_data$points,
game_data$player,
FUN=function(x) seq_along(x)==match(NA,x,nomatch=0)
),
brute_base = do.call(
rbind.data.frame,
lapply(
split(game_data, game_data$player),
function(x) {
x$firstna <- 0
na_loc <- which(is.na(x$points))
if (length(na_loc) > 0) x$firstna[na_loc[1]] <- 1
x
}
)
),
tidy = game_data %>%
group_by(player) %>%
mutate(firstna=as.numeric(is.na(points) & !duplicated(points))) %>%
ungroup(),
dt = game_data_dt[, firstna:=as.integer(is.na(points) & !is.na(shift(points))), player]
)
## Unit: microseconds
## expr min lq mean median uq max neval
## better_base 125.188 156.861 362.9829 191.6385 355.6675 3095.958 100
## brute_base 366.642 450.002 2782.6621 658.0380 1072.6475 174373.974 100
## tidy 998.924 1119.022 2528.3687 1509.0705 2516.9350 42406.778 100
## dt 330.428 421.211 1031.9978 535.8415 1042.1240 9671.991 100
game_data %>%
group_by(player) %>%
mutate(firstna=as.numeric(is.na(points) & !duplicated(points)))
Group by player, then create a boolean vector for cases that are both NA and not duplicates for previous rows.
# A tibble: 8 x 4
# Groups: player [2]
player level points firstna
<dbl> <dbl> <dbl> <dbl>
1 1 1 20 0
2 1 2 NA 1
3 1 3 NA 0
4 1 4 NA 0
5 2 1 20 0
6 2 2 40 0
7 2 3 NA 1
8 2 4 NA 0
If you want the 1s on the last non-NA line before an NA, replace the mutate line with this:
mutate(lastnonNA=as.numeric(!is.na(points) & is.na(lead(points))))
First row of a block of NAs that runs all the way to the end of the player's group:
game_data %>%
group_by(player) %>%
mutate(firstna=as.numeric(is.na(points) & !duplicated(cbind(points,cumsum(!is.na(points))))))
Another way using base:
game_data$firstna <-
unlist(
tapply(game_data$points, game_data$player, function(x) {i<-which(is.na(x))[1];x[]<-0;x[i]<-1;x})
)
or as another ?ave clone:
ave(game_data$points, game_data$player, FUN = function(x) {
i<-which(is.na(x))[1];x[]<-0;x[i]<-1;x
})
An option using diff
transform(game_data, firstna = ave(is.na(points), player, FUN = function(x) c(0,diff(x))))
# player level points firstna
# 1 1 1 20 0
# 2 1 2 NA 1
# 3 1 3 NA 0
# 4 1 4 NA 0
# 5 2 1 20 0
# 6 2 2 40 0
# 7 2 3 NA 1
# 8 2 4 NA 0
And its dplyr equivalent:
library(dplyr)
game_data %>% group_by(player) %>% mutate(firstna = c(0,diff(is.na(points))))
# # A tibble: 8 x 4
# # Groups: player [2]
# player level points firstna
# <dbl> <dbl> <dbl> <dbl>
# 1 1 1 20 0
# 2 1 2 NA 1
# 3 1 3 NA 0
# 4 1 4 NA 0
# 5 2 1 20 0
# 6 2 2 40 0
# 7 2 3 NA 1
# 8 2 4 NA 0

Applying custom function to each row uses only first value of argument

I am trying to recode NA values to 0 in a subset of columns using the following dataset:
set.seed(1)
df <- data.frame(
id = c(1:10),
trials = sample(1:3, 10, replace = T),
t1 = c(sample(c(1:9, NA), 10)),
t2 = c(sample(c(1:7, rep(NA, 3)), 10)),
t3 = c(sample(c(1:5, rep(NA, 5)), 10))
)
Each row has a certain number of trials associated with it (between 1-3), specified by the trials column. columns t1-t3 represent scores for each trial.
The number of trials indicates the subset of columns in which NAs should be recoded to 0: NAs that are within the number of trials represent missing data, and should be recoded as 0, while NAs outside the number of trials are not meaningful, and should remain NAs. So, for a row where trials == 3, an NA in column t3 would be recoded as 0, but in a row where trials == 2, an NA in t3 would remain an NA.
So, I tried using this function:
replace0 <- function(x, num.sun) {
x[which(is.na(x[1:(num.sun + 2)]))] <- 0
return(x)
}
This works well for single vectors. When I try applying the same function to a data frame with apply(), though:
apply(df, 1, replace0, num.sun = df$trials)
I get a warning saying:
In 1:(num.sun + 2) :
numerical expression has 10 elements: only the first used
The result is that instead of having the value of num.sun change every row according to the value in trials, apply() simply uses the first value in the trials column for every single row. How could I apply the function so that the num.sun argument changes according to the value of df$trials?
Thanks!
Edit: as some have commented, the original example data had some non-NA scores that didn't make sense according to the trials column. Here's a corrected dataset:
df <- data.frame(
id = c(1:5),
trials = c(rep(1, 2), rep(2, 1), rep(3, 2)),
t1 = c(NA, 7, NA, 6, NA),
t2 = c(NA, NA, 3, 7, 12),
t3 = c(NA, NA, NA, 4, NA)
)
Another approach:
# create an index of the NA values
w <- which(is.na(df), arr.ind = TRUE)
# create an index with the max column by row where an NA is allowed to be replaced by a zero
m <- matrix(c(1:nrow(df), (df$trials + 2)), ncol = 2)
# subset 'w' such that only the NA's which fall in the scope of 'm' remain
i <- w[w[,2] <= m[,2][match(w[,1], m[,1])],]
# use 'i' to replace the allowed NA's with a zero
df[i] <- 0
which gives:
> df
id trials t1 t2 t3
1 1 1 3 NA 5
2 2 2 2 2 NA
3 3 2 6 6 4
4 4 3 0 1 2
5 5 1 5 NA NA
6 6 3 7 0 0
7 7 3 8 7 0
8 8 2 4 5 1
9 9 2 1 3 NA
10 10 1 9 4 3
You could easily wrap this in a function:
replace.NA.with.0 <- function(df) {
w <- which(is.na(df), arr.ind = TRUE)
m <- matrix(c(1:nrow(df), (df$trials + 2)), ncol = 2)
i <- w[w[,2] <= m[,2][match(w[,1], m[,1])],]
df[i] <- 0
return(df)
}
Now, using replace.NA.with.0(df) will produce the above result.
As noted by others, some rows (1, 3 & 10) have more values than trails. You could tackle that problem by rewriting the above function to:
replace.with.NA.or.0 <- function(df) {
w <- which(is.na(df), arr.ind = TRUE)
df[w] <- 0
v <- tapply(m[,2], m[,1], FUN = function(x) tail(x:5,-1))
ina <- matrix(as.integer(unlist(stack(v)[2:1])), ncol = 2)
df[ina] <- NA
return(df)
}
Now, using replace.with.NA.or.0(df) produces the following result:
id trials t1 t2 t3
1 1 1 3 NA NA
2 2 2 2 2 NA
3 3 2 6 6 NA
4 4 3 0 1 2
5 5 1 5 NA NA
6 6 3 7 0 0
7 7 3 8 7 0
8 8 2 4 5 NA
9 9 2 1 3 NA
10 10 1 9 NA NA
Here I just rewrite your function using double subsetting x[paste0('t',x['trials'])], which overcome the problem in the other two solutions with row 6
replace0 <- function(x){
#browser()
x_na <- x[paste0('t',x['trials'])]
if(is.na(x_na)){x[paste0('t',x['trials'])] <- 0}
return(x)
}
t(apply(df, 1, replace0))
id trials t1 t2 t3
[1,] 1 1 3 NA 5
[2,] 2 2 2 2 NA
[3,] 3 2 6 6 4
[4,] 4 3 NA 1 2
[5,] 5 1 5 NA NA
[6,] 6 3 7 NA 0
[7,] 7 3 8 7 0
[8,] 8 2 4 5 1
[9,] 9 2 1 3 NA
[10,] 10 1 9 4 3
Here is a way to do it:
x <- is.na(df)
df[x & t(apply(x, 1, cumsum)) > 3 - df$trials] <- 0
The output looks like this:
> df
id trials t1 t2 t3
1 1 1 3 NA 5
2 2 2 2 2 NA
3 3 2 6 6 4
4 4 3 0 1 2
5 5 1 5 NA NA
6 6 3 7 0 0
7 7 3 8 7 0
8 8 2 4 5 1
9 9 2 1 3 NA
10 10 1 9 4 3
> x <- is.na(df)
> df[x & t(apply(x, 1, cumsum)) > 3 - df$trials] <- 0
> df
id trials t1 t2 t3
1 1 1 3 NA 5
2 2 2 2 2 NA
3 3 2 6 6 4
4 4 3 0 1 2
5 5 1 5 NA NA
6 6 3 7 0 0
7 7 3 8 7 0
8 8 2 4 5 1
9 9 2 1 3 NA
10 10 1 9 4 3
Note: row 1/3/10, is problematic since there are more non-NA values than the trials.
Here's a tidyverse way, note that it doesn't give the same output as other solutions.
Your example data shows results for trials that "didn't happen", I assumed your real data doesn't.
library(tidyverse)
df %>%
nest(matches("^t\\d")) %>%
mutate(data = map2(data,trials,~mutate_all(.,replace_na,0) %>% select(.,1:.y))) %>%
unnest
# id trials t1 t2 t3
# 1 1 1 3 NA NA
# 2 2 2 2 2 NA
# 3 3 2 6 6 NA
# 4 4 3 0 1 2
# 5 5 1 5 NA NA
# 6 6 3 7 0 0
# 7 7 3 8 7 0
# 8 8 2 4 5 NA
# 9 9 2 1 3 NA
# 10 10 1 9 NA NA
Using the more commonly used gather strategy this would be:
df %>%
gather(k,v,matches("^t\\d")) %>%
arrange(id) %>%
group_by(id) %>%
slice(1:first(trials)) %>%
mutate_at("v",~replace(.,is.na(.),0)) %>%
spread(k,v)
# # A tibble: 10 x 5
# # Groups: id [10]
# id trials t1 t2 t3
# <int> <int> <dbl> <dbl> <dbl>
# 1 1 1 3 NA NA
# 2 2 2 2 2 NA
# 3 3 2 6 6 NA
# 4 4 3 0 1 2
# 5 5 1 5 NA NA
# 6 6 3 7 0 0
# 7 7 3 8 7 0
# 8 8 2 4 5 NA
# 9 9 2 1 3 NA
# 10 10 1 9 NA NA

Cross Tabulation in R Dataframe

I have a dataframe in R:
Subject T O E P Score
1 0 1 0 1 256
2 1 0 1 0 325
2 0 1 0 1 125
3 0 1 0 1 27
4 0 0 0 1 87
5 0 1 0 1 125
6 0 1 1 1 100
This is just a display of the dataframe. In reality, I have a lot of lines for each of the subjects. But the subjects are only from 1 to 6
For each Subject, the possible values are:
T : 0 or 1
O : 0 or 1
E : 0 or 1
P : 0 or 1
Score : Numeric value
I want to create a new dataframe with 6 lines (one for each subject) and the calculated MEAN score for each of these combinations :
T , O , E , P , TO , TE, TP, OE , OP , PE , TOP , TOE , POE , PET
The above will the columns of the new dataframe.
The final output should look like this
Subject T O E P TO TE TP OE OP PE TOP TOE POE PET
1
2
3
4
5
6
For each of these lines x columns the value is the MEAN SCORE
I tried aggregate and table but I can't seem to get what I want
Sorry I am new to R
Thanks
I had to rebuild sample data to answer the question as I understood it, tell me if it works for you :
set.seed(2)
df <- data.frame(subject=sample(1:3,9,T),
T = sample(c(0,1),9,T),
O = sample(c(0,1),9,T),
E = sample(c(0,1),9,T),
P = sample(c(0,1),9,T),
score=round(rnorm(9,10,3)))
# subject T O E P score
# 1 1 1 0 0 1 12
# 2 3 1 0 1 0 9
# 3 2 0 1 0 1 13
# 4 1 1 0 0 0 3
# 5 3 0 1 0 1 14
# 6 3 0 0 1 0 13
# 7 1 1 0 1 0 17
# 8 3 1 0 1 0 12
# 9 2 0 0 1 1 14
cols1 <- c("T","O","E","P")
df$comb <- apply(df[cols1],1,function(x) paste(names(df[cols1])[as.logical(x)],collapse=""))
# subject T O E P score comb
# 1 1 1 0 0 1 12 TP
# 2 3 1 0 1 0 9 TE
# 3 2 0 1 0 1 13 OP
# 4 1 1 0 0 0 3 T
# 5 3 0 1 0 1 14 OP
# 6 3 0 0 1 0 13 E
# 7 1 1 0 1 0 17 TE
# 8 3 1 0 1 0 12 TE
# 9 2 0 0 1 1 14 EP
library(tidyverse)
df %>%
group_by(subject,comb) %>%
summarize(score=mean(score)) %>%
spread(comb,score) %>%
ungroup
# # A tibble: 3 x 7
# subject E EP OP T TE TP
# * <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 NA NA NA 3 17.0 12
# 2 2 NA 14 13 NA NA NA
# 3 3 13 NA 14 NA 10.5 NA
The second step in base R:
means <- aggregate(score ~ subject + comb,df,mean)
means2 <- reshape(means,timevar="comb",idvar="subject",direction="wide")
setNames(means2,c("subject",sort(unique(df$comb))))
# subject E EP OP T TE TP
# 1 3 13 NA 14 NA 10.5 NA
# 2 2 NA 14 13 NA NA NA
# 5 1 NA NA NA 3 17.0 12
I'd do it like this:
# using your table data
df = read.table(text =
"Subject T O E P Score
1 0 1 0 1 256
2 1 0 1 0 325
2 0 1 0 1 125
3 0 1 0 1 27
4 0 0 0 1 87
5 0 1 0 1 125
6 0 1 1 1 100", stringsAsFactors = FALSE, header=TRUE)
# your desired column names
new_names <- c("T", "O", "E", "P", "TO", "TE", "TP", "OE",
"OP", "PE", "TOP", "TOE", "POE", "PET")
# assigning each of your scores to one of the desired column names
assign_comb <- function(dfrow) {
selection <- c("T", "O", "E", "P")[as.logical(dfrow[2:5])]
do.call(paste, as.list(c(selection, sep = "")))
}
df$comb <- apply(df, 1, assign_comb)
# aggregate all the means together
df_agg <- aggregate(df$Score ~ df$comb + df$Subject, FUN = mean)
# reshape the data to wide format
df_new <- reshape(df_agg, v.names = "df$Score", idvar = "df$Subject",
timevar = "df$comb", direction = "wide")
# clean up the column names to match your desired output
# any column names not found will be added as NA
colnames(df_new) <- gsub("df\\$|Score\\.", "", colnames(df_new))
df_new[, new_names[!new_names %in% colnames(df_new)]] <- NA
df_new <- df_new[, c("Subject", new_names)]
With the result:
> df_new
Subject T O E P TO TE TP OE OP PE TOP TOE POE PET
1 1 NA NA NA NA NA NA NA NA 256 NA NA NA NA NA
2 2 NA NA NA NA NA 325 NA NA 125 NA NA NA NA NA
4 3 NA NA NA NA NA NA NA NA 27 NA NA NA NA NA
5 4 NA NA NA 87 NA NA NA NA NA NA NA NA NA NA
6 5 NA NA NA NA NA NA NA NA 125 NA NA NA NA NA
7 6 NA NA NA NA NA NA NA NA NA NA NA NA NA NA

Assign values to matrix with index of other data frame

My dataframe, D is like this.
D$fit has both distance (0:6) and dg (1:3) info
D <- read.table(header = TRUE, text = "
distance dg fit
1 0 1 A
2 1 1 B
3 2 1 C
4 3 1 D
5 4 1 E
6 5 1 F
7 6 1 G
8 0 2 H
9 1 2 I
10 2 2 J
11 3 2 K
12 4 2 L
13 5 2 M
14 0 3 O
15 1 3 P
16 2 3 Q
17 3 3 R
")
I want to assign fit values to this matrix, md, corresponding to distance and dg.
md <- matrix(1:21, nrow = 7)
colnames(md) <- c(1:3)
rownames(md) <- c(0:6)
md[] <- NA
1 2 3
0 NA NA NA
1 NA NA NA
2 NA NA NA
3 NA NA NA
4 NA NA NA
5 NA NA NA
6 NA NA NA
I've tried but failed with this code
cmd = expand.grid(i=seq(0,6), j = seq(1,3))
i <- seq(0,6)
j <- seq(1,3)
md[i,j] <- D$fit[D$distance == cmd[1] & D$dg == cmd[2]]
We can use acast from library(reshape2)
library(reshape2)
acast(D, distance~dg, value.var="fit")
Or with reshape from base R
reshape(D, idvar="distance", timevar="dg", direction="wide")

R: How to calculate lag for multiple columns by group for data table

I would like to calculate the diff of variables in a data table, grouped by id. Here is some sample data. The data is recorded at a sample rate of 1 Hz. I would like to estimate the first and second derivatives (speed, acceleration)
df <- read.table(text='x y id
1 2 1
2 4 1
3 5 1
1 8 2
5 2 2
6 3 2',header=TRUE)
dt<-data.table(df)
Expected output
# dx dy id
# NA NA 1
# 1 2 1
# 1 1 1
# NA NA 2
# 4 -6 2
# 1 1 2
Here's what I've tried
dx_dt<-dt[, diff:=c(NA,diff(dt[,'x',with=FALSE])),by = id]
Output is
Error in `[.data.frame`(dt, , `:=`(diff, c(NA, diff(dt[, "x", with = FALSE]))), :
unused argument (by = id)
As pointed out by Akrun, the 'speed' terms (dx, dy) can be obtained using either data table or plyr. However, I'm unable to understand the calculation well enough to extend it to acceleration terms. So, how to calculate the 2nd lag terms?
dt[, c('dx', 'dy'):=lapply(.SD, function(x) c(NA, diff(x))),
+ by=id]
produces
x y id dx dy
1: 1 2 1 NA NA
2: 2 4 1 1 2
3: 3 5 1 1 1
4: 1 8 2 NA NA
5: 5 2 2 4 -6
6: 6 3 2 1 1
How to expand to get a second diff, or the diff of dx, dy?
x y id dx dy dx2 dy2
1: 1 2 1 NA NA NA NA
2: 2 4 1 1 2 NA NA
3: 3 5 1 1 1 0 -1
4: 1 8 2 NA NA NA NA
5: 5 2 2 4 -6 NA NA
6: 6 3 2 1 1 -3 7
You can try
setnames(dt[, lapply(.SD, function(x) c(NA,diff(x))), by=id],
2:3, c('dx', 'dy'))[]
# id dx dy
#1: 1 NA NA
#2: 1 1 2
#3: 1 1 1
#4: 2 NA NA
#5: 2 4 -6
#6: 2 1 1
Another option would be to use dplyr
library(dplyr)
df %>%
group_by(id) %>%
mutate_each(funs(c(NA,diff(.))))%>%
rename(dx=x, dy=y)
Update
You can repeat the step twice
dt[, c('dx', 'dy'):=lapply(.SD, function(x) c(NA, diff(x))), by=id]
dt[,c('dx2', 'dy2'):= lapply(.SD, function(x) c(NA, diff(x))),
by=id, .SDcols=4:5]
dt
# x y id dx dy dx2 dy2
#1: 1 2 1 NA NA NA NA
#2: 2 4 1 1 2 NA NA
#3: 3 5 1 1 1 0 -1
#4: 1 8 2 NA NA NA NA
#5: 5 2 2 4 -6 NA NA
#6: 6 3 2 1 1 -3 7
Or we can use the shift function from data.table
dt[, paste0("d", c("x", "y")) := .SD - shift(.SD), by = id
][, paste0("d", c("x2", "y2")) := .SD - shift(.SD) , by = id, .SDcols = 4:5 ]

Resources