I have a list of points a1, a2, b1 and b2 and a set of reference points. I would like to compute the distance from each of the four points to all of the reference points. I can do this without any problem using the following example:
library(dplyr)
x <- rep(1:5, 5)
y <- rep(1:5, each = 5)
a1.x <- c(4.5)
a1.y <- c(6)
a2.x <- c(0.8)
a2.y <- c(3.2)
b1.x <- c(2.5)
b1.y <- c(5)
b2.x <- c(3.8)
b2.y <- c(1.5)
time <- 1
time1 <- as.data.frame(cbind(time,x,y,a1.x,a1.y,a2.x,a2.y,b1.x,b1.y,b2.x,b2.y))
x <- rep(1:5, 5)
y <- rep(1:5, each = 5)
a1.x <- c(4)
a1.y <- c(5)
a2.x <- c(1.5)
a2.y <- c(3.9)
b1.x <- c(1.4)
b1.y <- c(4.6)
b2.x <- c(6)
b2.y <- c(5.2)
time <- 2
time2 <- as.data.frame(cbind(time,x,y,a1.x,a1.y,a2.x,a2.y,b1.x,b1.y,b2.x,b2.y))
df <- rbind(time1,time2)
df <- df %>%
mutate(dista1 = sqrt((x-a1.x)^2 + (y-a1.y)^2)) %>%
mutate(dista2 = sqrt((x-a2.x)^2 + (y-a2.y)^2)) %>%
mutate(distb1 = sqrt((x-b1.x)^2 + (y-b1.y)^2)) %>%
mutate(distb2 = sqrt((x-b2.x)^2 + (y-b2.y)^2))
This works fine and is fairly fast. However, on a larger data set it is slowed down by having to type out every column or fall back on for loops. What is the most efficient way to perform the above?
Edit: additionally, I have a time factor as well.
Using the rdist function in package fields is simpler:
library(fields) #use install.packages("fields") first
pts <- cbind(x= c(a1.x, a2.x, b1.x, b2.x), y=c(a1.y, a2.y, b1.y, b2.y))
ref <- cbind(x, y)
distances <- rdist(ref, pts)
colnames(distances) <- c("dista1", "dista2", "distb1", "distb2")
head(distances)
# dista1 dista2 distb1 distb2
# [1,] 6.103278 2.209072 4.272002 2.8442925
# [2,] 5.590170 2.505993 4.031129 1.8681542
# [3,] 5.220153 3.111270 4.031129 0.9433981
# [4,] 5.024938 3.883298 4.272002 0.5385165
# [5,] 5.024938 4.741308 4.716991 1.3000000
# [6,] 5.315073 1.216553 3.354102 2.8442925
If you want to match df in your example:
df <- cbind(ref, a1.x, a1.y, a2.x, a2.y, b1.x, b1.y, b2.x, b2.y, distances)
head(df)
# x y a1.x a1.y a2.x a2.y b1.x b1.y b2.x b2.y dista1 dista2 distb1 distb2
# [1,] 1 1 4.5 6 0.8 3.2 2.5 5 3.8 1.5 6.103278 2.209072 4.272002 2.8442925
# [2,] 2 1 4.5 6 0.8 3.2 2.5 5 3.8 1.5 5.590170 2.505993 4.031129 1.8681542
# [3,] 3 1 4.5 6 0.8 3.2 2.5 5 3.8 1.5 5.220153 3.111270 4.031129 0.9433981
# [4,] 4 1 4.5 6 0.8 3.2 2.5 5 3.8 1.5 5.024938 3.883298 4.272002 0.5385165
# [5,] 5 1 4.5 6 0.8 3.2 2.5 5 3.8 1.5 5.024938 4.741308 4.716991 1.3000000
# [6,] 1 2 4.5 6 0.8 3.2 2.5 5 3.8 1.5 5.315073 1.216553 3.354102 2.8442925
If there are multiple times, this approach can be extended. First, extract the point and reference matrices from your time1 and time2 objects:
time1.pts <- matrix(unlist(time1[1, 4:11]), 4, 2, byrow=TRUE)
time2.pts <- matrix(unlist(time2[1, 4:11]), 4, 2, byrow=TRUE)
ref1 <- as.matrix(time1[, 2:3])
ref2 <- as.matrix(time2[, 2:3])
ref <- list(ref1=ref1, ref2=ref2)
pts <- list(time1.pts=time1.pts, time2.pts=time2.pts)
Matrices are faster to process than data frames, so this should be quicker than working with data frames. Now the analysis:
ntimes <- length(pts) # number of time periods
results <- lapply(seq(ntimes), function(i) rdist(ref[[i]], pts[[i]]))
distances <- do.call(rbind, results)
colnames(distances) <- c("dista1", "dista2", "distb1", "distb2")
The distances matrix contains all of the distances. Now we just combine them with your df:
df <- data.frame(df, distances)
options(digits=4)
head(df, 5); cat(". . . . .\n"); tail(df, 5)
# time x y a1.x a1.y a2.x a2.y b1.x b1.y b2.x b2.y dista1 dista2 distb1 distb2
# 1 1 1 1 4.5 6 0.8 3.2 2.5 5 3.8 1.5 6.103 2.209 4.272 2.8443
# 2 1 2 1 4.5 6 0.8 3.2 2.5 5 3.8 1.5 5.590 2.506 4.031 1.8682
# 3 1 3 1 4.5 6 0.8 3.2 2.5 5 3.8 1.5 5.220 3.111 4.031 0.9434
# 4 1 4 1 4.5 6 0.8 3.2 2.5 5 3.8 1.5 5.025 3.883 4.272 0.5385
# 5 1 5 1 4.5 6 0.8 3.2 2.5 5 3.8 1.5 5.025 4.741 4.717 1.3000
# . . . . .
# time x y a1.x a1.y a2.x a2.y b1.x b1.y b2.x b2.y dista1 dista2 distb1 distb2
# 46 2 1 5 4 5 1.5 3.9 1.4 4.6 6 5.2 3 1.208 0.5657 5.004
# 47 2 2 5 4 5 1.5 3.9 1.4 4.6 6 5.2 2 1.208 0.7211 4.005
# 48 2 3 5 4 5 1.5 3.9 1.4 4.6 6 5.2 1 1.860 1.6492 3.007
# 49 2 4 5 4 5 1.5 3.9 1.4 4.6 6 5.2 0 2.731 2.6306 2.010
# 50 2 5 5 4 5 1.5 3.9 1.4 4.6 6 5.2 1 3.669 3.6222 1.020
I would suggest a base-R approach:
referencePointList <- list(a1 = c(4.5,6), a2 = c(0.8,3.2), b1 = c(2.5,5),b2 = c(3.8,1.5))
distanceDfToReferencePoints <- function(x,y,referencePointList) {
distDf <- setNames(data.frame(do.call("cbind", lapply(referencePointList, function(rp) {
sqrt((x-rp[1])^2+(y-rp[2])^2)
}))), paste0("dist_",names(referencePointList)))
cbind(data.frame(x=x,y=y),distDf)
}
Let's put your method into a function, say
f0 <- function() {
df <- as.data.frame(cbind(x,y,a1.x,a1.y,a2.x,a2.y,b1.x,b1.y,b2.x,b2.y))
df %>%
mutate(dista1 = sqrt((x-a1.x)^2 + (y-a1.y)^2)) %>%
mutate(dista2 = sqrt((x-a2.x)^2 + (y-a2.y)^2)) %>%
mutate(distb1 = sqrt((x-b1.x)^2 + (y-b1.y)^2)) %>%
mutate(distb2 = sqrt((x-b2.x)^2 + (y-b2.y)^2))
}
and do a benchmark:
microbenchmark::microbenchmark(distanceDfToReferencePoints(x,y,referencePointList),f0())
giving, on my machine:
Unit: microseconds
expr min lq mean median uq max neval
distanceDfToReferencePoints(x, y, referencePointList) 302.3 340.90 543.882 411.15 479.8 11869.5 100
f0() 6074.3 6557.85 7497.018 6994.55 7591.5 28291.7 100
Hence the base-R solution seems to be much faster. Maybe that already helps you.
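The question's edit also mentions a time column. That is not handled above, but a hedged sketch of one way to reuse the same function per time period (splitting the questioner's df by time and rebuilding the reference-point list from each piece) could look like this:
#sketch only: apply the function separately to each time period of df
res <- do.call(rbind, lapply(split(df, df$time), function(d) {
  refs <- list(a1 = c(d$a1.x[1], d$a1.y[1]), a2 = c(d$a2.x[1], d$a2.y[1]),
               b1 = c(d$b1.x[1], d$b1.y[1]), b2 = c(d$b2.x[1], d$b2.y[1]))
  cbind(time = d$time, distanceDfToReferencePoints(d$x, d$y, refs))
}))
head(res)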
a data.table approach
library(data.table)
setDT(df)
#get names of reference points (this assumes df holds only the x, y and point columns)
refpoints <- unique( sub( "(^.*)\\.[xy]", "\\1", names( df[, -c(1,2)] ) ) )
#melt
DT <- melt(df, id.vars = c("x","y"), measure.vars = patterns(x2 = ".*\\.x", y2 = ".*\\.y"))
#set points' names
setattr(DT$variable, "levels", refpoints )
#calculate distance
DT[, distance := sqrt((x-x2)^2 + (y-y2)^2)]
#cast to wide again
dcast(DT, x + y ~ paste0("dist_", variable), value.var = "distance")
# x y dist_a1 dist_a2 dist_b1 dist_b2
# 1: 1 1 6.103278 2.2090722 4.272002 2.8442925
# 2: 1 2 5.315073 1.2165525 3.354102 2.8442925
# 3: 1 3 4.609772 0.2828427 2.500000 3.1764760
# 4: 1 4 4.031129 0.8246211 1.802776 3.7536649
# 5: 1 5 3.640055 1.8110770 1.500000 4.4821870
# 6: 2 1 5.590170 2.5059928 4.031129 1.8681542
# 7: 2 2 4.716991 1.6970563 3.041381 1.8681542
# 8: 2 3 3.905125 1.2165525 2.061553 2.3430749
# 9: 2 4 3.201562 1.4422205 1.118034 3.0805844
#10: 2 5 2.692582 2.1633308 0.500000 3.9357337
#11: 3 1 5.220153 3.1112698 4.031129 0.9433981
#12: 3 2 4.272002 2.5059928 3.041381 0.9433981
#13: 3 3 3.354102 2.2090722 2.061553 1.7000000
#14: 3 4 2.500000 2.3409400 1.118034 2.6248809
#15: 3 5 1.802776 2.8425341 0.500000 3.5902646
#16: 4 1 5.024938 3.8832976 4.272002 0.5385165
#17: 4 2 4.031129 3.4176015 3.354102 0.5385165
#18: 4 3 3.041381 3.2062439 2.500000 1.5132746
#19: 4 4 2.061553 3.2984845 1.802776 2.5079872
#20: 4 5 1.118034 3.6715120 1.500000 3.5057096
#21: 5 1 5.024938 4.7413078 4.716991 1.3000000
#22: 5 2 4.031129 4.3680659 3.905125 1.3000000
#23: 5 3 3.041381 4.2047592 3.201562 1.9209373
#24: 5 4 2.061553 4.2755117 2.692582 2.7730849
#25: 5 5 1.118034 4.5694639 2.500000 3.7000000
# x y dist_a1 dist_a2 dist_b1 dist_b2
My suggestion is to use Rfast::dista() and see the time for yourself.
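For example, a rough sketch (dista() takes two coordinate matrices; I am assuming the result comes back with one row per reference point, which is worth checking and transposing if not):
library(Rfast)
pts <- cbind(x = c(a1.x, a2.x, b1.x, b2.x), y = c(a1.y, a2.y, b1.y, b2.y))
ref <- cbind(x, y)
d <- dista(ref, pts) #Euclidean distances between the rows of ref and the rows of pts
colnames(d) <- c("dista1", "dista2", "distb1", "distb2")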
Using the tidyverse:
df %>%
rename(x_new = x, y_new = y)%>%
pivot_longer(3:ncol(df), names_pattern ="(\\w+)\\.(\\w+)",
names_to = c('var', '.value')) %>%
mutate(value = sqrt((x-x_new)^2 + (y-y_new)^2)) %>%
pivot_wider(id_cols = c(x_new, y_new), names_from = var, values_from = value)
x_new y_new a1 a2 b1 b2
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 6.10 2.21 4.27 2.84
2 2 1 5.59 2.51 4.03 1.87
3 3 1 5.22 3.11 4.03 0.943
4 4 1 5.02 3.88 4.27 0.539
5 5 1 5.02 4.74 4.72 1.3
6 1 2 5.32 1.22 3.35 2.84
7 2 2 4.72 1.70 3.04 1.87
8 3 2 4.27 2.51 3.04 0.943
9 4 2 4.03 3.42 3.35 0.539
10 5 2 4.03 4.37 3.91 1.3
# ... with 15 more rows
If you can extract the columns by hand, then you could make this even faster:
pts <- cbind(x= c(a1.x, a2.x, b1.x, b2.x), y=c(a1.y, a2.y, b1.y, b2.y))
ref <- cbind(x, y)
sqrt(laGP::distance(ref, pts))
[,1] [,2] [,3] [,4]
[1,] 6.103278 2.2090722 4.272002 2.8442925
[2,] 5.590170 2.5059928 4.031129 1.8681542
[3,] 5.220153 3.1112698 4.031129 0.9433981
[4,] 5.024938 3.8832976 4.272002 0.5385165
[5,] 5.024938 4.7413078 4.716991 1.3000000
[6,] 5.315073 1.2165525 3.354102 2.8442925
[7,] 4.716991 1.6970563 3.041381 1.8681542
[8,] 4.272002 2.5059928 3.041381 0.9433981
or even:
apply(pts, 1, function(p) sqrt((ref[,1] - p[1])^2 + (ref[,2] - p[2])^2))
         [,1]      [,2]     [,3]      [,4]
[1,] 6.103278 2.2090722 4.272002 2.8442925
[2,] 5.590170 2.5059928 4.031129 1.8681542
[3,] 5.220153 3.1112698 4.031129 0.9433981
[4,] 5.024938 3.8832976 4.272002 0.5385165
[5,] 5.024938 4.7413078 4.716991 1.3000000
[6,] 5.315073 1.2165525 3.354102 2.8442925
[7,] 4.716991 1.6970563 3.041381 1.8681542
[8,] 4.272002 2.5059928 3.041381 0.9433981
If you are using R >= 4.1
df |>
reshape(matrix(3:ncol(df), 2), dir='long') |>
transform(new_pt = sqrt((x-a1.x)^2 + (y - a1.y)^2)) |>
reshape(v.names = c('a1.x','a1.y','new_pt'),dir='wide')
If not, change the |> pipe to %>%
My data:
data <- c(1,5,11,15,24,31,32,65)
There are two neighbours: 31 and 32. I wish to remove them and keep only their mean value (e.g. 31.5), so that the data would become:
data <- c(1,5,11,15,24,31.5,65)
It seems simple, but I wish to do it automatically, and sometimes with vectors containing more neighbours. For instance:
data_2 <- c(1,5,11,15,24,31,32,65,99,100,101,140)
Here is another idea that creates a group id via cumsum(c(TRUE, diff(a) > 1)), where 1 is the gap threshold, i.e.
#our group variable
grp <- cumsum(c(TRUE, diff(a) > 1))
#keep only groups with length 1 (i.e. with no neighbor)
i1 <- a[!!!ave(a, grp, FUN = function(i) length(i) > 1)]
#Find the mean of the groups with more than 1 rows,
i2 <- unname(tapply(a, grp, function(i)mean(i[length(i) > 1])))
#Concatenate the above 2 (eliminating NAs from i2) to get final result
c(i1, i2[!is.na(i2)])
#[1] 1.0 5.0 11.0 15.0 24.0 65.0 31.5
You can also wrap it in a function. I left the gap as a parameter so you can adjust it:
get_vec <- function(x, gap) {
grp <- cumsum(c(TRUE, diff(x) > gap))
i1 <- x[!!!ave(x, grp, FUN = function(i) length(i) > 1)]
i2 <- unname(tapply(x, grp, function(i) mean(i[length(i) > 1])))
return(c(i1, i2[!is.na(i2)]))
}
get_vec(a, 1)
#[1] 1.0 5.0 11.0 15.0 24.0 65.0 31.5
get_vec(a_2, 1)
#[1] 1.0 5.0 11.0 15.0 24.0 65.0 140.0 31.5 100.0
DATA:
a <- c(1,5,11,15,24,31,32,65)
a_2 <- c(1, 5, 11, 15, 24, 31, 32, 65, 99, 100, 101, 140)
Here is my solution, which uses run-length encoding to identify groups:
foo <- function(x) {
y <- x - seq_along(x) #normalize to zero differences in groups
ind <- rle(y) #run-length encoding
ind$values <- ind$lengths != 1 #to find groups
ind$values[ind$values] <- cumsum(ind$values[ind$values]) #group ids
ind <- inverse.rle(ind)
xnew <- x
xnew[ind != 0] <- ave(x, ind, FUN = mean)[ind != 0] #calculate means
xnew[!(duplicated(ind) & ind != 0)] #remove duplicates from groups
}
foo(data)
#[1] 1.0 5.0 11.0 15.0 24.0 31.5 65.0
foo(data_2)
#[1] 1.0 5.0 11.0 15.0 24.0 31.5 65.0 100.0 140.0
data_3 <- c(1, 2, 4, 1, 2)
foo(data_3)
#[1] 1.5 4.0 1.5
I assume that you don't need an extremely efficient solution. If you do, I'd recommend a simple C++ for loop in Rcpp.
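Not part of the solution above, but a minimal sketch of what such an Rcpp loop could look like (the function name and the gap handling are illustrative only, and the input is assumed to be sorted):
library(Rcpp)
cppFunction('
NumericVector collapse_neighbours(NumericVector x, double gap) {
  std::vector<double> out;
  int n = x.size();
  int i = 0;
  while (i < n) {
    int j = i;
    // extend the run while consecutive values are within the gap
    while (j + 1 < n && x[j + 1] - x[j] <= gap) j++;
    double s = 0.0;
    for (int k = i; k <= j; k++) s += x[k];
    out.push_back(s / (j - i + 1)); // mean of the run; single values stay as they are
    i = j + 1;
  }
  return wrap(out);
}')
collapse_neighbours(c(1, 5, 11, 15, 24, 31, 32, 65), 1)
#[1]  1.0  5.0 11.0 15.0 24.0 31.5 65.0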
I have a data.table-based solution; the same could be translated into dplyr, I guess:
library(data.table)
df <- data.table(data2 = c(1,5,11,15,24,31,32,65,99,100,101,140))
df[,neighbours := ifelse(c(0,diff(data2)) == 1,1,0)]
df[,neighbours := c(neighbours[1:(.N-1)],1),by = rleid(neighbours)]
df[,neigh_seq := rleid(neighbours)]
unique(df[,ifelse(neighbours == 1,mean(data2),data2),by = neigh_seq])
neigh_seq V1
1: 1 1.0
2: 1 5.0
3: 1 11.0
4: 1 15.0
5: 1 24.0
6: 2 31.5
7: 3 65.0
8: 4 100.0
9: 5 140.0
What it does:
The first line sets neighbours to 1 if the difference with the previous number is 1:
1: 1 0
2: 5 0
3: 11 0
4: 15 0
5: 24 0
6: 31 0
7: 32 1
8: 65 0
9: 99 0
10: 100 1
11: 101 1
12: 140 0
I want to group so that the neighbours variable is 1 for all neighbours, so I need to flag the value just before each run of 1s as well:
df[,neighbours := c(neighbours[1:(.N-1)],1),by = rleid(neighbours)]
data2 neighbours
1: 1 0
2: 5 0
3: 11 0
4: 15 0
5: 24 0
6: 31 1
7: 32 1
8: 65 0
9: 99 1
10: 100 1
11: 101 1
12: 140 0
Then I just group on changes in the neighbours value and set the value to the group mean where they are neighbours:
df[,ifelse(neighbours == 1,mean(data2),data2),by = rleid(neighbours)]
rleid V1
1: 1 1.0
2: 1 5.0
3: 1 11.0
4: 1 15.0
5: 1 24.0
6: 2 31.5
7: 2 31.5
8: 3 65.0
9: 4 100.0
10: 4 100.0
11: 4 100.0
12: 5 140.0
and take the unique values. And voila.
This is a dplyr version, also using cumsum(c(1, diff(x) != 1)) as the grouping variable:
library(dplyr)
data_2 %>% data.frame(x = .) %>%
group_by(id = cumsum(c(1,diff(x)!=1))) %>%
summarise(res = mean(x)) %>%
select(res)
# A tibble: 9 x 1
res
<dbl>
1 1.0
2 5.0
3 11.0
4 15.0
5 24.0
6 31.5
7 65.0
8 100.0
9 140.0
I am trying to fill blanks in var1 with the mean of var2 and var3, but I can't get it to work. This is what I've tried so far:
df <- data.frame(var1=c(1,2,"",3,3,"","",2,2,6,7,3,"","","",3,3,11,12,2,"",3))
df$var2 <- c(1,8,9,1,1,5,8,8,3,2,0,9,4,4,7,3,5,5,2,4,6,6)
df$var3 <- c(4,1,1,4,4,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22)
for(i in 1:length(df$var1)) {
ifelse(is.na(df$var1[i]), df$var1[i] <- mean(df$var2[i], df$var3[i]), df$var1[i] == df$var1[i])
}
I am not sure what I am doing wrong. After running the code, var1 still shows empty cells.
Thank you very much for your help
Try this:
df <- data.frame(var1 = c(1,2,"",3,3,"","",2,2,6,7,3,"","","",3,3,11,12,2,"",3),
var2 = c(1,8,9,1,1,5,8,8,3,2,0,9,4,4,7,3,5,5,2,4,6,6),
var3 = c(4,1,1,4,4,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22),
stringsAsFactors = FALSE)
df[df==""] <- "NA"
for (i in 1:length(df$var1)) {
if (df$var1[i]== "NA") {
df$var1[i] = rowMeans(df[i, 2:3])
} else {
df$var1[i] = df$var1[i]
}
}
Or:
for (i in 1:length(df[,1])) {
ifelse (df[i,1] == "NA", df[i,1] <- rowMeans(df[i, 2:3]), df[i,1] <- df[i,1])
}
Alternatively, instead of redefining blanks as "NA" (as text in the example above), you could leave them blank, skipping the df[df==""] <- "NA" bit:
for (i in 1:length(df[,1])) {
ifelse (df[i,1] == "", df[i,1] <- rowMeans(df[i, 2:3]), df[i,1] <- df[i,1])
}
Or identify blanks as "real" NAs:
df[df==""] <- NA
for (i in 1:length(df[,1])) {
ifelse (is.na(df[i,1]), df[i,1] <- rowMeans(df[i, 2:3]), df[i,1] <- df[i,1])
}
Another way without any loops:
library(dplyr)
df %>%
mutate_at(vars(var1:var3), as.numeric) %>%
mutate(var1 = case_when(is.na(var1) ~ (var2+var3)/2, TRUE ~ var1))
#> var1 var2 var3
#> 1 1.0 1 4
#> 2 2.0 8 1
#> 3 5.0 9 1
#> 4 3.0 1 4
#> 5 3.0 1 4
#> 6 5.5 5 6
#> 7 7.5 8 7
#> 8 2.0 8 8
#> 9 2.0 3 9
#> 10 6.0 2 10
#> 11 7.0 0 11
#> 12 3.0 9 12
#> 13 8.5 4 13
#> 14 9.0 4 14
#> 15 11.0 7 15
#> 16 3.0 3 16
#> 17 3.0 5 17
#> 18 11.0 5 18
#> 19 12.0 2 19
#> 20 2.0 4 20
#> 21 13.5 6 21
#> 22 3.0 6 22
I would use a data.table approach here. It should work well with larger data and avoids looping over your data where you don't need to.
library(data.table)
dt <- data.table(var1=c(1,2,"",3,3,"","",2,2,6,7,3,"","","",3,3,11,12,2,"",3),
var2 = c(1,8,9,1,1,5,8,8,3,2,0,9,4,4,7,3,5,5,2,4,6,6),
var3 = c(4,1,1,4,4,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22))
dt[, var1 := as.numeric(var1)]
dt[is.na(var1), var1 := apply(.SD, 1, mean), .SDcols =c("var2", "var3")]
dt
var1 var2 var3
1: 1.0 1 4
2: 2.0 8 1
3: 5.0 9 1
4: 3.0 1 4
5: 3.0 1 4
6: 5.5 5 6
7: 7.5 8 7
8: 2.0 8 8
9: 2.0 3 9
10: 6.0 2 10
11: 7.0 0 11
12: 3.0 9 12
13: 8.5 4 13
14: 9.0 4 14
15: 11.0 7 15
16: 3.0 3 16
17: 3.0 5 17
18: 11.0 5 18
19: 12.0 2 19
20: 2.0 4 20
21: 13.5 6 21
22: 3.0 6 22
So, I'm working with a data frame that has daily data over a period of 444 days. I have several variables that I want to lag for use in a regression model (lm). I want to lag them 7 times each. I'm currently generating the lags like this...
email_data$email_reach1 <- lag(ts(email_data$email_reach, start = 1, end = 444), 1)
email_data$email_reach2 <- lag(ts(email_data$email_reach, start = 1, end = 444), 2)
email_data$email_reach3 <- lag(ts(email_data$email_reach, start = 1, end = 444), 3)
email_data$email_reach4 <- lag(ts(email_data$email_reach, start = 1, end = 444), 4)
email_data$email_reach5 <- lag(ts(email_data$email_reach, start = 1, end = 444), 5)
email_data$email_reach6 <- lag(ts(email_data$email_reach, start = 1, end = 444), 6)
email_data$email_reach7 <- lag(ts(email_data$email_reach, start = 1, end = 444), 7)
Then, I repeat this for every single variable I want to lag.
This seems like a terrible way of accomplishing this. Is there something better?
I've thought about lagging the entire data frame, which works, but I don't know how you'd assign variable names to the result and merge it back to the original data frame.
You can also use data.table. (HT to #akrun)
set.seed(1)
email_data <- data.frame(dates=1:10, email_reach=rbinom(10, 10, 0.5))
library(data.table)
setDT(email_data)[, paste0('email_reach', 1:3) := shift(email_reach, 1:3)][]
# dates email_reach email_reach1 email_reach2 email_reach3
# 1: 1 4 NA NA NA
# 2: 2 4 4 NA NA
# 3: 3 5 4 4 NA
# 4: 4 7 5 4 4
# 5: 5 4 7 5 4
# 6: 6 7 4 7 5
# 7: 7 7 7 4 7
# 8: 8 6 7 7 4
# 9: 9 6 6 7 7
#10: 10 3 6 6 7
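Since the question mentions several variables, the same idea extends to many columns in one call; a hedged sketch (other_var is a placeholder column name, and I am assuming shift() returns the lags grouped by column, which is worth verifying):
vars <- c("email_reach", "other_var")
setDT(email_data)[, paste0(rep(vars, each = 7), 1:7) := shift(.SD, 1:7), .SDcols = vars]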
Another approach is to use the xts library. A little example follows; we start out with:
x <- ts(matrix(rnorm(100),ncol=2), start=c(2009, 1), frequency=12)
head(x)
Series 1 Series 2
[1,] -1.82934747 -0.1234372
[2,] 1.08371836 1.3365919
[3,] 0.95786815 0.0885484
[4,] 0.59301446 -0.6984993
[5,] -0.01094955 -0.3729762
[6,] -0.19256525 0.3137705
Convert it to xts and call lag(), here with 0, 1, 2 lags to keep the output small:
library(xts)
head(lag(as.xts(x),0:2))
Series.1 Series.2 Series.1.1 Series.2.1 Series.1.2 Series.2.2
jan 2009 -1.82934747 -0.1234372 NA NA NA NA
feb 2009 1.08371836 1.3365919 -1.82934747 -0.1234372 NA NA
mar 2009 0.95786815 0.0885484 1.08371836 1.3365919 -1.8293475 -0.1234372
apr 2009 0.59301446 -0.6984993 0.95786815 0.0885484 1.0837184 1.3365919
maj 2009 -0.01094955 -0.3729762 0.59301446 -0.6984993 0.9578682 0.0885484
jun 2009 -0.19256525 0.3137705 -0.01094955 -0.3729762 0.5930145 -0.6984993
I think this does the same as your code above, for any given n.
n <- 7
for (i in 1:n) {
email_data[[paste0("email_reach", i)]] <- lag(ts(email_data$email_reach, start = 1, end = 444), i)
}
This isn't really an answer, just using the answer format as an elaboration of my warning above:
email_data <- data.frame( email_reach=ts(email_data$email_reach, start = 1, end = 444))
Then your code and this is what you get:
> head(email_data, 10)
email_reach email_reach1 email_reach2 email_reach3 email_reach4
1 4 4 4 4 4
2 4 4 4 4 4
3 5 5 5 5 5
4 7 7 7 7 7
5 4 4 4 4 4
6 7 7 7 7 7
7 7 7 7 7 7
8 6 6 6 6 6
9 6 6 6 6 6
10 3 3 3 3 3
email_reach5 email_reach6 email_reach7
1 4 4 4
2 4 4 4
3 5 5 5
4 7 7 7
5 4 4 4
6 7 7 7
7 7 7 7
8 6 6 6
9 6 6 6
10 3 3 3
Based on the answer by Molx, but generalized for any list of variables, and patched up a bit... Thanks Molx!
do_lag <- function(the_data, variables, num_periods) {
num_vars <- length(variables)
num_rows <- nrow(the_data)
for (j in 1:num_vars) {
for (i in 1:num_periods) {
the_data[[paste0(variables[j], i)]] <- c(rep(NA, i), head(the_data[[variables[j]]], num_rows - i))
}
}
return(the_data)
}
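A hypothetical call for the original use case (email_opens is a made-up second column name):
email_data <- do_lag(email_data, c("email_reach", "email_opens"), 7) #lag both columns 1 to 7 days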
collapse::flag provides a general and fast (C++ based) solution to this problem:
library(collapse)
# Time-series (also supports xts and others)
head(flag(AirPassengers, -1:2))
## F1 -- L1 L2
## Jan 1949 118 112 NA NA
## Feb 1949 132 118 112 NA
## Mar 1949 129 132 118 112
## Apr 1949 121 129 132 118
## May 1949 135 121 129 132
## Jun 1949 148 135 121 129
# Time-series matrix
head(flag(EuStockMarkets, -1:2))
## Time Series:
## Start = c(1991, 130)
## End = c(1998, 169)
## Frequency = 260
## F1.DAX DAX L1.DAX L2.DAX F1.SMI SMI L1.SMI L2.SMI F1.CAC CAC L1.CAC L2.CAC F1.FTSE FTSE L1.FTSE L2.FTSE
## 1991.496 1613.63 1628.75 NA NA 1688.5 1678.1 NA NA 1750.5 1772.8 NA NA 2460.2 2443.6 NA NA
## 1991.500 1606.51 1613.63 1628.75 NA 1678.6 1688.5 1678.1 NA 1718.0 1750.5 1772.8 NA 2448.2 2460.2 2443.6 NA
## 1991.504 1621.04 1606.51 1613.63 1628.75 1684.1 1678.6 1688.5 1678.1 1708.1 1718.0 1750.5 1772.8 2470.4 2448.2 2460.2 2443.6
## 1991.508 1618.16 1621.04 1606.51 1613.63 1686.6 1684.1 1678.6 1688.5 1723.1 1708.1 1718.0 1750.5 2484.7 2470.4 2448.2 2460.2
## 1991.512 1610.61 1618.16 1621.04 1606.51 1671.6 1686.6 1684.1 1678.6 1714.3 1723.1 1708.1 1718.0 2466.8 2484.7 2470.4 2448.2
## 1991.515 1630.75 1610.61 1618.16 1621.04 1682.9 1671.6 1686.6 1684.1 1734.5 1714.3 1723.1 1708.1 2487.9 2466.8 2484.7 2470.4
# Data frame
head(flag(airquality[1:3], -1:2))
## F1.Ozone Ozone L1.Ozone L2.Ozone F1.Solar.R Solar.R L1.Solar.R L2.Solar.R F1.Wind Wind L1.Wind L2.Wind
## 1 36 41 NA NA 118 190 NA NA 8.0 7.4 NA NA
## 2 12 36 41 NA 149 118 190 NA 12.6 8.0 7.4 NA
## 3 18 12 36 41 313 149 118 190 11.5 12.6 8.0 7.4
## 4 NA 18 12 36 NA 313 149 118 14.3 11.5 12.6 8.0
## 5 28 NA 18 12 NA NA 313 149 14.9 14.3 11.5 12.6
## 6 23 28 NA 18 299 NA NA 313 8.6 14.9 14.3 11.5
# Panel lag
head(flag(iris[1:2], -1:2, iris$Species))
## Panel-lag computed without timevar: Assuming ordered data
## F1.Sepal.Length Sepal.Length L1.Sepal.Length L2.Sepal.Length F1.Sepal.Width Sepal.Width L1.Sepal.Width L2.Sepal.Width
## 1 4.9 5.1 NA NA 3.0 3.5 NA NA
## 2 4.7 4.9 5.1 NA 3.2 3.0 3.5 NA
## 3 4.6 4.7 4.9 5.1 3.1 3.2 3.0 3.5
## 4 5.0 4.6 4.7 4.9 3.6 3.1 3.2 3.0
## 5 5.4 5.0 4.6 4.7 3.9 3.6 3.1 3.2
## 6 4.6 5.4 5.0 4.6 3.4 3.9 3.6 3.1
Similarly, collapse::fdiff and collapse::fgrowth support suitably lagged/leaded and iterated (quasi-, log-) differences and growth rates on (multivariate) time series and panels.
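A quick illustration of those companions on the same built-in series, assuming their default arguments:
head(fdiff(AirPassengers))   #first differences
head(fgrowth(AirPassengers)) #period-on-period growth rates (in percent by default)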
I have the following dummy data set:
ID TIME DDAY DV
1 0 50 6.6
1 12 50 6.1
1 24 50 5.6
1 48 50 7.6
2 0 10 6.6
2 12 10 6.6
2 24 10 6.6
2 48 10 6.6
3 0 50 3.6
3 12 50 6.8
3 24 50 9.6
3 48 50 7.1
4 0 10 8.6
4 12 10 6.4
4 24 10 4.6
4 48 10 5.6
I want to create summary table for mean and standard deviations for DV as shown below:
N TIME DDAY MEAN-DV SD-DV
2 0 50 6.5 1.1
2 12 50 6.1 0.8
2 24 50 4.5 2.0
2 48 50 7.5 1.0
2 0 10 6.9 1.5
2 12 10 8.5 1.3
2 24 10 6.1 0.9
2 48 10 4.5 1.8
How do I do this in R?
You can use:
1) dplyr:
library(dplyr)
dat %>%
group_by(TIME, DDAY) %>%
summarise(MEAN_DV = mean(DV), SD_DV = sd(DV), N = length(DV))
# TIME DDAY MEAN_DV SD_DV N
# 1 48 10 6.10 0.7071068 2
# 2 24 10 5.60 1.4142136 2
# 3 12 10 6.50 0.1414214 2
# 4 0 10 7.60 1.4142136 2
# 5 48 50 7.35 0.3535534 2
# 6 24 50 7.60 2.8284271 2
# 7 12 50 6.45 0.4949747 2
# 8 0 50 5.10 2.1213203 2
where dat is the name of your data frame.
2) data.table:
library(data.table)
DT <- as.data.table(dat)
DT[ , list(MEAN_DV = mean(DV), SD_DV = sd(DV), N = .N), by = c("TIME", "DDAY")]
# TIME DDAY MEAN_DV SD_DV N
# 1: 0 50 5.10 2.1213203 2
# 2: 12 50 6.45 0.4949747 2
# 3: 24 50 7.60 2.8284271 2
# 4: 48 50 7.35 0.3535534 2
# 5: 0 10 7.60 1.4142136 2
# 6: 12 10 6.50 0.1414214 2
# 7: 24 10 5.60 1.4142136 2
# 8: 48 10 6.10 0.7071068 2
require(plyr)
# THIS COLLAPSES ON TIME
ddply(df, .(TIME), summarize, MEAN_DV=mean(DV), SD_DV=sd(DV), N=length(DV))
# THIS COLLAPSES ON TIME AND DDAY
ddply(df, .(TIME, DDAY), summarize, MEAN_DV=mean(DV), SD_DV=sd(DV), N=length(DV))
I have a data set of individuals with their socioeconomic scores, ranging from -6.3 to 3.5. Now I want to assign each individual to their quantiles based on their socioeconomic score.
I have a dataset named Healthdata with two columns: Healthdata$SSE, and Healthdata$ID.
Eventually, I would like to get a data frame matched by their SSE quantiles.
How can I do this in R?
Here's one approach:
# an example data set
set.seed(1)
Healthdata <- data.frame(SSE = rnorm(8), ID = gl(2, 4))
transform(Healthdata, quint = ave(SSE, ID, FUN = function(x) {
quintiles <- quantile(x, seq(0, 1, .2))
cuts <- cut(x, quintiles, include.lowest = TRUE)
quintVal <- quintiles[match(cuts, levels(cuts)) + 1]
return(quintVal)
}))
# SSE ID quint
# 1 -0.6264538 1 -0.4644344
# 2 0.1836433 1 0.7482983
# 3 -0.8356286 1 -0.7101237
# 4 1.5952808 1 1.5952808
# 5 0.3295078 2 0.3610920
# 6 -0.8204684 2 -0.1304827
# 7 0.4874291 2 0.5877873
# 8 0.7383247 2 0.7383247
A simple illustration of how it works:
values <- 1:10
# [1] 1 2 3 4 5 6 7 8 9 10
quintiles <- quantile(values, seq(0, 1, .2))
# 0% 20% 40% 60% 80% 100%
# 1.0 2.8 4.6 6.4 8.2 10.0
cuts <- cut(values, quintiles, include.lowest = TRUE)
# [1] [1,2.8] [1,2.8] (2.8,4.6] (2.8,4.6]
# [5] (4.6,6.4] (4.6,6.4] (6.4,8.2] (6.4,8.2]
# [9] (8.2,10] (8.2,10]
# 5 Levels: [1,2.8] (2.8,4.6] ... (8.2,10]
quintVal <- quintiles[match(cuts, levels(cuts)) + 1]
# 20% 20% 40% 40% 60% 60% 80% 80% 100% 100%
# 2.8 2.8 4.6 4.6 6.4 6.4 8.2 8.2 10.0 10.0
So let's start with a sample data set based on your description:
set.seed(315)
Healthdata <- data.frame(SSE = sample(-6.3:3.5, 21, replace=TRUE), ID = gl(7, 3))
Which gives something like this:
> Healthdata[1:15,]
SSE ID
1 -0.3 1
2 -6.3 2
3 -1.3 3
4 -3.3 4
5 -5.3 5
6 -4.3 6
7 -4.3 7
8 0.7 8
9 -4.3 9
10 -4.3 10
11 -3.3 11
12 0.7 12
13 -2.3 13
14 -3.3 14
15 0.7 15
I understand that you want a new variable which identifies the quantile group of the individual's socioeconomic status. I would do something like this:
transform(Healthdata, Q = cut(Healthdata$SSE,
breaks = quantile(Healthdata$SSE),
labels = c(1, 2, 3, 4),
include.lowest=TRUE))
To return:
SSE ID Q
1 -1.3 1 2
2 -6.3 2 1
3 -4.3 3 1
4 0.7 4 3
5 1.7 5 3
6 1.7 6 3
7 -5.3 7 1
8 1.7 8 3
9 2.7 9 4
10 -3.3 10 2
11 -1.3 11 2
12 -3.3 12 2
13 1.7 13 3
14 0.7 14 3
15 -4.3 15 1
If you want to see the upper and lower bounds for the quantile ranges, omit the labels = c(1, 2, 3, 4) to return this instead:
SSE ID Q
1 -1.3 1 (-4.3,-1.3]
2 -6.3 2 [-6.3,-4.3]
3 -4.3 3 [-6.3,-4.3]
4 0.7 4 (-1.3,1.7]
5 1.7 5 (-1.3,1.7]