Loops in custom R function to transform data

I use this code to create a sample dataframe of events:
set.seed(100)
mydf <- data.frame(time = 1:100,
                   status = sample(c('OK','UNKNOWN'), 1000, replace = TRUE),
                   event = sample(1:10, 1000, replace = TRUE))
The data looks like this:
head(mydf)
  time  status event
1    1      OK     1
2    2      OK     2
3    3 UNKNOWN     7
4    4      OK     7
5    5      OK     4
6    6 UNKNOWN     2
I would like to create a new dataset like this:
  StartTime EndTime SeqID Sequence
1         1       3     1  {1,2,7}
2         4       6     2  {7,4,2}
Essentially, I'd like to create a column named Sequence that is an array of the events, but I'd like to start over after the status column equals UNKNOWN. I've tried a for loop with a while loop, but with no success.

Here's a data.table solution:
library(data.table);
dt <- as.data.table(mydf);
dt[,.(StartTime=time[1L],EndTime=time[length(time)],Sequence=list(event)),.(SeqID=cumsum(status=='UNKNOWN')+1L)];
## SeqID StartTime EndTime Sequence
## 1: 1 1 2 1,2
## 2: 2 3 6 7,7,4,2
## 3: 3 7 8 1,5
## 4: 4 9 10 6,10
## 5: 5 11 11 4
## ---
## 513: 513 90 92 7,3,5
## 514: 514 93 93 2
## 515: 515 94 95 8,10
## 516: 516 96 99 3,2,3,1
## 517: 517 100 100 7
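To see why the groups break where they do, here is the grouping key in isolation (a minimal sketch with a made-up status vector): the counter increments on the UNKNOWN row itself, so each UNKNOWN starts a new group.
status <- c('OK','OK','UNKNOWN','OK','OK','UNKNOWN')
cumsum(status == 'UNKNOWN') + 1L
## [1] 1 1 2 2 2 3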
I believe you've made a mistake with your expected output. If the sequence starts over every time the status column is equal to UNKNOWN, then the first array should be 1,2 rather than 1,2,7.
Update: If you want the sequence to start over in the row after the status column equalled UNKNOWN, then you can do this:
dt[,.(StartTime=time[1L],EndTime=time[length(time)],Sequence=list(event)),.(SeqID=c(0L,cumsum(status[-length(status)]=='UNKNOWN'))+1L)];
## SeqID StartTime EndTime Sequence
## 1: 1 1 3 1,2,7
## 2: 2 4 7 7,4,2,1
## 3: 3 8 9 5,6
## 4: 4 10 11 10, 4
## 5: 5 12 12 2
## ---
## 512: 512 89 90 2,7
## 513: 513 91 93 3,5,2
## 514: 514 94 94 8
## 515: 515 95 96 10, 3
## 516: 516 97 100 2,3,1,7
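The shifted key in isolation (same made-up status vector, one element longer): dropping the last status before cumsum and prepending 0 moves everything down one row, so the new group starts on the row after the UNKNOWN.
status <- c('OK','OK','UNKNOWN','OK','OK','UNKNOWN','OK')
c(0L, cumsum(status[-length(status)] == 'UNKNOWN')) + 1L
## [1] 1 1 1 2 2 2 3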
Note that your expected output is still incorrect; the second group should be 7,4,2,1 rather than 7,4,2 under this design. Edit: Actually, I think perhaps the issue is with a discrepancy in mydf; I get this with your sample creation code:
head(mydf,10L);
## time status event
## 1 1 OK 1
## 2 2 OK 2
## 3 3 UNKNOWN 7
## 4 4 OK 7
## 5 5 OK 4
## 6 6 OK 2
## 7 7 UNKNOWN 1
## 8 8 OK 5
## 9 9 UNKNOWN 6
## 10 10 OK 10
Please try running your sample creation code again with the seed of 100. We should be getting the same result for mydf.
Here's a base R solution built around by():
with(list(SeqID=c(0L,cumsum(mydf$status[-nrow(mydf)]=='UNKNOWN'))+1L),
    do.call(rbind,by(cbind(mydf,SeqID),SeqID,function(x)
        data.frame(
            SeqID=x$SeqID[1L],
            StartTime=x$time[1L],
            EndTime=x$time[length(x$time)],
            Sequence=I(list(x$event))
        )
    ))
);
## SeqID StartTime EndTime Sequence
## 1 1 1 3 1, 2, 7
## 2 2 4 7 7, 4, 2, 1
## 3 3 8 9 5, 6
## 4 4 10 11 10, 4
## 5 5 12 12 2
##
## ... snip ...
##
## 512 512 89 90 2, 7
## 513 513 91 93 3, 5, 2
## 514 514 94 94 8
## 515 515 95 96 10, 3
## 516 516 97 100 2, 3, 1, 7
Benchmarking
library(data.table);
library(microbenchmark);
bgoldst1 <- function(dt) dt[,.(StartTime=time[1L],EndTime=time[length(time)],Sequence=list(event)),.(SeqID=c(0L,cumsum(status[-length(status)]=='UNKNOWN'))+1L)];
bgoldst2 <- function(mydf) with(list(SeqID=c(0L,cumsum(mydf$status[-nrow(mydf)]=='UNKNOWN'))+1L),do.call(rbind,by(cbind(mydf,SeqID),SeqID,function(x) data.frame(SeqID=x$SeqID[1L],StartTime=x$time[1L],EndTime=x$time[length(x$time)],Sequence=I(list(x$event))))));
lebatsnok <- function(mydf) { mydfs <- split(mydf, head(cumsum(c("", mydf$status) == "UNKNOWN"), -1)); res <- lapply(mydfs, function(x) data.frame(StartTime = x$time[1], EndTime = tail(x$time,1), SeqID = NA, Sequence = paste(x$event, collapse=","))); res <- do.call(rbind, res); res$SeqID <- seq_len(NROW(res)); res; };
set.seed(100L);
mydf <- data.frame(time=1:100,status=sample(c('OK','UNKNOWN'),1000L,T),event=sample(1:10,1000L,T),stringsAsFactors=F);
dt <- as.data.table(mydf);
ex <- as.data.frame(bgoldst1(dt)); o <- names(ex);
all.equal(ex,bgoldst2(mydf)[o],check.attributes=F);
## [1] TRUE
all.equal(transform(ex,Sequence=factor(sapply(Sequence,paste,collapse=','))),lebatsnok(mydf)[o],check.attributes=F);
## [1] TRUE
microbenchmark(bgoldst1(dt),bgoldst2(mydf),lebatsnok(mydf));
## Unit: milliseconds
## expr min lq mean median uq max neval
## bgoldst1(dt) 1.363785 1.671909 1.896345 1.839763 2.041828 3.900621 100
## bgoldst2(mydf) 217.960902 234.978058 244.491406 243.867674 251.392438 298.083774 100
## lebatsnok(mydf) 254.961413 273.434086 284.439844 283.864322 291.889867 337.319627 100

A base R solution (relies on stringsAsFactors being FALSE, so mydf is redefined):
set.seed(100)
mydf <- data.frame(time = 1:100,
                   status = sample(c('OK','UNKNOWN'), 1000, replace = TRUE),
                   event = sample(1:10, 1000, replace = TRUE),
                   stringsAsFactors = FALSE)
mydfs <- split(mydf, head(cumsum(c("", mydf$status) == "UNKNOWN"), -1))
res <- lapply(mydfs, function(x)
    data.frame(StartTime = x$time[1],
               EndTime = tail(x$time, 1),
               SeqID = NA,
               Sequence = paste(x$event, collapse = ",")))
res <- do.call(rbind, res)
res$SeqID <- seq_len(NROW(res))
head(res)
# StartTime EndTime SeqID Sequence
# 0 1 3 1 1,2,7
# 1 4 7 2 7,4,2,1
# 2 8 9 3 5,6
# 3 10 11 4 10,4
# 4 12 12 5 2
# 5 13 15 6 10,1,8
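The grouping vector here is the same shifted-cumsum idea in yet another form (a minimal sketch): prepending "" offsets the comparison by one element and head(..., -1) trims the extra value, so the counter steps up on the row after an UNKNOWN.
status <- c("OK", "OK", "UNKNOWN", "OK")
head(cumsum(c("", status) == "UNKNOWN"), -1)
## [1] 0 0 0 1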

Related

looping based on same values in one column and storing the results in a new dataframe [duplicate]

To explain my problem, I have created the following df:
hh_01 <- c(rep(1:4, each = 3), rep(5:10, each = 5))
vill <- c(rep(100, 12), rep(101, 30))
hh_02 <- c(2:4, 1, 3, 4, 1:2, 4, 1:3, 6:10, 5, 7:10, 5:6, 8:10, 5:7, 9:10, 5:8, 10, 5:9)
set.seed(1); dist <- abs(rnorm(42, mean = 0, sd = 1000))
df <- matrix(c(hh_01, vill, hh_02, dist), nrow = 42, ncol = 4)
colnames(df) <- c("hh_01", "vill", "hh_02", "dist")
df <- as.data.frame(df)
df
hh_01 vill hh_02 dist
1 1 100 2 1728.39791
2 1 100 3 979.05280
3 1 100 4 972.09301
4 2 100 1 461.72457
5 2 100 3 384.84236
6 2 100 4 523.10665
7 3 100 1 482.88891
8 3 100 2 218.27501
9 3 100 4 878.32424
10 4 100 1 41.75679
11 4 100 2 967.72103
12 4 100 3 661.80881
13 5 101 6 851.74364
14 5 101 7 852.48595
15 5 101 8 471.51824
16 5 101 9 862.90742
17 5 101 10 750.57410
18 6 101 5 1714.03797
19 6 101 7 93.43975
20 6 101 8 640.15912
21 6 101 9 601.66437
22 6 101 10 969.44271
23 7 101 5 77.95871
24 7 101 6 604.71114
25 7 101 8 169.18386
26 7 101 9 435.42663
27 7 101 10 604.22278
28 8 101 5 475.18935
29 8 101 6 13.09895
30 8 101 7 2873.04565
31 8 101 9 1019.03810
32 8 101 10 41.51445
33 9 101 5 914.63453
34 9 101 6 67.62432
35 9 101 7 85.45653
36 9 101 8 971.21044
37 9 101 10 2074.87280
38 10 101 5 98.43913
39 10 101 6 437.63773
40 10 101 7 620.47573
41 10 101 8 376.56226
42 10 101 9 1013.93106
My task: for all rows with the same hh_01 value, calculate the mean of dist and save the result in a new df with the following structure:
hh_01 vill mean_dist
1 100 1226.515
2 100 .......
I know I have to use a for loop (or maybe sapply/lapply), but I don't know how to finish this command...
for (i in seq(along=df[,df$hh_01])){
ifelse(df$hh_01[i] == df$hh_01[i+1])
}
I know these are programming basics (not just in R), but I'm not a programmer and I'm pretty new to this area.
I would appreciate any help.
The simpler the code the better for me (please with short explanation). I would like to understand this kind of looping (or looping in general) because I have to work with this type of questions very often in the future.
Thank you very much.
You can also use aggregate:
dfnew <- aggregate(df[c("hh_01","vill","dist")], by = list(df$hh_01), mean)[-1]
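A sketch of a close variant: aggregate's formula interface produces the same grouped means, and a rename afterwards (my addition, not part of the original answer) matches the header asked for:
dfnew <- aggregate(dist ~ hh_01 + vill, data = df, FUN = mean)
names(dfnew)[3] <- "mean_dist"  # rename dist to the desired header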
The dplyr package is a great help here.
library(dplyr)
new_df <- group_by(df, hh_01, vill)
new_df <- summarize(new_df, mean_dist=mean(dist))
Example output:
hh_01 vill mean_dist
<dbl> <dbl> <dbl>
1 1 100 666.0538
2 2 100 720.5532
A great dplyr cheatsheet is found here: http://nbviewer.jupyter.org/github/rstudio/cheatsheets/blob/master/source/pdfs/data-transformation-cheatsheet.pdf
summarize is a vectorized function: it takes care of efficient looping for you.
Here is a version using the dplyr package, although I get a different result from you. One of the important characteristics of R is that many functions are vectorized, which loosely means they can operate on a whole structure without an explicit for or apply construct (the for or apply is hidden within the function). Note also the simplified way to create a data frame.
set.seed(123)
df <- data.frame(
hh_01 = c(rep(1:4, each = 3), rep(5:10, each = 5)),
vill = c(rep(100, 12), rep(101, 30)),
hh_02 = c(2:4, 1, 3, 4, 1:2, 4, 1:3, 6:10, 5, 7:10, 5:6, 8:10, 5:7, 9:10, 5:8, 10, 5:9),
dist = abs(rnorm(42, mean = 0, sd = 1000))
)
library(dplyr)
df2 <- df %>%
group_by(hh_01, vill) %>%
summarize(mean_dist = mean(dist))
df2
# hh_01 vill mean_dist
# <int> <dbl> <dbl>
# 1 1 100 1265.9534
# 2 2 100 855.2477
# 3 3 100 840.0750
# 4 4 100 876.0722
# 5 5 101 574.8193
# 6 6 101 559.2385
# 7 7 101 1177.1751
# 8 8 101 765.6921
# 9 9 101 438.8936
# 10 10 101 331.3354

Efficient method to convert set of ranges in a dataframe to frequency of individual elements?

I am working in R. I have a data frame which contains start and end positions on a chromosome (where each integer represents a coordinate on the chromosome). Ex:
start end
1 5
3 7
4 10
12 7 (inverted is also allowed)
8 15
What I want is to count how many times a coordinate is present in all these ranges. So, for the above example, the output would be this:
position count
1 1
2 1
3 2
4 3
5 3
6 2
7 3
8 3
9 3
10 3
11 2
12 2
13 1
14 1
15 1
I have 62,000+ such ranges, where each range is at least 1000 positions long. I know how to do this conversion, but I don't know how to do it efficiently, that is, within a couple of seconds.
Current (inefficient) code:
positions <- c()
for (i in seq(nrow(a))) {
    positions <- c(positions, seq(a[i,3], a[i,4]))
}
table(positions)
"a" is my data-frame and the start and end coordinates are in the third and forth column respectively.
One of the columns in the data frame contains characters, so to use apply I would either need to create a new data frame (consuming extra space) or convert to integers inside the apply function (extra time). Sorry for not mentioning this earlier.
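One way around this, sketched under the assumption that the start and end coordinates really sit in columns 3 and 4 of a: hand apply() only those two numeric columns, so its implicit as.matrix() stays numeric instead of coercing everything to character.
# Subset to the numeric columns first so apply() builds a numeric matrix
positions <- unlist(apply(a[, 3:4], 1, function(x) x[1]:x[2]))
table(positions)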
For very fast data.table code, see the answer from docendo discimus (which includes its own benchmark).
Here is the benchmark of some other solutions:
set.seed(42)
N <- 1000
df <- data.frame(start=sample.int(10*N, N))
df$end <- df$start + sample(3:20, N, rep=TRUE)
library("microbenchmark")
microbenchmark(unit = "relative",
ori = { positions <- c()
for(i in seq(nrow(df))){
positions <- c(positions, seq(df[i,1], df[i,2]))
}
table(positions) },
a = table(unlist(apply(df, 1, function(x) x[1]:x[2]))), # my solution, similar: KenS, EricSchutte
m1 = table(unlist(mapply(seq, df$start, df$end))), # my variant of Sotos' solution
m2 = table(unlist(mapply(':', df$start, df$end))), # my variant of Sotos' solution
M1 = table(unlist(Map(seq, df$start, df$end))), # my variant of Sotos' solution
M2 = table(unlist(Map(':', df$start, df$end))), # Sotos
l = table(unlist(lapply(seq_len(nrow(df)), function(i) seq(df$start[i], df$end[i])))), # lmo
t = { temp <- unlist(lapply(seq_len(nrow(df)), function(i) seq(df$start[i], df$end[i]))) # lmo tabulate()
cbind(sort(unique(temp)), tabulate(temp)) },
d = table(do.call(c, mapply(seq, df$start, df$end))), # #989 (comment to the answer from Sotos)
dd = table(do.call(c, mapply(seq.int, df$start, df$end))), # docendo discimus (comment to this answer)
f = { pos <- data.frame(x=(min(df):max(df)),n=0) # Andrew Gustar
for(i in seq_along(df$start)){
low=min(df$start[i])-pos$x[1]+1
high=max(df$end[i])-pos$x[1]+1
pos$n[low:high] <- pos$n[low:high]+1
} }
)
# Unit: relative
# expr min lq mean median uq max neval cld
# ori 7.163767 7.219099 7.573688 7.379160 7.912435 7.899586 100 e
# a 1.194627 1.194855 1.211432 1.209485 1.213118 1.711994 100 a
# m1 1.645659 1.660294 1.711141 1.686973 1.710461 2.217141 100 b
# m2 1.005302 1.007125 1.017115 1.009618 1.017207 1.576201 100 a
# M1 1.642688 1.645174 1.733173 1.673924 1.686253 2.218028 100 b
# M2 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 100 a
# l 3.487924 3.512732 3.801530 3.665725 4.188701 4.216375 100 d
# t 2.670636 2.711345 2.961449 2.869190 3.066150 3.745984 100 c
# d 1.652376 1.650798 1.721377 1.665901 1.712064 2.187129 100 b
# dd 1.040941 1.045652 1.060601 1.047534 1.053305 1.592163 100 a
# f 8.287098 8.486854 9.052884 9.046376 9.126318 25.210722 100 f
The solution with tabulate() produces warnings: tabulate(temp) returns one count per bin from 1 to max(temp), while sort(unique(temp)) only contains the positions that actually occur, so when the lengths differ cbind() recycles and warns.
One idea,
as.data.frame(table(unlist(Map(`:`, df$start, df$end))))
# Var1 Freq
#1 1 1
#2 2 1
#3 3 2
#4 4 3
#5 5 3
#6 6 2
#7 7 3
#8 8 3
#9 9 3
#10 10 3
#11 11 2
#12 12 2
#13 13 1
#14 14 1
#15 15 1
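If you want the position/count headers from the question, a small cosmetic follow-up (the renaming and factor conversion are my additions):
res <- as.data.frame(table(unlist(Map(`:`, df$start, df$end))))
names(res) <- c("position", "count")
res$position <- as.integer(as.character(res$position))  # table() returns factor levels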
This is roughly the same algorithm you are using, but should be faster.
myNums <- unlist(lapply(seq_len(nrow(df)), function(i) seq(df$start[i], df$end[i])))
table(myNums)
myNums
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
1 1 2 3 3 2 3 3 3 3 2 2 1 1 1
An even faster method would be to use tabulate rather than table. For example,
temp <- unlist(lapply(seq_len(nrow(df)), function(i) seq(df$start[i], df$end[i])))
cbind(sort(unique(temp)), tabulate(temp))
which returns the matrix
[,1] [,2]
[1,] 1 1
[2,] 2 1
[3,] 3 2
[4,] 4 3
[5,] 5 3
[6,] 6 2
[7,] 7 3
[8,] 8 3
[9,] 9 3
[10,] 10 3
[11,] 11 2
[12,] 12 2
[13,] 13 1
[14,] 14 1
[15,] 15 1
runs about 50% faster for the given data set.
Unit: microseconds
expr min lq mean median uq max neval cld
table 223.233 237.6305 250.0329 245.8985 253.4545 423.944 100 b
tabulate 142.835 159.0860 166.9775 167.3540 175.7650 195.009 100 a
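A sketch of a safer pairing for the tabulate() approach, reusing temp from above: tabulate(temp) counts every bin from 1 to max(temp), so indexing those counts by the observed positions keeps the two columns aligned even when some intermediate position never occurs.
vals <- sort(unique(temp))
cbind(position = vals, count = tabulate(temp)[vals])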
I'll propose a data.table solution since we're interested in performance. The approach looks like this:
library(data.table)
setDT(df)
df[, list(seq.int(start, end)), by = 1:nrow(df)][, .N, by = V1]
And it performs very well compared to the other solutions despite by-row operation.
Here's a benchmark on 1e4 rows:
set.seed(42)
N <- 1e4
vals = 1:100
df <- data.frame(start=sample(vals, N, replace = TRUE), end = sample(vals, N, replace = TRUE))
library(data.table)
library("microbenchmark")
dt <- copy(df)
setDT(dt)
microbenchmark(unit = "relative", times = 10,
jogo = table(unlist(Map(seq, df$start, df$end))), # jogo
sotos = table(unlist(Map(':', df$start, df$end))), # Sotos
lmo = table(unlist(lapply(seq_len(nrow(df)), function(i) seq(df$start[i], df$end[i])))), # lmo
orig_989 = table(do.call(c, mapply(seq, df$start, df$end))), # #989 (comment to the answer from Sotos)
mod_989 = table(do.call(c, mapply(seq.int, df$start, df$end))), # docendo discimus (comment to this answer)
dd = dt[, list(seq.int(start, end)), by = 1:nrow(dt)][, .N, by = V1]
)
Unit: relative
expr min lq mean median uq max neval cld
jogo 8.794179 8.735461 19.226146 8.584978 8.637774 52.782168 10 ab
sotos 10.669810 10.623685 8.984351 10.437937 10.164045 4.846189 10 ab
lmo 21.319154 21.117393 27.452902 22.558436 22.913901 43.403024 10 b
orig_989 9.190209 8.725191 7.532509 8.730023 8.516305 3.948500 10 ab
mod_989 5.372087 5.097636 5.067462 5.305532 6.214493 3.188091 10 ab
dd 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 10 a
I create a sequence for every line in df, e.g. c(1,2,3,4,5) for the first line, by using:
all.pos <- apply(df, 1, function(x){x[1]:x[2]})
all.pos <- unlist(all.pos)
# 1 2 3 4 5 3 4 5 6 7 4 5 6 7 8 9 10 12 11 10 9 8 7 8 9
# 10 11 12 13 14 15
table() will count how often every position occurs in all.pos.
table(all.pos)
# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
# 1 1 2 3 3 2 3 3 3 3 2 2 1 1 1
Another approach, avoiding table, looping through the ranges and adding 1 to the relevant values each time...
ranges <- data.frame(start = c(1,3,4,12,8), end = c(5,7,10,7,15))
pos <- data.frame(x = min(ranges):max(ranges), n = 0)
for (i in seq_along(ranges$start)) {
    low <- min(ranges$start[i]) - pos$x[1] + 1
    high <- max(ranges$end[i]) - pos$x[1] + 1
    pos$n[low:high] <- pos$n[low:high] + 1
}
pos
x n
1 1 1
2 2 1
3 3 2
4 4 3
5 5 3
6 6 2
7 7 3
8 8 3
9 9 3
10 10 3
11 11 2
12 12 2
13 13 1
14 14 1
15 15 1
First thing that came up; it might not be the best, but using apply should make things significantly faster.
df <- data.frame(start = c(1,3,4,12,8), end = c(5,7,10,7,15))
positions <- apply(df, 1, function(x) {
    seq(x[1], x[2])
})
table(unlist(positions))
yields:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
1 1 2 3 3 2 3 3 3 3 2 2 1 1 1

remove rows based on subtraction results

I have a large data set like this:
df <- data.frame(group = c(rep(1, 6), rep(5, 6)), score = c(30, 10, 22, 44, 6, 5, 20, 35, 2, 60, 14, 5))
group score
1 1 30
2 1 10
3 1 22
4 1 44
5 1 6
6 1 5
7 5 20
8 5 35
9 5 2
10 5 60
11 5 14
12 5 5
...
I want to subtract each pair of neighboring scores within each group; if the difference is greater than 30, remove the smaller score. For example, within group 1: 30-10=20<30, 10-22=-12<30, 22-44=-22<30, 44-6=38>30 (remove 6), 44-5=39>30 (remove 5)... The expected output should look like this:
group score
1 1 30
2 1 10
3 1 22
4 1 44
5 5 20
6 5 35
7 5 60
...
Does anyone have an idea how to achieve this?
Like this?
repeat {
    df$diff <- unlist(by(df$score, df$group, function(x) c(0, -diff(x))))
    if (all(df$diff < 30)) break
    df <- df[df$diff < 30, ]
}
df$diff <- NULL
df
# group score
# 1 1 30
# 2 1 10
# 3 1 22
# 4 1 44
# 7 5 20
# 8 5 35
# 10 5 60
This seems to require an iterative approach, because the "neighboring score" changes after removal of a row. So before you remove 6, the difference 44 - 6 > 30, but 6 - 5 < 30. After you remove 6, the difference 44 - 5 > 30.
So this calculates the difference between successive rows by group (using by(...) and diff(...)), removes the appropriate rows, then repeats the process until all differences are < 30.
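To make the iteration concrete, here is the first pass of that difference column on the df from the question (a minimal sketch; the per-group values are written out as comments):
unlist(by(df$score, df$group, function(x) c(0, -diff(x))))
## group 1: 0  20 -12 -22  38   1   (38 > 30: drop the row with score 6)
## group 5: 0 -15  33 -58  46   9   (33 and 46 > 30: drop scores 2 and 14)
# The removals change the neighbours, so a later pass also drops score 5.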
It's not elegant but it should work:
out <- data.frame(group = numeric(), score = numeric())
# cycle through the groups
for (g in levels(as.factor(df$group))) {
    temp <- subset(df, df$group == g)
    # now go through the scores
    left <- temp$score[1]
    for (s in seq(2, length(temp$score))) {
        if (left - temp$score[s] > 30) { # test the condition
            temp$score[s] <- NA
        } else {
            left <- temp$score[s] # if the condition is not met, update the reference value
        }
    }
    # add only the rows without NAs to the output
    out <- rbind(out, temp[which(!is.na(temp$score)), ])
}
There should be a way to do this using ave, but carrying the last kept value forward when the diff > 30 removes the next row is tricky! I'd appreciate a more elegant solution if there is one.
You can try
df
## group score
## 1 1 30
## 2 1 10
## 3 1 22
## 4 1 44
## 5 1 6
## 6 1 5
## 7 5 20
## 8 5 35
## 9 5 2
## 10 5 60
## 11 5 14
## 12 5 5
tmp <- df[!unlist(tapply(df$score, df$group, FUN = function(x) c(F, -diff(x) > 30), simplify = T)), ]
while (!identical(df, tmp)) {
df <- tmp
tmp <- df[!unlist(tapply(df$score, df$group, FUN = function(x) c(F, -diff(x) > 30), simplify = T)), ]
}
tmp
## group score
## 1 1 30
## 2 1 10
## 3 1 22
## 4 1 44
## 7 5 20
## 8 5 35
## 10 5 60

Using one data frame to sum a range of data from another data frame in R

I'm migrating from SAS to R. I need help figuring out how to sum up weather data for date ranges. In SAS, I take the date ranges, use a data step to create a record for every date (with startdate, enddate, date) in the range, merge with weather and then summarize (VAR hdd cdd; CLASS=startdate enddate sum=) to sum up the values for the date range.
R code:
startdate <- c(100,103,107)
enddate <- c(105,104,110)
billperiods <- data.frame(startdate, enddate)
to get:
> billperiods
startdate enddate
1 100 105
2 103 104
3 107 110
R code:
weatherdate <- c(100:103,105:110)
hdd <- c(0,0,4,5,0,0,3,1,9,0)
cdd <- c(4,1,0,0,5,6,0,0,0,10)
weather <- data.frame(weatherdate,hdd,cdd)
to get:
> weather
weatherdate hdd cdd
1 100 0 4
2 101 0 1
3 102 4 0
4 103 5 0
5 105 0 5
6 106 0 6
7 107 3 0
8 108 1 0
9 109 9 0
10 110 0 10
Note: weatherdate = 104 is missing. I may not have weather for a day.
I can't figure out how to get to:
> billweather
startdate enddate sumhdd sumcdd
1 100 105 9 10
2 103 104 5 0
3 107 110 13 10
where sumhdd is the sum of the hdd's from startdate to enddate in the weather data.frame.
Any ideas?
Here's a method using IRanges and data.table. For this question, this answer may seem like overkill, but in general I find it convenient to use IRanges to deal with intervals, however simple they may be.
# load packages
require(IRanges)
require(data.table)
# convert data.frames to data.tables
dt1 <- data.table(billperiods)
dt2 <- data.table(weather)
# construct Ranges to get overlaps
ir1 <- IRanges(dt1$startdate, dt1$enddate)
ir2 <- IRanges(dt2$weatherdate, width=1) # start = end
# find Overlaps
olaps <- findOverlaps(ir1, ir2)
# Hits of length 10
# queryLength: 3
# subjectLength: 10
# queryHits subjectHits
# <integer> <integer>
# 1 1 1
# 2 1 2
# 3 1 3
# 4 1 4
# 5 1 5
# 6 2 4
# 7 3 7
# 8 3 8
# 9 3 9
# 10 3 10
# get billweather (final output)
billweather <- cbind(dt1[queryHits(olaps)],
                     dt2[subjectHits(olaps),
                         list(hdd, cdd)])[, list(sumhdd = sum(hdd),
                                                 sumcdd = sum(cdd)),
                                          by = list(startdate, enddate)]
# startdate enddate sumhdd sumcdd
# 1: 100 105 9 10
# 2: 103 104 5 0
# 3: 107 110 13 10
Code breakdown for the last line: first, using queryHits, subjectHits, and cbind, I construct an intermediate data.table; from it I then group by startdate and enddate and take the sum of hdd and the sum of cdd. The line is easier to follow when split in two, as shown below.
# split for easier understanding
billweather <- cbind(dt1[queryHits(olaps)],
                     dt2[subjectHits(olaps), list(hdd, cdd)])
billweather <- billweather[, list(sumhdd = sum(hdd),
                                  sumcdd = sum(cdd)),
                           by = list(startdate, enddate)]
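For completeness, a sketch of the same overlap join using data.table's own foverlaps() instead of IRanges (my construction, not part of the original answer; it assumes data.table 1.9.4 or later, where foverlaps() was introduced):
dtw <- data.table(weather)
dtw[, `:=`(start = weatherdate, end = weatherdate)]  # point ranges: start == end
dtb <- data.table(billperiods)
setkey(dtb, startdate, enddate)  # the key marks the interval columns
ov <- foverlaps(dtw, dtb, by.x = c("start", "end"), nomatch = 0L)
ov[, list(sumhdd = sum(hdd), sumcdd = sum(cdd)), by = list(startdate, enddate)]
##    startdate enddate sumhdd sumcdd
## 1:       100     105      9     10
## 2:       103     104      5      0
## 3:       107     110     13     10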
A base R alternative: for each billing period, subset weather to the dates in range and sum the two columns.
cbind(billperiods, t(sapply(apply(billperiods, 1, function(x)
    weather[weather$weatherdate >= x[1] &
            weather$weatherdate <= x[2], c("hdd", "cdd")]), colSums)))
startdate enddate hdd cdd
1 100 105 9 10
2 103 104 5 0
3 107 110 13 10
A similar apply() variant, matching dates with %in%:
billweather <- cbind(billperiods,
                     t(apply(billperiods, 1, function(x) {
                         colSums(weather[weather[, 1] %in% c(x[1]:x[2]), 2:3])
                     })))

getting a sample of a data.frame in R

I have the following data frame in R:
id<-c(1,2,3,4,10,2,4,5,6,8,2,1,5,7,7)
date<-c(19970807,19970902,19971010,19970715,19991212,19961212,19980909,19990910,19980707,19991111,19970203,19990302,19970605,19990808,19990706)
spent<-c(1997,19,199,134,654,37,876,890,873,234,643,567,23,25,576)
df <- data.frame(id, date, spent)
I need to take a random sample of 3 customers (based on id) in such a way that all observations of the sampled customers are extracted.
You want to use %in% and unique:
df[df$id %in% sample(unique(df$id),3),]
## id date spent
## 4 4 19970715 134
## 7 4 19980909 876
## 8 5 19990910 890
## 10 8 19991111 234
## 13 5 19970605 23
Using data.table to avoid $ referencing
library(data.table)
DT <- data.table(df)
DT[id %in% sample(unique(id),3)]
## id date spent
## 1: 1 19970807 1997
## 2: 4 19970715 134
## 3: 4 19980909 876
## 4: 1 19990302 567
## 5: 7 19990808 25
## 6: 7 19990706 576
This ensures that you are always evaluating the expressions within the data.table.
Use something like:
df[sample(df$id, 3), ]
# id date spent
# 1 1 19970807 1997
# 5 10 19991212 654
# 8 5 19990910 890
Of course, your samples would be different.
Update
If you want unique customers, you can aggregate first.
df2 = aggregate(list(date = df$date, spent = df$spent), list(id = df$id), c)
df2[sample(df2$id, 3), ]
# id date spent
# 4 4 19970715, 19980909 134, 876
# 5 5 19990910, 19970605 890, 23
# 8 8 19991111 234
Or, an option without aggregate:
df[df$id %in% sample(unique(df$id), 3), ]
# id date spent
# 1 1 19970807 1997
# 3 3 19971010 199
# 12 1 19990302 567
# 14 7 19990808 25
# 15 7 19990706 576
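If you need the same three customers on every run, a small addition (the seed value is arbitrary): fix the RNG seed before sampling.
set.seed(42)  # any fixed seed makes the draw repeatable
df[df$id %in% sample(unique(df$id), 3), ]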
