I have a list of dataframes in R, each loaded from a different file containing ranks. For example the files could contain finishing positions for different athletes in different races.
The same element (athlete) can appear in more than one dataframe (race) but no dataframe will necessarily contain all elements.
I would like to populate a matrix of rankings with athletes as rows and races as columns; where there is no ranking for an athlete in a particular race, it should read 0.
For example, if I have:
[[1]]
name rank
1 Alice 1
2 Bob 2
3 Carla 3
4 Diego 4
[[2]]
name rank
1 Alice 2
2 Carla 1
3 Eric 3
4 Frank 4
5 Gary 5
[[3]]
name rank
1 Bob 5
2 Carla 4
3 Diego 3
4 Eric 1
5 Gary 2
I would like to generate a matrix:
1 2 3
Alice 1 2 0
Bob 2 0 5
Carla 3 1 4
Diego 4 0 3
Eric 0 3 1
Frank 0 4 0
Gary 0 5 2
I am looking for an efficient way to do this: my data is more like 200 dataframes with 10,000 ranked elements each (15,000 unique elements in total), so the final matrix will be approximately 15000x200.
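For concreteness, the list might be built like this (a sketch; the directory name and CSV format are assumptions, not part of my actual setup):
files <- list.files("races", pattern = "\\.csv$", full.names = TRUE)
ll <- lapply(files, read.csv)  # each file holds two columns: name and rank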
Here's a solution using the reshape2 package:
require(reshape2)
dcast(do.call(rbind, lapply(seq_along(ll), function(ix)
transform(ll[[ix]], id = ix))), name ~ id, value.var="rank", fill=0)
name 1 2 3
1 Alice 1 2 0
2 Bob 2 0 5
3 Carla 3 1 4
4 Diego 4 0 3
5 Eric 0 3 1
6 Frank 0 4 0
7 Gary 0 5 2
where ll is your list of data.frames.
Or equivalently:
dcast(transform(do.call(rbind, ll), id = rep(seq_along(ll), sapply(ll, nrow))),
name ~ id, value.var = "rank", fill = 0)
A data.table solution:
require(data.table)
pp <- rbindlist(ll)[, id := rep(seq_along(ll), sapply(ll, nrow))]
setkey(pp, "name", "id")
pp[CJ(unique(name), 1:3)][is.na(rank), rank := 0L][, as.list(rank), by = name]
name V1 V2 V3
1: Alice 1 2 0
2: Bob 2 0 5
3: Carla 3 1 4
4: Diego 4 0 3
5: Eric 0 3 1
6: Frank 0 4 0
7: Gary 0 5 2
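Note that the 1:3 inside CJ is hard-coded for the three races of the example; for an arbitrary list it can be replaced with seq_along(ll):
pp[CJ(unique(name), seq_along(ll))][is.na(rank), rank := 0L][, as.list(rank), by = name]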
Some benchmarking (now that we have quite a few answers):
names <- tapply(sample(letters, 1e4, replace=TRUE), rep(1:(1e4/5), each=5), paste, collapse="")
names <- unique(names)
dd_create <- function() {
nrow <- sample(c(100:500), 1)
ncol <- 3
data.frame(name = sample(names, nrow, replace=FALSE), rank = sample(nrow))
}
ll <- replicate(1e3, dd_create(), simplify = FALSE)
require(reshape2)
require(data.table)
Arun1_reshape2 <- function(ll) {
# same as @agstudy's
dcast(do.call(rbind, lapply(seq_along(ll), function(ix)
transform(ll[[ix]], id = ix))), name ~ id, value.var="rank", fill=0)
}
Arun2_reshape2 <- function(ll) {
dcast(transform(do.call(rbind, ll), id = rep(seq_along(ll), sapply(ll, nrow))),
name ~ id, value.var = "rank", fill = 0)
}
eddi_reshape2 <- function(ll) {
dcast(melt(ll, id.vars = 'name'), name ~ L1, fill = 0)
}
Arun_data.table <- function(ll) {
pp <- rbindlist(ll)[, id := rep(seq_along(ll), sapply(ll, nrow))]
setkey(pp, "name", "id")
pp[CJ(unique(name), 1:1000)][is.na(rank), rank := 0L][, as.list(rank), by = name]
}
merge.all <- function(x, y) {
merge(x, y, all=TRUE, by="name")
}
Hong_Ooi <- function(ll) {
for(i in seq_along(ll))
names(ll[[i]])[2] <- paste0("rank", i)
out <- Reduce(merge.all, ll)
}
require(microbenchmark)
microbenchmark( arun1 <- Arun1_reshape2(ll),
arun2 <- Arun2_reshape2(ll),
eddi <- eddi_reshape2(ll),
hong <- Hong_Ooi(ll),
arun.dt <- Arun_data.table(ll), times=10)
Unit: seconds
expr min lq median uq max neval
arun1 <- Arun1_reshape2(ll) 9.157160 9.177143 9.366775 9.715767 28.043125 10
arun2 <- Arun2_reshape2(ll) 8.408356 8.437066 8.494233 9.018796 10.075029 10
eddi <- eddi_reshape2(ll) 8.056605 8.314110 8.402396 8.474129 9.124581 10
hong <- Hong_Ooi(ll) 82.457432 82.716930 82.908646 108.413217 321.164598 10
arun.dt <- Arun_data.table(ll) 2.006474 2.123331 2.212783 2.311619 2.738914 10
Here's a simpler reshape2 solution:
library(reshape2)
dcast(melt(ll, id.vars = 'name'), name ~ L1, fill = 0)
# name 1 2 3
#1 Alice 1 2 0
#2 Bob 2 0 5
#3 Carla 3 1 4
#4 Diego 4 0 3
#5 Eric 0 3 1
#6 Frank 0 4 0
#7 Gary 0 5 2
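The result is a data.frame; since the question asks for a matrix with athletes as row names, one extra step is needed (a sketch):
res <- dcast(melt(ll, id.vars = 'name'), name ~ L1, fill = 0)
m <- as.matrix(res[-1])   # drop the name column
rownames(m) <- res$name   # athletes as row names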
Arun's benchmarks were pretty interesting, and it seems like what data.table does really well is the melting part, and what reshape2 does really well is the dcast, so here's the best of both worlds:
library(reshape2)
library(data.table)
pp = rbindlist(ll)[, id := rep(seq_along(ll), sapply(ll, nrow))]
dcast(pp, name ~ id, fill = 0, value.var = 'rank')
Using Arun's benchmark data:
names <- tapply(sample(letters, 1e4, replace=TRUE), rep(1:(1e4/5), each=5), paste, collapse="")
names <- unique(names)
dd_create <- function() {
nrow <- sample(c(100:500), 1)
ncol <- 3
data.frame(name = sample(names, nrow, replace=FALSE), rank = sample(nrow))
}
ll <- replicate(1e3, dd_create(), simplify = FALSE)
Arun_data.table <- function(ll) {
pp <- rbindlist(ll)[, id := rep(seq_along(ll), sapply(ll, nrow))]
setkey(pp, "name", "id")
pp[CJ(unique(name), 1:1000)][is.na(rank), rank := 0L][, as.list(rank), by = name]
}
mix_of_both = function(ll) {
pp = rbindlist(ll)[, id := rep(seq_along(ll), sapply(ll, nrow))]
dcast(pp, name ~ id, fill = 0, value.var = 'rank')
}
require(microbenchmark)
microbenchmark(Arun_data.table(ll), mix_of_both(ll), times = 10)
# Unit: milliseconds
# expr min lq median uq max neval
# Arun_data.table(ll) 2568.333 2586.0079 2626.7704 2832.8076 2911.1314 10
# mix_of_both(ll) 615.166 739.9383 766.8994 788.5822 821.0478 10
Here's the data, since the OP didn't give a reproducible example:
dput(ll)
list(structure(list(name = structure(1:4, .Label = c("Alice",
"Bob", "Carla", "Diego"), class = "factor"), rank = 1:4), .Names = c("name",
"rank"), class = "data.frame", row.names = c("1", "2", "3", "4"
)), structure(list(name = structure(1:5, .Label = c("Alice",
"Carla", "Eric", "Frank", "Gary"), class = "factor"), rank = c(2L,
1L, 3L, 4L, 5L)), .Names = c("name", "rank"), class = "data.frame", row.names = c("1",
"2", "3", "4", "5")), structure(list(name = structure(1:5, .Label = c("Bob",
"Carla", "Diego", "Eric", "Gary"), class = "factor"), rank = c(5L,
4L, 3L, 1L, 2L)), .Names = c("name", "rank"), class = "data.frame", row.names = c("1",
"2", "3", "4", "5")))
Nearly the same solution as @Arun's, but in two separate steps:
## add race column
ll <- lapply(seq_along(ll),function(x){
ll[[x]]$race <- x
ll[[x]]
})
## create a long data.frame
dd <- do.call(rbind,ll)
## transform to the wide format
library(reshape2)
dcast(dd, name ~ race, fill = 0, value.var = 'rank')
name 1 2 3
1 Alice 1 2 0
2 Bob 2 0 5
3 Carla 3 1 4
4 Diego 4 0 3
5 Eric 0 3 1
6 Frank 0 4 0
7 Gary 0 5 2
Another Reduce use case, it seems.
merge.all <- function(x, y)
merge(x, y, all=TRUE, by="name")
# to avoid problems with merged name clashes
for(i in seq_along(ll))
names(ll[[i]])[2] <- paste0("rank", i)
out <- Reduce(merge.all, ll)
You'll have to modify your data frames slightly to avoid merge complaining about name collisions; a for loop works as well as anything for this purpose.
Any missing races will have NA. You can replace them with 0 by out[is.na(out)] <- 0; you should ask yourself whether this is sensible though. For example, if you do this, then simple summary statistics like means, variances etc will give misleading results. The same holds if you want to do any more complicated modelling. By contrast, most R modelling functions will be smart enough to exclude NAs.
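For instance, a small illustration on the merged out from above (rank1..rank3 are the renamed columns):
rowMeans(out[-1], na.rm = TRUE)                 # mean rank over races actually run
rowMeans(replace(out[-1], is.na(out[-1]), 0))   # zeros drag every mean down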
Related
I am doing some operations on a data.table and getting a result. So far so good. Next, I want the result to also show the sums across some columns, but I can't get that to work.
I filter my table by rows where x1=1, and compute a metric by Group1:
dt[x1 == 1, .N, by = "Group1"][
  , "%" := round(N / sum(N) * 100, 0)][]
giving
Group1 N %
1: 2 6 40
2: 1 6 40
3: 3 2 13
4: 5 1 7
I would just like to add a row to the above table that gives the sum across all columns.
I can just do
colSums(.Last.value)
and get the answer in the console, but what if I wanted to just append a new row to the above table itself, something like:
Group1 N %
1: 2 6 40
2: 1 6 40
3: 3 2 13
4: 5 1 7
ColSum: -- 15 100
I don't fully understand your sample dataset, but I think this can help solve your issue.
I would suggest using the janitor package to append your column or row totals.
See the sample below:
library(janitor)
library(dplyr)  # needed for select() and the pipe
set.seed(10)
df_sample<- sample(1:nrow(iris), 10)
df<-iris[df_sample, ]
#This would sum all the rows together and return total
df%>%
select(Species,Sepal.Width, Petal.Length, Petal.Width)%>%
adorn_totals(where = "row")
#This would sum all columns and return total
df%>%
select(Species,Sepal.Width, Petal.Length, Petal.Width)%>%
adorn_totals(where = "col")
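Both totals can also be added in a single call, since adorn_totals accepts a vector for where:
df %>%
select(Species, Sepal.Width, Petal.Length, Petal.Width) %>%
adorn_totals(where = c("row", "col"))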
I hope that this answered your question.
As a hacked mod to akrun's (since deleted) answer, here's a custom printing function that works around data.table's omission of row names.
prettyDT <- function(x, ...) {
  # capture data.table's printed output as a character vector
  out <- capture.output(data.table:::print.data.table(x, ...))
  nms <- rownames(x)
  # locate the leading row numbers (e.g. "1:") on each printed line
  gre <- gregexpr("^([0-9]+)(?=:)", out, perl = TRUE)
  # map each printed row number to the corresponding row name
  newnms <- nms[as.integer(regmatches(out, gre), nms)]
  wids <- nchar(newnms)
  # right-align the row names to a common width
  newnms[!is.na(wids)] <- sprintf(paste0("%", max(wids, na.rm = TRUE), "s"), newnms[!is.na(wids)])
  # substitute the row names for the row numbers
  regmatches(out, gre)[!is.na(wids)] <- newnms[!is.na(wids)]
  # pad the unnumbered lines (header etc.) so the columns still line up
  pre <- strrep(" ", diff(range(wids, na.rm = TRUE)))
  out[is.na(wids)] <- paste0(pre, out[is.na(wids)])
  cat(out, sep = "\n")
}
With this, we can do:
out <- rbindlist(list(
DT,
DT[, c(.(Group1 = "--"), lapply(.SD, sum)), .SDcols = c("N", "%")]
))
rownames(out)[nrow(out)] <- "Colsum"
prettyDT(out)
# Group1 N %
# <char> <int> <int>
# 1: 2 6 40
# 2: 1 6 40
# 3: 3 2 13
# 4: 5 1 7
# Colsum: -- 15 100
Admittedly, this is a bit of a hack, and it requires explicitly calling a user-defined function to get the desired output.
Data
DT <- setDT(structure(list(Group1 = c("2", "1", "3", "5"), N = c(6L, 6L, 2L, 1L), "%" = c(40L, 40L, 13L, 7L)), class = c("data.table", "data.frame"), row.names = c(NA, -4L)))
I have a dataset with names, dates, and several categorical columns. Let's say
data <- data.table(name = c('Anne', 'Ben', 'Cal', 'Anne', 'Ben', 'Cal', 'Anne', 'Ben', 'Ben', 'Ben', 'Cal'),
period = c(1,1,1,1,1,1,2,2,2,3,3),
category = c("A","A","A","B","B","B","A","B","A","B","A"))
Which looks like this:
name period category
Anne 1 A
Ben 1 A
Cal 1 A
Anne 1 B
Ben 1 B
Cal 1 B
Anne 2 A
Ben 2 B
Ben 2 A
Ben 3 A
Cal 3 B
I want to compute, for each period, how many names were present in the past period, for every group of my categorical variables. The output should be as follows:
period category recurrence_count
2 A 2 # due to Anne and Ben being on A, period 1
2 B 1 # due to Ben being on B, period 1
3 A 1 # due to Ben being on A, period 2
3 B 0 # no match from B, period 2
I am aware of the .I and .GRP operators in data.table, but I have no idea how to write the notion of 'next group' in the j entry of my statement. I imagine something like this might be a reasonable path, but I can't figure out the correct syntax:
data[, .(recurrence_count = length(intersect(name, name[last(.GRP)]))), by = .(category, period)]
You can first summarize your data by category and period.
previous_period_names <- data[, .(names = list(name)), .(category, period)]
previous_period_names[, next_period := period + 1]
Join your summary with your original data.
data[previous_period_names, names := i.names, on = c('category', 'period==next_period')]
Now count how many of the names appear in the summarized previous-period names.
data[, .(recurrence_count = sum(name %in% unlist(names))), by = .(period, category)]
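Putting the three steps together on the example data should give (expected output, derived by hand; period 1 has no previous period, so it counts 0):
# period category recurrence_count
# 1: 1 A 0
# 2: 1 B 0
# 3: 2 A 2
# 4: 2 B 1
# 5: 3 A 1
# 6: 3 B 0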
Another data.table alternative. For rows that can have a previous period (period != 1), create such a variable (prev_period := period - 1).
Join the original data with a subset that has values for 'prev_period' (data[data[!is.na(prev_period)]]). Join on 'category', 'period = prev_period' and 'name'.
In the resulting data set, for each 'period' and 'category' (by = .(period = i.period, category)), count the number of names from original data (x.name) that had a match with previous period (length(na.omit(x.name))).
data[period != 1, prev_period := period - 1]
data[data[!is.na(prev_period)], on = c("category", period = "prev_period", "name"),
.(category, i.period, x.name)][
, .(n = length(na.omit(x.name))), by = .(period = i.period, category)]
# period category n
# 1: 2 A 2
# 2: 2 B 1
# 3: 3 A 1
# 4: 3 B 0
One option in base R is to split 'data' by 'category', then loop over the list (lapply). For each category, use Reduce with intersect on 'name' split by 'period', with accumulate = TRUE, and take the lengths of the result. Build a data.frame with the unique elements of 'period', use Map to add the 'category' from the names of the list output, and rbind the list of data.frames into a single dataset:
library(data.table)
lst1 <- lapply(split(data, data$category), function(x)
data.frame(period = unique(x$period)[-1],
recurrence_count = lengths(Reduce(intersect,
split(x$name, x$period), accumulate = TRUE)[-1])))
rbindlist(Map(cbind, category = names(lst1), lst1))[
order(period), .(period, category, recurrence_count)]
# period category recurrence_count
#1: 2 A 2
#2: 2 B 1
#3: 3 A 1
#4: 3 B 0
Or, using the same logic within data.table: grouped by 'category', split 'name' by 'period' and apply Reduce with intersect:
setDT(data)[, .(period = unique(period),
recurrence_count = lengths(Reduce(intersect,
split(name, period), accumulate = TRUE))), .(category)][duplicated(category)]
# category period recurrence_count
#1: A 2 2
#2: A 3 1
#3: B 2 1
#4: B 3 0
Or a similar option in the tidyverse:
library(dplyr)
library(purrr)
data %>%
group_by(category) %>%
summarise(recurrence_count = lengths(accumulate(split(name, period),
intersect)), period = unique(period), .groups = 'drop' ) %>%
filter(duplicated(category))
# A tibble: 4 x 3
# category recurrence_count period
# <chr> <int> <int>
#1 A 2 2
#2 A 1 3
#3 B 1 2
#4 B 0 3
Data
data <- structure(list(name = c("Anne", "Ben", "Cal", "Anne", "Ben",
"Cal", "Anne", "Ben", "Ben", "Ben", "Cal"), period = c(1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L), category = c("A", "A", "A",
"B", "B", "B", "A", "B", "A", "A", "B")), class = "data.frame",
row.names = c(NA,
-11L))
A data.table option
setDT(df)[
,
{
u <- split(name, period)
data.table(
period = unique(period)[-1],
recurrence_count = lengths(
Map(
intersect,
head(u, -1),
tail(u, -1)
)
)
)
},
category
]
gives
category period recurrence_count
1: A 2 2
2: A 3 1
3: B 2 1
4: B 3 0
I want to transform an R data frame factor into indicator variables, using some column as an index.
Given following representation
StudentID Subject
1 A
1 B
2 A
2 C
3 A
3 B
I need following representation using StudentID as index
StudentID SubjectA SubjectB SubjectC
1 1 1 0
2 1 0 1
3 1 1 0
We can use table
table(df1)
# Subject
#StudentID A B C
# 1 1 1 0
# 2 1 0 1
# 3 1 1 0
If we need a data.frame
as.data.frame.matrix(table(df1))
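Note that table gives counts rather than indicators; if a StudentID/Subject pair could appear more than once, convert to 0/1 explicitly:
+(table(df1) > 0)  # logical comparison, then coerce TRUE/FALSE to 1/0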
Here's how I got it, using dcast from reshape2 as suggested in the comment above:
library(reshape2)
ID <- c(1, 1, 2, 2, 3, 3)
Subject <- c('A', 'B', 'A', 'C', 'A', 'B')
data <- data.frame(ID, Subject)
data <- dcast(data, ID ~ Subject)
data[is.na(data)] <- 0
f <- function(x) {
x <- gsub('[A-Z]', 1, x)
}
as.data.frame(apply(data, 2, f))
# ID A B C
#1 1 1 1 0
#2 2 1 0 1
#3 3 1 1 0
Now that I look at this solution, it may not be very efficient, but it is more flexible than some of the other solutions. There might also be a way to use data.table directly, but I cannot figure it out. This might help, though:
library(data.table)
df <- structure(list(StudentID = c(1, 1, 2, 2, 3, 3),
Subject = structure(c(1L,
2L, 1L, 3L, 1L, 2L), .Label = c("A", "B", "C"), class = "factor")), .Names = c("StudentID",
"Subject"), row.names = c(NA, -6L), class = "data.frame")
df <- data.table(df)
### here we pull the unique subjects to match against
subjects <- as.character(unique(df$Subject))
### here we group by student ID's and paste which Subjects exist
x <- df[,list("Values"=paste(Subject,collapse="_")),by=StudentID]
### then we go through each one and try to match it to the unique vector
tmp <- strsplit(x$Values,"_")
res <- do.call(rbind, lapply(tmp, function(i) match(subjects, i)))
### change the results to the indicator variable desired
res[!is.na(res)] <- 1
res[is.na(res)] <- 0
res <- data.frame("StudentID"=x$StudentID,res)
colnames(res) <- c("StudentID", subjects)
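which gives the desired indicator matrix:
# StudentID A B C
#1 1 1 1 0
#2 2 1 0 1
#3 3 1 1 0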
I have a dataset that is roughly structured like this:
case Year 2001 2002 2003 2004
1 2003 0 0 0 3
2 2002 0 5 3 2
3 2001 3 3 2 2
I am trying to restructure it so that every column represents the first, second (etc.) year counting from the "Year" variable, i.e.:
case Year yr1 yr2 yr3 yr4
1 2003 0 3 0 0
2 2002 5 3 2 0
3 2001 3 3 2 2
This code downloads the dataset and tries the solution suggested by @akrun, but it fails.
library("devtools")
df1 <- source_gist("b4c44aa67bfbcd6b72b9")
df1[-(1:2)] <- do.call(rbind, lapply(seq_len(nrow(df1)), function(i) {
  x <- df1[i, ]
  x1 <- unlist(x[-(1:2)])
  indx <- which(!is.na(x1))[1]
  i <- as.numeric(names(indx)) - x[, 2] + 1
  x2 <- x1[!is.na(x1)]
  x3 <- rep(NA, length(x1))
  x3[i:(i + length(x2) - 1)] <- x2
  x3
}))
This generates:
Error in i:(i + length(x2) - 1) : NA/NaN argument
In addition: Warning message:
In FUN(1:234[[1L]], ...) : NAs introduced by coercion
How can I transform the data so that every column represents the first, second (etc.) year counting from the value in the "Year" variable for each row?
Here's a possibility:
library(dplyr)
library(reshape2)
df %>%
melt(id.vars = c("case", "Year")) %>%
mutate(variable = as.numeric(as.character(variable)),
yr = variable - Year + 1) %>%
filter(variable >= Year) %>%
dcast(case + Year ~ yr, fill = 0)
# case Year 1 2 3 4
# 1 1 2003 0 3 0 0
# 2 2 2002 5 3 2 0
# 3 3 2001 3 3 2 2
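If the yr1..yr4 column names from the question are wanted, they can be added after casting (assuming the pipeline's result was assigned to res):
names(res)[-(1:2)] <- paste0("yr", names(res)[-(1:2)])  # res is the dcast result above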
Data:
df <- structure(list(case = 1:3, Year = c(2003L, 2002L, 2001L), `2001` = c(0L,
0L, 3L), `2002` = c(0L, 5L, 3L), `2003` = c(0L, 3L, 2L), `2004` = c(3L,
2L, 2L)), .Names = c("case", "Year", "2001", "2002", "2003",
"2004"), class = "data.frame", row.names = c(NA, -3L))
This should perform the manipulation you are looking for.
library("devtools")
df1 <- source_gist("b4c44aa67bfbcd6b72b9")
temp <- df1[[1]]
library(dplyr); library(tidyr); library(stringi)
temp <- temp %>%
gather(new.Years, X, -Year) %>% # convert rows to one column
mutate(Year.temp=paste0(rownames(temp), "-", Year)) %>% # concatenate the Year with row number to make them unique
mutate(new.Years = as.numeric(gsub("X", "", new.Years)), diff = new.Years-Year+1) %>% # calculate the difference to get the yr0 yr1 and so on
mutate(diff=paste0("yr", stri_sub(paste0("0", (ifelse(diff>0, diff, 0))), -2, -1))) %>% # convert the differences in Yr01 ...
select(-new.Years) %>% filter(diff != "yr00") %>% # drop new.Years column
spread(diff, X) %>% # convert column to rows
select(-Year.temp) # Drop Year.temp column
temp[is.na(temp)] <- 0 # replace NA with 0
temp %>% View
Notice that this will work for up to 99 years.
Here's a data.table solution:
require(data.table)
require(reshape2)
dt.m = melt(dt, id = 1:2, variable.factor = FALSE)
dt.m[, variable := as.integer(variable)-Year+1L]
dcast.data.table(dt.m, case + Year ~ variable, fill=0L,
value.var = "value", subset = (variable > 0L))
# case Year 1 2 3 4
# 1: 1 2003 0 3 0 0
# 2: 2 2002 5 3 2 0
# 3: 3 2001 3 3 2 2
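This assumes dt holds the question's data as a data.table, e.g. (using the df from the Data block above):
dt <- as.data.table(df)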
library("devtools")
df1 <- source_gist("b4c44aa67bfbcd6b72b9")$value
The column names have an X prefix, which I remove:
colnames(df1) <- gsub("X", "", colnames(df1))
Here is a solution without any additional packages:
startYear <- as.numeric(colnames(df1)[2])
shifts <- df1$Year - startYear
n <- ncol(df1)
df2 <- df1
colnames(df2)[-1] <- 1:(n-1)
df2[,2:n] <- NA
for(row in 1:nrow(df1)){
if(shifts[row]>=0){
df2[row,2:(n-shifts[row])] <- df1[row, (shifts[row]+2):n]
#df2[row,2:(n-shifts[row])] <- colnames(df1)[(shifts[row]+2):n]
}else{
df2[row, (-shifts[row]+2):n] <- df1[row, 2:(n+shifts[row])]
#df2[row, (-shifts[row]+2):n] <- colnames(df1)[2:(n+shifts[row])]
}
}
You can prefill df2 with 0 instead of NA, of course. Uncomment the second line and comment out the first line in each branch of the if/else to validate the permutation.
Hope it does what you wanted.
I am trying to replace the NAs in "test" with the forecast values in "forecast". I am trying to use match, but I can't figure it out. Keep in mind that id and time together form a two-part unique key, and that my real data set is much larger than this example (about 32,000 rows). Any suggestions?
test = data.frame(id =c(1,1,1,2,2,2), time=c(89,99,109,89,99,109), data=c(3,4,NA,5,2,NA))
forecast = data.frame(id =c(1,2), time=c(109,109), data=c(5,1))
Desired output
out = data.frame(id =c(1,1,1,2,2,2), time=c(89,99,109,89,99,109), data=c(3,4,5,5,2,1))
Here is the data.table solution
test_dt <- data.table(test, key = c('id', 'time'))
forecast_dt <- data.table(forecast, key = c('id', 'time'))
forecast_dt[test_dt][, data := ifelse(is.na(data), i.data, data)]  # i.data is test's data column after the join
EDIT. Benchmarking tests: data.table is ~3x faster even for a small dataset.
library(rbenchmark)
f_merge <- function(){
out2 <- merge(test, forecast, by = c("id", "time"), all.x = TRUE)
out2 <- transform(out2,
newdata = ifelse(is.na(data.x), data.y, data.x), data.x = NULL, data.y = NULL)
return(out2)
}
f_dtable <- function(){
test <- data.table(test, key = c('id', 'time'))
forecast <- data.table(forecast, key = c('id', 'time'))
test <- forecast[test][, data := ifelse(is.na(data), i.data, data)]
test$i.data <- NULL
return(test)
}
benchmark(f_merge(), f_dtable(), order = 'relative',
columns = c('test', 'elapsed', 'relative'))
test elapsed relative
2 f_dtable() 0.86 1.00
1 f_merge() 2.26 2.63
I would use merge to join the data together and then compute your new column in two steps:
out2 <- merge(test, forecast, by = c("id", "time"), all.x = TRUE)
> out2
id time data.x data.y
1 1 89 3 NA
2 1 99 4 NA
3 1 109 NA 5
4 2 89 5 NA
5 2 99 2 NA
6 2 109 NA 1
#Compute new variable and clean up old ones:
out2 <- transform(out2, newdata = ifelse(is.na(data.x), data.y, data.x), data.x = NULL, data.y = NULL)
> out2
id time newdata
1 1 89 3
2 1 99 4
3 1 109 5
4 2 89 5
5 2 99 2
6 2 109 1
Try this (note that it relies on the matching forecast rows appearing in the same order as the NAs in test, so it is fragile if the rows are not sorted):
test$data[is.na(test$data)] <- forecast[((forecast$id %in% test$id) & (forecast$time %in% test$time)),]$data
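For reference, here is a sketch of the match-based approach the question was aiming for; it builds a composite id+time key on both tables, so it does not depend on row order (it assumes each id/time pair is unique):
idx <- match(paste(test$id, test$time), paste(forecast$id, forecast$time))
na_rows <- is.na(test$data)
test$data[na_rows] <- forecast$data[idx[na_rows]]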