data.table: do not compute NA groups in by - R

This question has a partial answer here, but that question is too specific and I'm not able to apply it to my own problem.
I would like to skip a potentially heavy computation for the NA group when using by.
library(data.table)
DT = data.table(X = sample(10),
                Y = sample(10),
                g1 = sample(letters[1:2], 10, TRUE),
                g2 = sample(letters[1:2], 10, TRUE))
set(DT, 1L, 3L, NA)
set(DT, 1L, 4L, NA)
set(DT, 6L, 3L, NA)
set(DT, 6L, 4L, NA)
DT[, mean(X*Y), by = .(g1,g2)]
Here we can see there are up to 5 groups, including the (NA, NA) group. Considering that (i) that group is useless, (ii) the groups can be very big, and (iii) the actual computation is more complex than mean(X*Y), can I skip the group in an efficient way, i.e. without creating a copy of the remaining table? Indeed, the following works:
DT2 = data.table:::na.omit.data.table(DT, cols = c("g1", "g2"))
DT2[, mean(X*Y), by = .(g1,g2)]

You can use an if clause:
DT[, if (!anyNA(.BY)) mean(X*Y), by = .(g1,g2)]
   g1 g2       V1
1:  b  a 25.75000
2:  a  b 24.00000
3:  b  b 35.33333
From the ?.BY help:
.BY is a list containing a length 1 vector for each item in by. This can be useful [...] to branch with if() depending on the value of a group variable.
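The same branching works for any group, not just the NA one. A small hedged sketch (the rule below is purely hypothetical, for illustration): groups for which j returns NULL are simply dropped from the result.
# hypothetical rule: skip the NA group and also any group where g1 == "a"
DT[, if (!anyNA(.BY) && .BY$g1 != "a") mean(X*Y), by = .(g1, g2)]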

Related

How to conditionally summarize on other entries in the group - R

In my dataset I have Cartesian coordinates of different items over time, identified by an EventID, event_type, ID number, x position, y position, identity type, broad category, and frame id number. For each EventID/event_type pair and frame id number, I need to go through each ID number and work out which other ID number with a different broad category has the minimum distance from the current row. I would like to avoid using for loops for this because the dataset is several million lines long.
I tried formulating this as a group_by and summarize call using dplyr, but couldn't quite wrap my head around how to compare the current row's x and y against all the other x and y values and then choose the conditional minimum.
two_dim_euclid = function(x1, x2, y1, y2){
  a <- sqrt((x1 - x2)^2 + (y1 - y2)^2)
  return(a)
}
# Example Data
df <- data.frame(stringsAsFactors = FALSE,
                 EventID = c(1003, 1003, 1003, 1003),
                 event_type = c(893, 893, 893, 893),
                 ID_number = c(80427, 2346, 24954, 27765),
                 x = c(86.07, 72.4, 43.08, 80.13),
                 y = c(35.58, 26.43, 34.8, 34.79),
                 identity_type = c("A", "C", "B", "B"),
                 broad_category = c("set1", "set1", "set2", "set2"),
                 frame_id = c(1, 1, 1, 1))
df
#  EventID event_type ID_number     x     y identity_type broad_category frame_id
#1    1003        893     80427 86.07 35.58             A           set1        1
#2    1003        893      2346 72.40 26.43             C           set1        1
#3    1003        893     24954 43.08 34.80             B           set2        1
#4    1003        893     27765 80.13 34.79             B           set2        1
The expected result for row 1 would be 5.992303: it looks for all the entries not belonging to set1 with the same EventID, event_type, and frame_id, and returns the minimum Euclidean distance given those parameters.
Also, I want to do this for every entry with identity_type A. But identity_type and broad_category are not always tied together: A can belong to either set1 or set2.
Here's a base way that relies on dist().
# full pairwise distance matrix
res <- as.matrix(dist(cbind(df$x, df$y)))
res[res == 0] <- Inf   # mask the zero self-distances on the diagonal
apply(res, 1, min)     # minimum distance for each row
        1         2         3         4
 5.992303 11.386066 30.491299  5.992303
# or potentially more performant
res[cbind(seq_len(nrow(res)), max.col(-res))]
[1] 5.992303 11.386066 30.491299 5.992303
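Note that dist() here takes the minimum over all other points, which happens to match the expected output for this example. To enforce the different-broad_category requirement explicitly, a hedged sketch would mask same-category pairs (not just self-distances) before taking the row minima:
res <- as.matrix(dist(cbind(df$x, df$y)))
# mask every pair within the same broad_category, including self-distances
res[outer(df$broad_category, df$broad_category, `==`)] <- Inf
apply(res, 1, min)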
A potential way with data.table would be to do a Cartesian join, but it would need a lot of memory and would likely be slower:
library(data.table)
dt <- as.data.table(df)
dt[, ID := .I]
CJ.dt = function(X, Y) {
  stopifnot(is.data.table(X), is.data.table(Y))
  k = NULL
  X = X[, c(k = 1, .SD)]
  setkey(X, k)
  Y = Y[, c(k = 1, .SD)]
  setkey(Y, NULL)
  X[Y, allow.cartesian = TRUE][, k := NULL][]
}
CJ.dt(dt, dt)[ID != i.ID, min(sqrt((x - i.x)^2 + (y-i.y)^2)), by = i.ID]
   i.ID        V1
1:    1  5.992303
2:    2 11.386066
3:    3 30.491299
4:    4  5.992303
For data.table cartesian join, see here:
R: data.table cross-join not working
While I'm not sure about your criteria, it seems that you must use for loops in some way if you want to iterate. I'm sure others can provide Rcpp solutions that are very quick. In the meantime, here is one possible way with base R.
# In the future, please provide the code to create your example data
dat <- structure(list(EventID = c(1003L, 1003L, 1003L, 1003L),
                      event_type = c(893L, 893L, 893L, 893L),
                      ID_number = c(80427L, 2346L, 24954L, 27765L),
                      x = c(86.07, 72.4, 43.08, 80.13),
                      y = c(35.58, 26.43, 34.8, 34.79),
                      identity_type = structure(c(1L, 3L, 2L, 2L),
                                                .Label = c("A", "B", "C"),
                                                class = "factor"),
                      broad_category = structure(c(1L, 1L, 2L, 2L),
                                                 .Label = c("set1", "set2"),
                                                 class = "factor"),
                      frame_id = c(1L, 1L, 1L, 1L)),
                 .Names = c("EventID", "event_type", "ID_number", "x", "y",
                            "identity_type", "broad_category", "frame_id"),
                 class = "data.frame", row.names = c("1", "2", "3", "4"))
# Define your criteria here
dat$uniqueID <- paste0(dat$EventID, dat$event_type, dat$frame_id, dat$broad_category)
# Changed the function to take two 2-dimensional vectors, since that's simpler to pass in
two_dim_euclid = function(a, b) return(sqrt((a[1] - b[1])^2 + (a[2] - b[2])^2))
n <- nrow(dat)
vec <- numeric(n)
for(i in 1:n){
  # min() rather than sum(): we want the smallest distance to any row outside
  # the current row's group
  vec[i] = min(apply(dat[dat$uniqueID != dat$uniqueID[i], c("x", "y")], 1,
                     function(r) two_dim_euclid(dat[i, c("x", "y")], r)),
               na.rm = TRUE)
  if(i %% 10000 == 0) cat(i, "completed...\n") # progress check since >1 mil rows
}
dat$result <- vec

Get single column of values comparing multiple columns

I have just started my journey with R. I want to test values across multiple columns for the same condition and return 5 if any of the values within a row is "hello":
result = ifelse((myData[1] == "hello") | (myData[2] == "hello") | (myData[3] == "hello"), 5, 0)
This works fine, but the code seems redundant. When I do:
resultSec = ifelse(myData[1:3] == "hello", 5, 0)
then all 3 columns are checked against the condition, but the result I get is not a single column; it is 3 columns. So I would then have to perform an additional comparison across all the columns, which ends up being more lines of code than the first, redundant method.
How can I get a single column of values in an efficient way?
You can use the function apply() to iterate over a data.frame or matrix, by either columns or rows. The MARGIN argument determines which one you use.
Here we want to check the rows, so we use MARGIN = 1:
dat <- data.frame(col1 = c("happy", "sad", "mad"),
                  col2 = c("tired", "sleepy", "happy"),
                  col3 = c("relaxed", "focused", "fine"))
dat$res <- apply(X = dat, MARGIN = 1,
                 FUN = function(x) ifelse("happy" %in% x, 5, 0))
dat
   col1   col2    col3 res
1 happy  tired relaxed   5
2   sad sleepy focused   0
3   mad  happy    fine   5
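One caveat worth knowing: apply() coerces the data frame to a character matrix before iterating. A hedged column-wise alternative that avoids the coercion:
# compare each column to "happy", then OR the logical vectors row-wise
dat$res <- ifelse(Reduce(`|`, lapply(dat[1:3], `==`, "happy")), 5, 0)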
We can use rowSums here:
df1$res <- rowSums(df1 == "happy") * 5
df1$res
#[1] 5 0 5
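One hedged caveat: if a row could contain "happy" in more than one column, rowSums would exceed 1 and the result would exceed 5. Comparing the count to zero guards against that:
df1$res <- (rowSums(df1 == "happy") > 0) * 5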
data
df1 <- structure(list(col1 = structure(c(1L, 3L, 2L), .Label = c("happy",
"mad", "sad"), class = "factor"), col2 = structure(c(3L, 2L,
1L), .Label = c("happy", "sleepy", "tired"), class = "factor"),
col3 = structure(c(3L, 2L, 1L), .Label = c("fine", "focused",
"relaxed"), class = "factor")), .Names = c("col1", "col2",
"col3"), row.names = c(NA, -3L), class = "data.frame")

Apply a function over several columns

I am trying to use values from a look-up table to multiply corresponding values in a main table.
This is an example of some data.
The look-up table:
lu = structure(list(year = 0:12, val = c(1.6422, 1.6087, 1.5909, 1.4456,
1.4739, 1.4629, 1.467, 1.4619, 1.2588, 1.1233, 1.1664, 1.1527,
1.2337)), .Names = c("year", "val"), class = "data.frame", row.names = c(NA,
-13L))
Main data
dt = structure(list(year = c(3L, 4L, 6L, 10L, 3L, 9L, 10L, 7L, 7L,
1L), x = 1:10, y = 1:10), .Names = c("year", "x", "y"), row.names = c(NA,
-10L), class = c("data.table", "data.frame"))
I can produce the results I want by merging and then multiplying one column at a time
library(data.table)
dt = merge(dt, lu, by = "year")
dt[, xnew := x*val][, ynew := y*val]
However, I have many variables to apply this over. There have been many questions on this, but I cannot get it to work.
Using ideas from How to apply same function to every specified column in a data.table, and R Datatable, apply a function to a subset of columns, I tried:
dt[, (c("xnew", "ynew")):=lapply(.SD, function(i) i* val), .SDcols=c("x", "y")]
Error in FUN(X[[i]], ...) : object 'val' not found
for (j in c("x", "y")) set(dt, j = j, value = val* dat[[j]])
Error in set(dt, j = j, value = val * dt[[j]]) : object 'val' not found
And just trying the multiplication without assigning (from Data table - apply the same function on several columns to create new data table columns) also didn't work:
dt[, lapply(.SD, function(i) i* val), .SDcols=c("x", "y")]
Error in FUN(X[[i]], ...) : object 'val' not found
Please could you point out my error? Thanks.
I'm using data.table version 1.9.6.
We can do a join first and then specify .SDcols:
dt[lu, on = .(year), nomatch = 0
   ][, c("x_new", "y_new") := lapply(.SD, `*`, val), .SDcols = x:y][]

R Creating Dynamic variables from group aggregated set of DataFrames

My problem statement is: I have a list of data frames df1, df2, df3. The data are like:
df1
a,b,c,d
1,2,3,4
1,2,3,4
df2
a,b,c,d
1,2,3,4
1,2,3,4
Now, for these two data frames I want to create new values from an aggregated column of each data frame; for that I am using the code below:
for(i in 1:2){
  assign(paste(final_val, i, sep = ''), sum(assign(paste(df, i, sep = '')))$d * 100)
}
I am getting the error:
Error in assign(paste(hvp_route_dsct_clust, i, sep = "")) :
argument "value" is missing, with no default
My output should look like:
final_val1 <- 800
final_val2 <- 800
And from those values final_val1, final_val2 I should be creating a data frame dynamically.
Can anybody please help me with this?
If we need to use assign, get the object names from the global environment with ls(), specifying the pattern 'df' followed by one or more digits (\\d+). Create another vector of 'final_val' names ('nm2'), then loop through the sequence of 'nm1' and assign to each element of 'nm2' the sum of column 'd' of the corresponding 'df' multiplied by 100.
nm1 <- ls(pattern = "df\\d+")
nm2 <- paste0("final_val", seq_along(nm1))
for(i in seq_along(nm1)){
  assign(nm2[i], sum(get(nm1[i])$d * 100))
}
final_val1
#[1] 800
final_val2
#[1] 800
Otherwise, we place the datasets in a list, extract the 'd' column, multiply by 100, and take the column sums:
unname(colSums(sapply(mget(nm1), `[[`, 'd') * 100))
#[1] 800 800
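As a further hedged alternative, storing the results in a named vector avoids assign()/get() entirely and is usually easier to work with downstream:
final_vals <- vapply(mget(nm1), function(d) sum(d$d * 100), numeric(1))
final_vals
# df1 df2
# 800 800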
data
df1 <- structure(list(a = c(1L, 1L), b = c(2L, 2L), c = c(3L, 3L), d = c(4L,
4L)), .Names = c("a", "b", "c", "d"), class = "data.frame", row.names = c(NA,
-2L))
df2 <- structure(list(a = c(1L, 1L), b = c(2L, 2L), c = c(3L, 3L), d = c(4L,
4L)), .Names = c("a", "b", "c", "d"), class = "data.frame", row.names = c(NA,
-2L))

R: remove rows based on two columns' similarity check

Input
row.no column2 column3 column4
     1      bb      ee      up
     2      bb      ee    down
     3      bb      ee      up
     4      bb      yy    down
     5      bb      zz      up
I have a rule to remove rows 1, 2 and 3: while column2 and column3 are the same for rows 1, 2 and 3, contradictory data (up and down) are found in column4.
How can I ask R to remove those rows that have the same values in column2 and column3 but contradicting values in column4, to produce a matrix as follows:
row.no column2 column3 column4
     4      bb      yy    down
     5      bb      zz      up
The functions in package plyr really shine at this type of problem. Here is a solution using two lines of code.
Set up the data (kindly provided by @GavinSimpson):
dat <- structure(list(row.no = 1:5, column2 = structure(c(1L, 1L, 1L,
1L, 1L), .Label = "bb", class = "factor"), column3 = structure(c(1L,
1L, 1L, 2L, 3L), .Label = c("ee", "yy", "zz"), class = "factor"),
column4 = structure(c(2L, 1L, 2L, 1L, 2L), .Label = c("down",
"up"), class = "factor")), .Names = c("row.no", "column2",
"column3", "column4"), class = "data.frame", row.names = c(NA,
-5L))
Load the plyr package
library(plyr)
Use ddply to split, analyse and combine dat. The following code splits dat by each unique combination of column2 and column3. I then add a column called unique, which holds the number of distinct values of column4 in each group. Finally, a simple subset returns only those lines where unique == 1, dropping column 5.
df <- ddply(dat, .(column2, column3), transform,
            row.no = row.no, unique = length(unique(column4)))
df[df$unique==1, -5]
And the results:
  row.no column2 column3 column4
4      4      bb      yy    down
5      5      bb      zz      up
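For comparison, a hedged base-R sketch of the same idea, using ave() to count the distinct column4 values per (column2, column3) group:
n_distinct <- ave(as.integer(dat$column4), dat$column2, dat$column3,
                  FUN = function(v) length(unique(v)))
dat[n_distinct == 1, ]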
Here is one potential, if somewhat inelegant, solution
out <- with(dat, split(dat, interaction(column2, column3)))
out <- lapply(out, function(x) if(NROW(x) > 1) {NULL} else {data.frame(x)})
out <- out[!sapply(out, is.null)]
do.call(rbind, out)
Which gives:
> do.call(rbind, out)
      row.no column2 column3 column4
bb.yy      4      bb      yy    down
bb.zz      5      bb      zz      up
Some explanation, line by line:
Line 1: splits the data into a list, each component of which is a data frame with rows corresponding to groups formed by unique combinations of column2 and column3.
Line 2: iterate over the result from Line 1; if there is more than one row in the data frame, return NULL; if not, return the 1-row data frame.
Line 3: iterate over the output from Line 2, returning only the non-NULL components.
Line 4: bind, row-wise, the output from Line 3, which we arrange via do.call().
This can be simplified to two lines, combining Lines 1-3 into a single line:
out <- lapply(with(dat, split(dat, interaction(column2, column3))),
              function(x) if(NROW(x) > 1) {NULL} else {data.frame(x)})
do.call(rbind, out[!sapply(out, is.null)])
The above was all done with:
dat <- structure(list(row.no = 1:5, column2 = structure(c(1L, 1L, 1L,
1L, 1L), .Label = "bb", class = "factor"), column3 = structure(c(1L,
1L, 1L, 2L, 3L), .Label = c("ee", "yy", "zz"), class = "factor"),
column4 = structure(c(2L, 1L, 2L, 1L, 2L), .Label = c("down",
"up"), class = "factor")), .Names = c("row.no", "column2",
"column3", "column4"), class = "data.frame", row.names = c(NA,
-5L))
Gavin keeps raising the bar on the quality of answers. Here's my attempt.
# This is one way of importing the data into R
sally <- textConnection("row.no column2 column3 column4
1 bb ee up
2 bb ee down
3 bb ee up
4 bb yy down
5 bb zz up")
sally <- read.table(sally, header = TRUE)
# Order the data frame to make rle work its magic
sally <- sally[order(sally$column3, sally$column4), ]
# Find which values are repeating
sally.rle2 <- rle(as.character(sally$column2))
sally.rle3 <- rle(as.character(sally$column3))
sally.rle4 <- rle(as.character(sally$column4))
sally.can.wait2 <- sally.rle2$values[which(sally.rle2$lengths != 1)]
sally.can.wait3 <- sally.rle3$values[which(sally.rle3$lengths != 1)]
sally.can.wait4 <- sally.rle4$values[which(sally.rle4$lengths != 1)]
# Find which lines have values that are repeating
dup <- c(which(sally$column2 == sally.can.wait2),
         which(sally$column3 == sally.can.wait3),
         which(sally$column4 == sally.can.wait4))
dup <- dup[duplicated(dup)]
# Display the lines that have no repeating values
sally[-dup, ]
You can try one of the following two methods. Suppose the table is called 'table1'.
Method 1
repeated_rows = c()
for (i in 1:(nrow(table1) - 1)){
  for (j in (i + 1):nrow(table1)){
    if (sum(table1[i, 2:3] == table1[j, 2:3]) == 2){
      repeated_rows = c(repeated_rows, i, j)
    }
  }
}
repeated_rows = unique(repeated_rows)
table1[-repeated_rows, ]
Method 2
duplicates = duplicated(table1[, 2:3])
for (i in 1:length(duplicates)){
  if (duplicates[i]){
    for (j in 1:nrow(table1)){
      if (sum(table1[i, 2:3] == table1[j, 2:3]) == 2){
        duplicates[j] = TRUE
      }
    }
  }
}
table1[!duplicates, ]
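For reference, a hedged vectorized equivalent of Method 2: duplicated() scanned from both ends flags every row whose (column2, column3) pair occurs more than once, without the nested loops.
dups <- duplicated(table1[, 2:3]) | duplicated(table1[, 2:3], fromLast = TRUE)
table1[!dups, ]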
