R long-format table to multi-dimensional array

I have a table in the long format:
require(data.table)
sampleDT <- data.table(Old = c("A","B","A","B","A","B","A","B")
, New = c("A","A","B","B","A","A","B","B")
, Time = c(1,1,1,1,2,2,2,2)
, value1 = c(1,1,1,1,1,1,1,1)
, value2 = c(0,0,0,0,0,0,0,0))
print(sampleDT)
   Old New Time value1 value2
1:   A   A    1      1      0
2:   B   A    1      1      0
3:   A   B    1      1      0
4:   B   B    1      1      0
5:   A   A    2      1      0
6:   B   A    2      1      0
7:   A   B    2      1      0
8:   B   B    2      1      0
I would like to convert this to a 3-dimensional array, with the columns "New", "Old", and "Time" as the three dimensions. The value of each cell is the output of some function whose inputs are value1 and value2. In this case, when Time = 1, the result is:
matrix(data = c(1, 1+0, 0, -0), nrow = 2, ncol = 2, byrow = FALSE)
     [,1] [,2]
[1,]    1    0
[2,]    1    0
How can I achieve this?
Memory usage and computing time are important considerations, as we're working with relatively big datasets.

Try xtabs():
sampleDT <- data.frame(Old = c("A","B","A","B","A","B","A","B"),
New = c("A","A","B","B","A","A","B","B"),
Time = c(1,1,1,1,2,2,2,2),
value1 = c(1,1,1,1,1,1,1,1),
value2 = c(0,0,0,0,0,0,0,0))
Value1 <- xtabs(value1 ~ Old + New + Time, sampleDT, drop = FALSE)
Value2 <- xtabs(value2 ~ Old + New + Time, sampleDT, drop = FALSE)
is.array(Value1)
is.array(Value2)
Value1[, 2,] <- 0 # Sets all second columns to zero for Value1
Value2[1,,] <- 0 # Idem with first row for Value2
Value2[2,2,] <- Value2[2,2,] * (-1)
Result <- Value1 + Value2
Result
, , Time = 1

   New
Old A B
  A 1 0
  B 1 0

, , Time = 2

   New
Old A B
  A 1 0
  B 1 0
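Since memory usage and speed were flagged as concerns, the two xtabs() calls and the subsequent fix-ups can also be collapsed into one pass: compute the per-row cell value first, then tabulate once. A sketch, assuming the combining rule is exactly the one applied above (value1 zeroed when New == "B"; value2 zeroed when Old == "A" and negated when Old and New are both "B"):
sampleDT$cell <- with(sampleDT,
  value1 * (New == "A") +                             # value1 kept only where New = A
  value2 * (Old == "B") * ifelse(New == "B", -1, 1))  # value2 kept only where Old = B
Result2 <- xtabs(cell ~ Old + New + Time, sampleDT, drop = FALSE)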
Hope it helps.

Related

Correctly Using the Shift Function in Data.Table

I have this dataset in which students take an exam multiple times over a period of years - a "fail" is a 0 and a "pass" is a 1. The data looks something like this:
# Load the data.table package
library(data.table)
# Generate some sample data
id = sample.int(10000, 100000, replace = TRUE)
res = c(1,0)
results = sample(res, 100000, replace = TRUE)
date_exam_taken = sample(seq(as.Date('1999/01/01'), as.Date('2020/01/01'), by="day"), 100000, replace = TRUE)
# Create a data table from the sample data
my_data = data.table(id, results, date_exam_taken)
my_data <- my_data[order(id, date_exam_taken)]
# Generate some additional columns for each record
my_data$general_id = 1:nrow(my_data)
my_data$exam_number = ave(my_data$general_id, my_data$id, FUN = seq_along)
my_data$general_id = NULL
id results date_exam_taken exam_number
1: 1 0 2002-10-06 1
2: 1 1 2003-07-21 2
3: 1 1 2003-10-15 3
4: 1 0 2005-07-21 4
5: 1 1 2014-08-22 5
6: 1 1 2015-09-11 6
I want to track the number of times each student failed an exam, given that they failed the two previous exams (and all such combinations). I tried to do this with the data.table library in R:
# Create new columns that contain the previous exam results
my_data$prev_exam = shift(my_data$results, 1)
my_data$prev_2_exam = shift(my_data$results, 2)
my_data$prev_3_exam = shift(my_data$results, 3)
# Count the number of exam results for each record
out <- my_data[!is.na(prev_exam), .(tally = .N), by = .(id, results, prev_exam, prev_2_exam, prev_3_exam)]
out = na.omit(out)
> head(out)
id results prev_exam prev_2_exam prev_3_exam tally
1: 1 1 1 1 1 1
2: 1 0 1 1 1 3
3: 1 0 0 1 1 2
4: 1 1 0 0 1 1
5: 1 0 1 0 0 1
6: 1 1 0 1 0 1
Can someone please tell me if I have done this correctly? Have I correctly used the "shift()" function in data.table?
shift is being used correctly, but it's hard to tell what is going on once we get to my_grid$counts = as.integer(rnorm(8,10,5)).
One thing, though. The table should be filtered on !is.na(prev_3_exam) instead of !is.na(prev_exam).
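For completeness, keeping the lags within each student is a small change: compute the shifts by id, and the stricter filter then also makes the later na.omit() unnecessary. A sketch of that variant:
my_data[, c("prev_exam", "prev_2_exam", "prev_3_exam") := shift(results, 1:3), by = id]
out <- my_data[!is.na(prev_3_exam),
               .(tally = .N),
               by = .(id, results, prev_exam, prev_2_exam, prev_3_exam)]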
Here is a function that uses a similar approach to return out and my_grid in a list for the lag specified as a parameter. It uses data.table grouping rather than a for loop.
f1 <- function(dt, lag = 1L) {
  if (!identical(key(dt), c("id", "date_exam_taken"))) setkey(dt, id, date_exam_taken)
  nms <- paste0("exam", 1:lag)
  list(
    out = dt2 <- copy(dt)[
      , (nms) := shift(results, 1:lag) # see Frank's comment
    ][
      id == shift(id, lag)
    ][
      , .(tally = .N), by = c("id", "results", nms)
    ],
    my_grid = setorderv(
      dt2[
        , {
          counts <- sum(tally)
          .(
            counts = counts,
            probability = sum(results * tally) / counts
          )
        }, nms
      ], rev(nms)
    )
  )
}
Output:
f1(dt, 3L)
#> $out
#> id results exam1 exam2 exam3 tally
#> 1: 1 0 0 1 1 1
#> 2: 1 1 0 0 1 1
#> 3: 1 0 1 0 0 1
#> 4: 1 1 0 1 0 2
#> 5: 1 1 1 0 1 2
#> ---
#> 57437: 10000 1 0 0 0 1
#> 57438: 10000 0 1 0 0 1
#> 57439: 10000 1 0 1 0 1
#> 57440: 10000 0 1 0 1 1
#> 57441: 10000 0 0 1 0 1
#>
#> $my_grid
#> exam1 exam2 exam3 counts probability
#> 1: 0 0 0 8836 0.4980761
#> 2: 1 0 0 8832 0.5005661
#> 3: 0 1 0 8684 0.4947029
#> 4: 1 1 0 8770 0.4976055
#> 5: 0 0 1 8792 0.5013649
#> 6: 1 0 1 8631 0.5070096
#> 7: 0 1 1 8806 0.5021576
#> 8: 1 1 1 8682 0.4997696
If only my_grid is needed, here is a function wrapping an Rcpp function that uses bit shifting to perform the aggregation in a single-pass for loop without creating the helper columns with shift. It will be very fast, and its speed will be only marginally affected by the value of lag.
Rcpp::cppFunction("
IntegerVector exam_contingency(const IntegerVector& id, const IntegerVector& result, const int& lag) {
  const int n = id.size();
  const int lag1 = lag + 1;
  int comb = result(0);
  int mask = ~(1 << lag1);
  IntegerVector out(pow(2, lag1));
  // build the first window of lag + 1 results as a binary number
  for (int i = 1; i < lag1; i++) comb = (comb << 1) + result(i);
  out(comb) = id(lag) == id(0);
  for (int i = lag1; i < n; i++) {
    // slide the window: shift in the newest result, mask off the oldest bit
    comb = ((comb << 1) + result(i)) & mask;
    // count the window only if it lies entirely within a single id
    out(comb) += id(i - lag) == id(i);
  }
  return(out);
}
")
f2 <- function(dt, lag = 1L) {
  if (!identical(key(dt), c("id", "date_exam_taken"))) setkey(dt, id, date_exam_taken)
  m <- matrix(
    exam_contingency(dt$id, dt$results, as.integer(lag)),
    2^lag, 2, 1 # byrow = TRUE: column 2 holds the counts where the current exam was passed
  )
  rs <- rowSums(m)
  cbind(
    if (lag == 1L) {
      data.frame(exam1 = 0:1)
    } else {
      setNames(
        expand.grid(rep(list(0:1), lag)),
        paste0("exam", 1:lag)
      )
    },
    data.frame(counts = rs, probability = m[, 2] / rs)
  )
}
It gives the same output as f1's my_grid:
all.equal(f1(dt, 3L)$my_grid, setDT(f2(dt, 3L)))
#> [1] TRUE
Benchmarking:
#> Unit: microseconds
#>        expr    min      lq      mean   median       uq     max neval
#>   f1(dt, 3) 8802.2 9706.70 11478.458 10394.00 12134.40 69630.0   100
#>   f2(dt, 3)  971.2 1016.40  1179.404  1047.20  1108.65  7733.8   100
#>  f2(dt, 10) 1181.3 1208.05  1256.333  1237.65  1302.40  1406.6   100
Data:
library(data.table)
(seed <- sample(.Machine$integer.max, 1))
#> [1] 1784920766
set.seed(seed)
dt <- data.table(
id = sample.int(10000, 100000, replace = TRUE),
results = sample(0:1, 100000, replace = TRUE),
date_exam_taken = sample(seq(as.Date('1999/01/01'), as.Date('2020/01/01'), by="day"), 100000, replace = TRUE)
)
setkey(dt, id, date_exam_taken)

Creating new columns with combinations of string patterns in R

I have a data frame in which one column holds a lengthy string of events separated by "_". I want to count the occurrences of events A and B, and of several possible combinations of them, within that string; in the use case below, anything other than A and B should be ignored.
If A and B occur together as A_B or B_A, alone or repeated n times, I want to count those occurrences, as well as any longer combinations built from them.
Example data frame:
participant <- c("A", "B", "C")
trial <- c(1,1,2)
string_pattern <- c("A_B_A_C_A_B", "B_A_B_A_C_D_A_B", "A_B_C_A_B")
df <- data.frame(participant, trial, string_pattern)
Expected output:
participant trial string_pattern A_B B_A A_B_A B_A_B B_A_B_A
1. A 1 A_B_A_C_A_B 2 1 1 0 0
2. B 1 B_A_B_A_C_D_A_B 2 2 1 1 1
3. C 2 A_B_C_A_B 2 0 0 0 0
My code:
revised_df <- df %>%
  dplyr::mutate(A_B = stringr::str_count(string_pattern, "A_B"),
                B_A = stringr::str_count(string_pattern, "B_A"),
                B_A_B = stringr::str_count(string_pattern, "B_A_B"))
My approach gets complicated as the number of combinations increases, so I am looking for a better solution.
You could write a function to solve this:
m <- function(s){
  # all contiguous substrings (length >= 2) of s, where s has had its
  # underscores stripped, re-expressed with "_" separators
  a <- seq(nchar(s) - 1)
  start <- rep(a, rev(a))
  stop <- ave(start, start, FUN = \(x) seq_along(x) + x)
  b <- substring(s, start, stop)
  gsub('(?<=\\B)|(?=\\B)', '_', b, perl = TRUE)
}
n <- function(x){
  names(x) <- x
  # split each string on runs of non-A/B events, dropping the underscores
  a <- strsplit(gsub("_", '', gsub("_[^AB]+_", ':', x)), ':')
  # tabulate every substring of every run, per input string
  b <- t(table(stack(lapply(a, \(y) unlist(sapply(y, m))))))
  data.frame(pattern = x, as.data.frame.matrix(b), row.names = NULL)
}
n(string_pattern)
pattern A_B A_B_A B_A B_A_B B_A_B_A
1 A_B_A_C_A_B 2 1 1 0 0
2 B_A_B_A_C_D_A_B 2 1 2 1 1
3 A_B_C_A_B 2 0 0 0 0
Try this: it checks each row of string_pattern for the current column name.
library(dplyr)
library(stringr)
df |>
  mutate(A_B = 0, B_A = 0, A_B_A = 0, B_A_B = 0, B_A_B_A = 0) |>
  mutate(across(A_B:B_A_B_A, ~ str_count(string_pattern, cur_column())))
participant trial string_pattern A_B B_A A_B_A B_A_B B_A_B_A
1 A 1 A_B_A_C_A_B 2 1 1 0 0
2 B 1 B_A_B_A_C_D_A_B 2 2 1 1 1
3 C 2 A_B_C_A_B 2 0 0 0 0
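One caveat with this approach: str_count() counts non-overlapping matches, which happens to be sufficient for the sample data here but can undercount self-overlapping patterns; wrapping the pattern in a zero-width lookahead counts overlaps too:
library(stringr)
str_count("A_B_A_B_A", "A_B_A")     # 1: plain matches cannot overlap
str_count("A_B_A_B_A", "(?=A_B_A)") # 2: lookahead also counts the overlapping hit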

Replicate rows based on 2 "times"-columns and set values to 1

I have an R data.frame like this:
Value EventX EventY
1 2 0
2 1 1
3 1 2
and I want to convert it to:
Value EventX EventY
1 1 0
1 1 0
2 1 0
2 0 1
3 1 0
3 0 1
3 0 1
I used rep() to replicate each row based on the sum EventX + EventY, but that replicates the original count values too.
I could just set them all to 1, but when a row has values > 0 for both EventX and EventY, I want them split as shown in the example, so simply setting everything to 1 is wrong.
Here is my minimal code:
z <- data.frame(df[rep(row.names(df), df$EventX + df$EventY), 1:3, drop = FALSE], row.names=NULL)
One way using apply() is to repeat the 1/0 values for each row (apply() coerces the data frame to a matrix, which is fine here since all columns are numeric).
do.call(rbind, apply(df, 1, function(x) cbind(Value = x[1],
rbind(data.frame(EventX = rep(1, x[2]), EventY = rep(0, x[2])),
data.frame(EventX = rep(0, x[3]), EventY = rep(1, x[3]))))))
# Value EventX EventY
#1 1 1 0
#2 1 1 0
#3 2 1 0
#4 2 0 1
#5 3 1 0
#6 3 0 1
#7 3 0 1
Or a similar tidyverse approach could be
library(tidyverse)
df %>%
group_split(row_number()) %>%
map_dfr(~cbind(Value = .$Value[1],
bind_rows(tibble(EventX = rep(1, .$EventX), EventY = rep(0, .$EventX)),
tibble(EventX = rep(0, .$EventY), EventY = rep(1, .$EventY)))))
Here's another base R solution:
z <- data.frame(Value = 1:3, EventX = c(2,1,1), EventY = c(0,1,2))
z2 <- rbind(data.frame(Value =rep(z$Value, z$EventX), EventX = rep(1, sum(z$EventX)), EventY = 0)
,data.frame(Value = rep(z$Value, z$EventY), EventX = 0, EventY = rep(1, sum(z$EventY))))
z2[order(z2$Value), ]
Also, since the EventX and EventY variables don't appear to be dependent on each other, here is a data.table solution that may help you approach the data.
data.table::rbindlist(lapply(z[, -1]
, function(x) data.frame(Value = rep(z$Value, x), Counts = rep(1, sum(x)))
)
, idcol = TRUE)
.id Value Counts
1: EventX 1 1
2: EventX 1 1
3: EventX 2 1
4: EventX 3 1
5: EventY 2 1
6: EventY 3 1
7: EventY 3 1
And here's a dplyr and tidyr route. I could have set EventY = 0 and EventX = 0 directly inside the bind_rows() call instead of using replace_na(), but I didn't.
library(dplyr)
library(tidyr)
bind_rows(tibble(Value = rep(z$Value, z$EventX), EventX = rep(1, sum(z$EventX)))
,tibble(Value = rep(z$Value, z$EventY), EventY = rep(1, sum(z$EventY))))%>%
replace_na(list(EventY = 0, EventX = 0))
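If tidyr is already loaded, uncount() offers another compact route (a sketch, assuming tidyr >= 1.0 for pivot_longer()): pivot the two event columns long, repeat each row by its count, then rebuild the 1/0 indicators.
library(dplyr)
library(tidyr)
z %>%
  pivot_longer(c(EventX, EventY), names_to = "event", values_to = "n") %>%
  uncount(n) %>%                        # rows with n = 0 are dropped
  mutate(EventX = as.integer(event == "EventX"),
         EventY = as.integer(event == "EventY")) %>%
  select(Value, EventX, EventY) %>%
  arrange(Value, desc(EventX))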

Efficiently repeat data.table in a list, sequentially replacing columns with the same names from another data.table in a loop

I have two data.tables:
x <- data.table(a = c(1, 2, 3, 4, 1), b = c(2, 3, 4, 1, 2), c = c(3, 4, 1, 2, 3))
y <- data.table(a = c(1, 0, 0, 0, 1), b = c(0, 1, 0, 0, 0), c = c(0, 0, 0, 0, 1))
What I am trying to achieve is a list of copies of y, one per column, where in the i-th element the i-th column is replaced by the values of the same column in x. The desired result looks like this:
[[1]]
a b c
1: 1 0 0
2: 2 1 0
3: 3 0 0
4: 4 0 0
5: 1 0 1
[[2]]
a b c
1: 1 2 0
2: 0 3 0
3: 0 4 0
4: 0 1 0
5: 1 2 1
[[3]]
a b c
1: 1 0 3
2: 0 1 4
3: 0 0 1
4: 0 0 2
5: 1 0 3
What I tried:
z <- lapply(names(x), function(i) {
x[ , i, with = FALSE]
})
w <- rep(list(y), ncol(y))
myfun <- function(obj1, obj2) {
cbind(obj1, obj2)
}
u <- Map(myfun, obj1 = z, obj2 = w)
u <- lapply(u, function(i) {
setcolorder(i[ , unique(names(i)), with = FALSE], names(x))
})
It gives me the desired result, but it is clumsy and requires too many steps, so it is probably inefficient with larger data.tables. I would like to do it more in the data.table way. I tried something which I assumed would work:
lapply(names(x), function(i) {
y[ , (i) := x[ , i, with = FALSE]]
})
However, it returns the first list component empty and copies all the values of x into the next list components.
Can someone help?
Here, we need a copy of 'y' while creating the list 'w', instead of:
w <- rep(list(y), ncol(y))
It is tempting to go for the rep() expression below. However, it has an issue: the elements of 'w' all point to the same location in memory.
w <- rep(list(copy(y)), ncol(y))
Assignment by reference (:=) then changes the column values in every list element, because they all reference the same object. In the first case (no copy), the assignment changes 'y' too, along with the 'w' list elements; in the second case it changes only 'w' and leaves 'y' alone, because we copied. To understand the behavior, do a set() assignment in a for loop:
for(j in seq_along(x)) {
  print(w[[j]][[j]])
  set(w[[j]], i = NULL, j = j, x[[j]])
  print("----")
  print(w[[j]])
}
In order to avoid that, use replicate():
w <- replicate(ncol(y), copy(y), simplify = FALSE)
and then run the for loop again (after recreating the objects, as the values were replaced by the previous run):
for(j in seq_along(x)) {
  print(w[[j]][[j]])
  set(w[[j]], i = NULL, j = j, x[[j]])
  print("----")
  print(w[[j]])
}
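A quick way to see the difference is data.table::address(): every element of the rep()-built list reports the same memory address, while the replicate()-built list has three distinct ones.
sapply(rep(list(copy(y)), ncol(y)), address)                   # one address, repeated
sapply(replicate(ncol(y), copy(y), simplify = FALSE), address) # all distinct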
Or a Map-based assignment:
Map(function(u, v) u[, (v) := x[[v]]][], w, names(x))
#[[1]]
# a b c
#1: 1 0 0
#2: 2 1 0
#3: 3 0 0
#4: 4 0 0
#5: 1 0 1
#[[2]]
# a b c
#1: 1 2 0
#2: 0 3 0
#3: 0 4 0
#4: 0 1 0
#5: 1 2 1
#[[3]]
# a b c
#1: 1 0 3
#2: 0 1 4
#3: 0 0 1
#4: 0 0 2
#5: 1 0 3
Instead of assignment by reference, it can be done with a simple Map from base R, even if we have not copied the 'y' object while creating 'w':
Map(function(u, v) {
  u[[v]] <- x[[v]]
  u
}, w, names(x))

data.table grouping by column names from a variable sets column names as "get"

I need to aggregate a data table by columns whose names are given as strings in variables. I'm using get to achieve this, but the columns in the resulting table are named "get" instead of keeping the original names. How can I avoid this?
dt = data.table(id = rep(LETTERS[1:4], 1, each = 3),
grp = round(runif(12)),
val = runif(12))
col.names = names(dt)
dt[, .(meanByIDByGrp = mean(val)), by = .(get(col.names[1]), get(col.names[2]))]
get get meanByIDByGrp
1: A 1 0.5628882
2: A 0 0.6021001
3: B 1 0.4013824
4: B 0 0.0551370
5: C 1 0.6031302
6: C 0 0.7107527
7: D 1 0.2778507
Pass the column names directly to by as a character vector; the original names are then preserved:
dt[, .(meanByIDByGrp = mean(val)), by = col.names[1:2]]
# id grp meanByIDByGrp
# 1: A 1 0.1638516
# 2: A 0 0.5859206
# 3: B 1 0.4907845
# 4: B 0 0.3665976
# 5: C 1 0.6644277
# 6: D 0 0.5028973
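If the grouped result should also come back sorted and keyed by those columns, keyby accepts the same character vector:
dt[, .(meanByIDByGrp = mean(val)), keyby = col.names[1:2]]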
