I have an R data.frame like this:
Value EventX EventY
    1      2      0
    2      1      1
    3      1      2
and I want to convert it to:
Value EventX EventY
    1      1      0
    1      1      0
    2      1      0
    2      0      1
    3      1      0
    3      0      1
    3      0      1
So far I have used rep() to replicate each row based on the sum of EventX + EventY, but that also carries the original EventX/EventY counts into the replicated rows.
I could just set them all to 1, but when a row has values > 0 for both EventX and EventY, I want them split as shown in the example, so simply setting them to 1 is wrong.
Here is my minimal code:
z <- data.frame(df[rep(row.names(df), df$EventX + df$EventY), 1:3, drop = FALSE], row.names=NULL)
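For reference, the example input can be rebuilt as below, and the rep() call above then produces the following (the original EventX/EventY counts are simply carried along with each replicated row):
df <- data.frame(Value = 1:3, EventX = c(2, 1, 1), EventY = c(0, 1, 2))
z
#   Value EventX EventY
# 1     1      2      0
# 2     1      2      0
# 3     2      1      1
# 4     2      1      1
# 5     3      1      2
# 6     3      1      2
# 7     3      1      2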
One way using apply is to repeat the 1/0 values for each row:
do.call(rbind, apply(df, 1, function(x) cbind(Value = x[1],
rbind(data.frame(EventX = rep(1, x[2]), EventY = rep(0, x[2])),
data.frame(EventX = rep(0, x[3]), EventY = rep(1, x[3]))))))
#  Value EventX EventY
#1     1      1      0
#2     1      1      0
#3     2      1      0
#4     2      0      1
#5     3      1      0
#6     3      0      1
#7     3      0      1
Or a similar tidyverse approach could be
library(tidyverse)
df %>%
group_split(row_number()) %>%
map_dfr(~cbind(Value = .$Value[1],
bind_rows(tibble(EventX = rep(1, .$EventX), EventY = rep(0, .$EventX)),
tibble(EventX = rep(0, .$EventY), EventY = rep(1, .$EventY)))))
Here's another base R solution:
z <- data.frame(Value = 1:3, EventX = c(2,1,1), EventY = c(0,1,2))
z2 <- rbind(data.frame(Value = rep(z$Value, z$EventX), EventX = rep(1, sum(z$EventX)), EventY = 0)
            , data.frame(Value = rep(z$Value, z$EventY), EventX = 0, EventY = rep(1, sum(z$EventY))))
z2[order(z2$Value), ]
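For reference, ordering should then reproduce the requested output (the row names are left over from the rbind step):
#   Value EventX EventY
# 1     1      1      0
# 2     1      1      0
# 3     2      1      0
# 5     2      0      1
# 4     3      1      0
# 6     3      0      1
# 7     3      0      1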
Also, since the EventX and EventY variables don't appear to be dependent on each other, here is a data.table solution that may help you approach the data.
data.table::rbindlist(lapply(z[, -1]
, function(x) data.frame(Value = rep(z$Value, x), Counts = rep(1, sum(x)))
)
, idcol = TRUE)
      .id Value Counts
1: EventX     1      1
2: EventX     1      1
3: EventX     2      1
4: EventX     3      1
5: EventY     2      1
6: EventY     3      1
7: EventY     3      1
And here's a dplyr and tidyr route. I could have also set EventX = 0 and EventY = 0 directly inside the bind_rows() call, but instead I fill in the missing values afterwards with replace_na().
library(dplyr)
library(tidyr)
bind_rows(tibble(Value = rep(z$Value, z$EventX), EventX = rep(1, sum(z$EventX)))
,tibble(Value = rep(z$Value, z$EventY), EventY = rep(1, sum(z$EventY))))%>%
replace_na(list(EventY = 0, EventX = 0))
I have this dataset in which students take an exam multiple times over a period of years - a "fail" is a 0 and a "pass" is a 1. The data looks something like this:
# Load the data.table package
library(data.table)
# Generate some sample data
id = sample.int(10000, 100000, replace = TRUE)
res = c(1,0)
results = sample(res, 100000, replace = TRUE)
date_exam_taken = sample(seq(as.Date('1999/01/01'), as.Date('2020/01/01'), by="day"), 100000, replace = TRUE)
# Create a data table from the sample data
my_data = data.table(id, results, date_exam_taken)
my_data <- my_data[order(id, date_exam_taken)]
# Generate some additional columns for each record
my_data$general_id = 1:nrow(my_data)
my_data$exam_number = ave(my_data$general_id, my_data$id, FUN = seq_along)
my_data$general_id = NULL
   id results date_exam_taken exam_number
1:  1       0      2002-10-06           1
2:  1       1      2003-07-21           2
3:  1       1      2003-10-15           3
4:  1       0      2005-07-21           4
5:  1       1      2014-08-22           5
6:  1       1      2015-09-11           6
I want to track the number of times each student failed an exam, given that they failed the two previous exams (and all such combinations). I tried to do this with the data.table library in R:
# Create new columns that contain the previous exam results
my_data$prev_exam = shift(my_data$results, 1)
my_data$prev_2_exam = shift(my_data$results, 2)
my_data$prev_3_exam = shift(my_data$results, 3)
# Count the number of exam results for each record
out <- my_data[!is.na(prev_exam), .(tally = .N), by = .(id, results, prev_exam, prev_2_exam, prev_3_exam)]
out = na.omit(out)
> head(out)
   id results prev_exam prev_2_exam prev_3_exam tally
1:  1       1         1           1           1     1
2:  1       0         1           1           1     3
3:  1       0         0           1           1     2
4:  1       1         0           0           1     1
5:  1       0         1           0           0     1
6:  1       1         0           1           0     1
Can someone please tell me if I have done this correctly? Have I correctly used the "shift()" function in data.table?
shift is being used correctly, but it's hard to tell what is going on once we get to my_grid$counts = as.integer(rnorm(8,10,5)).
One thing, though. The table should be filtered on !is.na(prev_3_exam) instead of !is.na(prev_exam).
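A minimal sketch of that correction, applied to the code from the question (assuming the prev_* columns created above):
out <- my_data[!is.na(prev_3_exam),
               .(tally = .N),
               by = .(id, results, prev_exam, prev_2_exam, prev_3_exam)]
Note that the function below additionally drops rows where the lag crosses an id boundary (the id == shift(id, lag) filter), which a plain shift over the whole table does not do.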
Here is a function that uses a similar approach to return out and my_grid in a list for the lag specified as a parameter. It uses data.table grouping rather than a for loop.
f1 <- function(dt, lag = 1L) {
if (!identical(key(dt), c("id", "date_exam_taken"))) setkey(dt, id, date_exam_taken)
nms <- paste0("exam", 1:lag)
list(
out = dt2 <- copy(dt)[
,(nms) := shift(results, 1:lag) # see Frank's comment
][
id == shift(id, lag)
][
, .(tally = .N), by = c("id", "results", nms)
],
my_grid = setorderv(
dt2[
, {
counts <- sum(tally)
.(
counts = counts,
probability = sum(results*tally)/counts
)
}, nms
], rev(nms)
)
)
}
Output:
f1(dt, 3L)
#> $out
#>           id results exam1 exam2 exam3 tally
#>     1:     1       0     0     1     1     1
#>     2:     1       1     0     0     1     1
#>     3:     1       0     1     0     0     1
#>     4:     1       1     0     1     0     2
#>     5:     1       1     1     0     1     2
#>    ---
#> 57437: 10000       1     0     0     0     1
#> 57438: 10000       0     1     0     0     1
#> 57439: 10000       1     0     1     0     1
#> 57440: 10000       0     1     0     1     1
#> 57441: 10000       0     0     1     0     1
#> 
#> $my_grid
#>    exam1 exam2 exam3 counts probability
#> 1:     0     0     0   8836   0.4980761
#> 2:     1     0     0   8832   0.5005661
#> 3:     0     1     0   8684   0.4947029
#> 4:     1     1     0   8770   0.4976055
#> 5:     0     0     1   8792   0.5013649
#> 6:     1     0     1   8631   0.5070096
#> 7:     0     1     1   8806   0.5021576
#> 8:     1     1     1   8682   0.4997696
If only my_grid is needed, here is a function wrapping an Rcpp function that uses bit shifting to perform the aggregation in a single-pass for loop without creating the helper columns with shift. It will be very fast, and its speed will be only marginally affected by the value of lag.
Rcpp::cppFunction("
IntegerVector exam_contingency(const IntegerVector& id, const IntegerVector& result, const int& lag) {
const int n = id.size();
const int lag1 = lag + 1;
int comb = result(0);
int mask = ~(1 << lag1);
IntegerVector out(pow(2, lag1));
for (int i = 1; i < lag1; i++) comb = (comb << 1) + result(i);
out(comb) = id(lag) == id(0);
for (int i = lag1; i < n; i++) {
comb = ((comb << 1) + result(i)) & mask;
out(comb) += id(i - lag) == id(i);
}
return(out);
}
")
f2 <- function(dt, lag = 1L) {
if (!identical(key(dt), c("id", "date_exam_taken"))) setkey(dt, id, date_exam_taken)
m <- matrix(
exam_contingency(dt$id, dt$results, as.integer(lag)),
2^lag, 2, 1
)
rs <- rowSums(m)
cbind(
if (lag == 1L) {
data.frame(exam1 = 0:1)
} else {
setNames(
expand.grid(rep(list(0:1), lag)),
paste0("exam", 1:lag)
)
},
data.frame(counts = rs, probability = m[,2]/rs)
)
}
It gives the same output as f1's my_grid:
all.equal(f1(dt, 3L)$my_grid, setDT(f2(dt, 3L)))
#> [1] TRUE
Benchmarking:
#> Unit: microseconds
#>        expr    min      lq      mean   median       uq     max neval
#>   f1(dt, 3) 8802.2 9706.70 11478.458 10394.00 12134.40 69630.0   100
#>   f2(dt, 3)  971.2 1016.40  1179.404  1047.20  1108.65  7733.8   100
#>  f2(dt, 10) 1181.3 1208.05  1256.333  1237.65  1302.40  1406.6   100
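For reference, timings like these can be produced with something along the following lines (a sketch assuming the microbenchmark package; the exact call is not shown here):
library(microbenchmark)
microbenchmark(f1(dt, 3), f2(dt, 3), f2(dt, 10), times = 100)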
Data:
library(data.table)
(seed <- sample(.Machine$integer.max, 1))
#> [1] 1784920766
set.seed(seed)
dt <- data.table(
id = sample.int(10000, 100000, replace = TRUE),
results = sample(0:1, 100000, replace = TRUE),
date_exam_taken = sample(seq(as.Date('1999/01/01'), as.Date('2020/01/01'), by="day"), 100000, replace = TRUE)
)
setkey(dt, id, date_exam_taken)
I have got a data set that looks like this:
COMPANY DATABREACH CYBERBACKGROUND
      A          1               2
      B          0               2
      C          0               1
      D          0               2
      E          1               1
      F          1               2
      G          0               2
      H          0               2
      I          0               2
      J          0               2
Now I want to do the following: for 40% of the cases where the column DATABREACH has the value 1, I want CYBERBACKGROUND to take the value 2. I figure there must be some function to do this, but I cannot find it.
ind <- which(df$DATABREACH == 1)
ind <- ind[rbinom(length(ind), 1, prob = 0.4) > 0]
df$CYBERBACKGROUND[ind] <- 2
The above is a bit more efficient in that it only draws random numbers for the rows that strictly need them. If you aren't concerned about that (11000 doesn't seem too high), you can reduce it to
df$CYBERBACKGROUND <-
ifelse(df$DATABREACH == 1 & rbinom(nrow(df), 1, prob = 0.4) > 0,
2, df$CYBERBACKGROUND)
We may use
library(dplyr)
df1 <- df1 %>%
mutate(CYBERBACKGROUND = replace(CYBERBACKGROUND,
sample(which(DATABREACH == 0), sum(ceiling(sum(DATABREACH) * 0.4))), 2))
x <- c(0,1,1,0,1,1,1,0,1,1)
aaa <- data.frame(x)
How can I insert a blank row before each zero? When the first row is zero, do not add a blank row. Thank you.
Result:
0
1
1
.
0
1
1
1
.
0
1
1
Below we use a dot, but you can replace "." with NA or "" or something else, depending on what you want.
1) We can use Reduce and append:
Append <- function(x, y) append(x, ".", y - 1)
data.frame(x = Reduce(Append, setdiff(rev(which(aaa$x == 0)), 1), init = aaa$x))
2) gsub: Another possibility is to convert to a character string, use gsub, and convert back:
data.frame(x = strsplit(gsub("(.)0", "\\1.0", paste(aaa$x, collapse = "")), "")[[1]])
3) We can create a two-row matrix in which the first row has a dot before each 0 and NA otherwise. Then unravel it to a vector and use na.omit to remove the NA values.
data.frame(x = na.omit(c(rbind(replace(ifelse(aaa$x == 0, ".", NA), 1, NA), aaa$x))))
4) We can lapply over aaa$x[-1], outputting c(".", 0) or 1. Unlist that and insert aaa$x[1] back in. No packages are used.
repl <- function(x) if (!x) c(".", 0) else 1
data.frame(x = c(aaa$x[1], unlist(lapply(aaa$x[-1], repl))))
5) Create a list of all but the first element and replace the 0's in that list with c(".", 0). Unlist that and insert the first element back in. No packages are used.
L <- as.list(aaa$x[-1])
L[x[-1] == 0] <- list(c(".", 0))
data.frame(x = c(aaa$x[1], unlist(L)))
6) Assuming aaa has two columns where the second column is character (NOT factor). Append a row of dots to aaa and then create an index vector using unlist and Map to access the appropriate row of the extended aaa.
aaa <- data.frame(x = c(0,1,1,0,1,1,1,0,1,1), y = letters[1:10],
stringsAsFactors = FALSE)
nr <- nrow(aaa); nc <- ncol(aaa)
fun <- function(ix, x) if (!is.na(x) & x == 0 & ix > 1) c(nr + 1, ix) else ix
rbind(aaa, rep(".", nc))[unlist(Map(fun, 1:nr, aaa$x)), ]
If we did want y to be a factor, note that we can't just add a dot to a factor if it is not one of its levels, so there is the question of which levels the factor may have. To get around that, let us add an NA rather than a dot to the factor. We then get the following, which is the same except that aaa has been redefined so that y is a factor, we no longer need nc since we are assuming 2 columns, and rep(...) in the last line is replaced with c(".", NA).
aaa <- data.frame(x = c(0,1,1,0,1,1,1,0,1,1), y = letters[1:10])
nr <- nrow(aaa)
fun <- function(ix, x) if (!is.na(x) & x == 0 & ix > 1) c(nr + 1, ix) else ix
rbind(aaa, c(".", NA))[unlist(Map(fun, 1:nr, aaa$x)), ]
One dplyr and tidyr possibility may be:
aaa %>%
uncount(ifelse(row_number() > 1 & x == 0, 2, 1)) %>%
mutate(x = ifelse(x == 0 & lag(x == 1, default = first(x)), NA_integer_, x))
    x
1   0
2   1
3   1
4  NA
5   0
6   1
7   1
8   1
9  NA
10  0
11  1
12  1
This does not add a truly blank row, because x is a numeric vector; instead it adds a row with NA. If you need a blank row, you can convert x to a character vector and then replace the NA with an empty string.
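A minimal sketch of that conversion (res is an illustrative name for the result shown above, not part of the original code):
library(dplyr)
library(tidyr)
res %>%
  mutate(x = replace_na(as.character(x), ""))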
ind = with(aaa, ifelse(x == 0 & seq_along(x) > 1, 2, 1))
d = aaa[rep(1:NROW(aaa), ind), , drop = FALSE]
transform(d, x = replace(x, sequence(ind) == 2, NA))
Here is an option with rleid
library(data.table)
setDT(aaa)[, .(x = if(x[.N] == 1) c(x, NA) else x), rleid(x)][-.N, .(x)]
#     x
# 1:  0
# 2:  1
# 3:  1
# 4: NA
# 5:  0
# 6:  1
# 7:  1
# 8:  1
# 9: NA
#10:  0
#11:  1
#12:  1
data.frame(x = unname(unlist(by(aaa$x, cumsum(aaa == 0), c, '.'))))
    x
1   0
2   1
3   1
4   .
5   0
6   1
7   1
8   1
9   .
10  0
11  1
12  1
13  .
My solution is
aaa <- data.frame(x = c(0,1,1,0,1,1,1,0,1,1), y = letters[1:10])
aaa$ind <- with(aaa, ifelse(x == 0 & seq_along(x) > 1, 2, 1))
aaa <- aaa[rep(1:nrow(aaa), aaa$ind), , ]
aaa[(aaa$ind == 2 & !grepl(".1", rownames(aaa))), ] <- NA
aaa$ind <- NULL
aaa
     x    y
1    0    a
2    1    b
3    1    c
4   NA <NA>
4.1  0    d
5    1    e
6    1    f
7    1    g
8   NA <NA>
8.1  0    h
9    1    i
10   1    j
I have a table in the long format:
require(data.table)
sampleDT <- data.table(Old = c("A","B","A","B","A","B","A","B")
, New = c("A","A","B","B","A","A","B","B")
, Time = c(1,1,1,1,2,2,2,2)
, value1 = c(1,1,1,1,1,1,1,1)
, value2 = c(0,0,0,0,0,0,0,0))
print(sampleDT)
   Old New Time value1 value2
1:   A   A    1      1      0
2:   B   A    1      1      0
3:   A   B    1      1      0
4:   B   B    1      1      0
5:   A   A    2      1      0
6:   B   A    2      1      0
7:   A   B    2      1      0
8:   B   B    2      1      0
I would like to convert it to a 3-dimensional array. Basically, the columns "New", "Old", and "Time" would be the three dimensions, and the value in each cell would be the output of some function whose inputs are "value1" and "value2".
In this case, when Time = 1, the result is:
matrix(data = c(1, 1+0, 0, -0), nrow = 2, ncol = 2, byrow = FALSE)
     [,1] [,2]
[1,]    1    0
[2,]    1    0
How can I achieve this?
Memory usage and computing time are important considerations, as we're working with relatively big datasets.
Try xtabs():
sampleDT <- data.frame(Old = c("A","B","A","B","A","B","A","B"),
New = c("A","A","B","B","A","A","B","B"),
Time = c(1,1,1,1,2,2,2,2),
value1 = c(1,1,1,1,1,1,1,1),
value2 = c(0,0,0,0,0,0,0,0))
Value1 <- xtabs(value1 ~ Old + New + Time, sampleDT, drop = FALSE)
Value2 <- xtabs(value2 ~ Old + New + Time, sampleDT, drop = FALSE)
is.array(Value1)
is.array(Value2)
Value1[, 2,] <- 0 # Sets all second columns to zero for Value1
Value2[1,,] <- 0 # Idem with first row for Value2
Value2[2,2,] <- Value2[2,2,] * (-1)
Result <- Value1 + Value2
Result
, , Time = 1

   New
Old A B
  A 1 0
  B 1 0

, , Time = 2

   New
Old A B
  A 1 0
  B 1 0
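As a quick check that Result really is the 3-dimensional array asked for (a sketch of the expected output):
is.array(Result)
[1] TRUE
dim(Result)
[1] 2 2 2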
Hope it helps.
I want to create a vector of counts, if possible.
For example, I have this vector:
x <- c(3, 0, 2, 0, 0)
How can I create a frequency vector for all integers between 0 and 3? Ideally I want to get a vector like this:
> 3 0 1 1
which gives me the counts of 0, 1, 2, and 3 respectively.
Much appreciated!
You can do
table(factor(x, levels=0:3))
Simply using table(x) is not enough, because values that never occur (here 1) are dropped from the result.
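To see the difference with the example vector (a quick sketch of the expected output):
table(x)
# x
# 0 2 3
# 3 1 1
table(factor(x, levels = 0:3))
#
# 0 1 2 3
# 3 0 1 1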
Or with tabulate, which is faster:
tabulate(factor(x, levels = min(x):max(x)))
You can do this using rle (I made this in minutes, so sorry if it's not optimized enough).
x = c(3, 0, 2, 0, 0)
r = rle(x)
f = function(x) sum(r$lengths[r$values == x])
s = sapply(FUN = f, X = as.list(0:3))
data.frame(x = 0:3, freq = s)
#> data.frame(x = 0:3, freq = s)
#  x freq
#1 0    3
#2 1    0
#3 2    1
#4 3    1
You can just use table():
a <- table(x)
a
# x
# 0 2 3
# 3 1 1
Then you can subset it:
a[names(a)==0]
#0
#3
Or convert it into a data.frame if you're more comfortable working with that:
u <- as.data.frame(table(x))
u
#  x Freq
#1 0    3
#2 2    1
#3 3    1
Edit 1:
For levels:
a <- as.data.frame(table(factor(x, levels = 0:3)))
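With the example x, this yields a data frame that keeps the zero count for 1 (a sketch of the expected output):
a
#  Var1 Freq
#1    0    3
#2    1    0
#3    2    1
#4    3    1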