R- random sample of groups in a data.table - r

How can I randomly sample e.g. three groups within a data.table so that the result contains three groups with all rows from the original data.table?
library(data.table)
dat <- data.table(ids=1:20, groups=sample(x=c("A","B","C", "D", "E", "F"), 20,
replace=TRUE))
I know how to select 10 rows randomly from a data.table:
dat.sampl1 <- as.data.table(sapply(dat[], sample, 10))
And also how to sample by group
dat[,.SD[sample(.N, min(.N,3))], by = groups]
But how to randomly sample groups? So the result should look like:
ids groups
1 F
11 F
3 F
18 F
8 A
9 A
10 A
17 A
19 A
12 E
14 E
16 E

Do you mean something like:
set.seed(123)
dat <- data.table(ids=1:20, groups=sample(x=c("A","B","C", "D", "E", "F"), 20,
replace=TRUE))
dat[groups %in% sample(unique(dat[, groups]), size = 3)][order(groups)]
# ids groups
# 1: 3 C
# 2: 10 C
# 3: 12 C
# 4: 7 D
# 5: 9 D
# 6: 14 D
# 7: 4 F
# 8: 5 F
# 9: 8 F
# 10: 11 F
# 11: 16 F
# 12: 20 F
If you want to sample groups with replacement, you can do the following, where A has been sampled twice:
dat[unique(dat[, list(groups)])[sample(.N, 3, replace = TRUE)], on = "groups"]
# ids groups
# 1: 3 C
# 2: 10 C
# 3: 12 C
# 4: 6 A
# 5: 15 A
# 6: 18 A
# 7: 6 A
# 8: 15 A
# 9: 18 A

This code works, using a single line of base R code using %in% to check an index which is generated using the sample function:
df1[df1[,'groups'] %in% sample(unique(df1[,'groups']), size = 3, replace = F), ]
For example:
> df1 <- data.frame("ids" = 1:20, "groups" = sample(LETTERS[1:4], size = 20, replace = T))
> df2 <- df1[df1[,'groups'] %in% sample(unique(df1[,'groups']), size = 3, replace = F), ]
> df2[order(df2[,'groups']),]
ids groups
4 4 B
6 6 B
18 18 B
20 20 B
1 1 C
2 2 C
3 3 C
9 9 C
12 12 C
16 16 C
19 19 C
7 7 D
11 11 D

Related

R, arguments imply differing number of row

I generated a data frame (df) in R (see below). If I use the column "x2" instead of "x2a" to make the data frame everything works well. However, as soon as I use "x2a" instead of "x2" I get an error because the input of "x2a" is of various lengths. Do you have an idea how I can change the code that it is going to work with column "x2a"?
Error message with "x2a":
Error in data.frame(Id = rep(df$Id), Noise = unlist(split_it), Start = rep(df$Start), :
arguments imply differing number of rows: 3, 16
Code to reproduce the data frame and error
x1 <- c("A", "B", "C")
x2 <- c("[1,3,5,6,7]","[5,7,8,9,10]","[3,4,5,8,9]")
x2a <- c("[1,3,5]","[5,7,8,9,10, 20, 30, 24]","[3,4,5,8,9]")
x3 <- c(8000, 74555, 623334)
x4 <- c(9000, 76000, 623500)
df <- data.frame(cbind(x1, x2a, x3, x4))
colnames(df) <- c("Id", "Noise", "Start", "End")
df$Start <- as.numeric(as.character(df$Start))
df$End <- as.numeric(as.character(df$End))
# remove square brackets
df$Noise <- gsub("\\[|\\]", "", df$Noise)
# split
split_it <- strsplit(df$Noise, split = ",")
df_2 <- data.frame(Id = rep(df$Id), Noise = unlist(split_it), Start = rep(df$Start), End = rep(df$End))
df_2 <- df_2[order(df_2$Id),]
rownames(df_2) <- NULL
base R
What I'm inferring you want is not something R can "intuit" for you: you want it to repeat the values in Id based on the number of elements found when strsplit did its work. (How should R know to look in one object and arbitrarily repeat another?)
Try using rep(., times=.) to specify how many times each element of Id (etc) should be repeated in order to stay "in step" with Noise.
# split
split_it <- strsplit(df$Noise, split = ",")
n <- lengths(split_it)
print(n)
# [1] 3 8 5
df_2 <- data.frame(Id = rep(df$Id, times=n),
Noise = unlist(split_it),
Start = rep(df$Start, times=n),
End = rep(df$End, times=n))
df_2 <- df_2[order(df_2$Id),]
rownames(df_2) <- NULL
df_2
# Id Noise Start End
# 1 A 1 8000 9000
# 2 A 3 8000 9000
# 3 A 5 8000 9000
# 4 B 5 74555 76000
# 5 B 7 74555 76000
# 6 B 8 74555 76000
# 7 B 9 74555 76000
# 8 B 10 74555 76000
# 9 B 20 74555 76000
# 10 B 30 74555 76000
# 11 B 24 74555 76000
# 12 C 3 623334 623500
# 13 C 4 623334 623500
# 14 C 5 623334 623500
# 15 C 8 623334 623500
# 16 C 9 623334 623500
dplyr
library(dplyr)
df %>%
mutate(Noise = strsplit(Noise, split = ",")) %>%
unnest(Noise) %>%
mutate(Noise = as.integer(Noise)) # I'm inferring this is desired, not required
# # A tibble: 16 x 4
# Id Noise Start End
# <chr> <int> <dbl> <dbl>
# 1 A 1 8000 9000
# 2 A 3 8000 9000
# 3 A 5 8000 9000
# 4 B 5 74555 76000
# 5 B 7 74555 76000
# 6 B 8 74555 76000
# 7 B 9 74555 76000
# 8 B 10 74555 76000
# 9 B 20 74555 76000
# 10 B 30 74555 76000
# 11 B 24 74555 76000
# 12 C 3 623334 623500
# 13 C 4 623334 623500
# 14 C 5 623334 623500
# 15 C 8 623334 623500
# 16 C 9 623334 623500

Find overlap of multiple ranges in data.table

I would like to find the overlapping part of multiple ranges which are given rowise in a data.table object.
An example would be:
t <- data.table(a = c(3,4,5), b = c(13,12,19))
So we have the ranges:
3 - 13,
4 - 12,
5 - 19
Hence the overlapping range would be:
5 - 12
In case of an additional range 19 - 22 the overlap should return NA - NA or 0 - 0 since there is no overlap.
I found solutions for similar problems like spatstat.utils:: intersect.ranges(). However this works only on two vectors and is hard to implement in a data.table
DT[,.(o.l = function()[1], o.r = function()[2], by=.()]
manner which I would really like to do if possible,..
As output for this example I would like to have:
t <- data.table(a = c(3,4,5), b = c(13,12,19), o.l = c(5,5,5), o.r = c(12,12,12))
Here's a one-line example:
library(data.table)
dt = data.table(a = c(3,4,5), b = c(13,12,19))
dt[, c("o.l", "o.r") := as.list(range(Reduce(intersect, mapply(seq, a, b, 1))))]
dt
# a b o.l o.r
# 1: 3 13 5 12
# 2: 4 12 5 12
# 3: 5 19 5 12
Where the core of the problem is
dt = data.table(a = c(3,4,5), b = c(13,12,19))
dt[, Reduce(intersect, mapply(seq, a, b, 1))]
# [1] 5 6 7 8 9 10 11 12
Borrowing idea from David Aurenburg answer in How to flatten / merge overlapping time periods, here is another possible approach:
DT[, g := c(0L, cumsum(shift(a, -1L) >= cummax(b))[-.N])][,
c("ol", "or") := .(max(a), min(b)), g]
data:
DT <- data.table(a = c(3,4,5,19,20,24), b = c(13,12,19,22,23,25))
output:
a b g ol or
1: 3 13 0 5 12
2: 4 12 0 5 12
3: 5 19 0 5 12
4: 19 22 1 20 22
5: 20 23 1 20 22
6: 24 25 2 24 25

conditional sampling without replacement

I am attempting to write a simulation that involves randomly re-assigning items to categories with some restrictions.
Lets say I have a collection of pebbles 1 to N distributed across buckets A through J:
set.seed(100)
df1 <- data.frame(pebble = 1:100,
bucket = sample(LETTERS[1:10], 100, T),
stringsAsFactors = F)
head(df1)
#> pebble bucket
#> 1 1 D
#> 2 2 C
#> 3 3 F
#> 4 4 A
#> 5 5 E
#> 6 6 E
I want to randomly re-assign pebbles to buckets. Without restrictions I could do it like so:
random.permutation.df1 <- data.frame(pebble = df1$pebble, bucket = sample(df1$bucket))
colSums(table(random.permutation.df1))
#> A B C D E F G H I J
#> 4 7 13 14 12 11 11 10 9 9
colSums(table(df1))
#> A B C D E F G H I J
#> 4 7 13 14 12 11 11 10 9 9
Importantly this re-assigns pebbles while ensuring that each bucket retains the same number (because we are sampling without replacement).
However, I have a set of restrictions such that certain pebbles cannot be assigned to certain buckets. I encode the restrictions in df2:
df2 <- data.frame(pebble = sample(1:100, 10),
bucket = sample(LETTERS[1:10], 10, T),
stringsAsFactors = F)
df2
#> pebble bucket
#> 1 33 I
#> 2 39 I
#> 3 5 A
#> 4 36 C
#> 5 55 J
#> 6 66 A
#> 7 92 J
#> 8 95 H
#> 9 2 C
#> 10 49 I
The logic here is that pebbles 33 and 39 cannot be placed in bucket I, or pebble 5 in bucket A, etc. I would like to permute which pebbles are in which bucket subject to these restrictions.
So far, I've thought of tackling it in a loop as below, but this does not result in buckets retaining the same number of pebbles:
perms <- character(0)
cnt <- 1
for (p in df1$pebble) {
perms[cnt] <- sample(df1$bucket[!df1$bucket %in% df2$bucket[df2$pebble==p]], 1)
cnt <- cnt + 1
}
table(perms)
#> perms
#> A B C D E F G H I J
#> 6 7 12 22 15 1 14 7 7 9
I then tried sampling positions, and then removing that position from the available buckets and the available remaining positions. This is also not working, and I suspect it is because I am sampling my way into branches of the tree that do not yield solutions.
set.seed(42)
perms <- character(0)
cnt <- 1
ids <- 1:nrow(df1)
bckts <- df1$bucket
for (p in df1$pebble) {
id <- sample(ids[!bckts %in% df2$bucket[df2$pebble==p]], 1)
perms[cnt] <- bckts[id]
bckts <- bckts[-id]
ids <- ids[ids!=id]
cnt <- cnt + 1
}
table(perms)
#> perms
#> A B C D E F G J
#> 1 1 4 1 2 1 2 2
Any thoughts or advice much appreciated (and apologies for the length).
EDIT:
I foolishly forgot to clarify that I was previously solving this by just resampling until I got a draw that didn't violate any of the conditions in df2, but I now have many conditions such that this would make my code take too long to run. I am still up for trying to force it if I could figure out a way to make forcing it faster.
I have a solution (I managed to write it in base R, but the data.table solution is easier to understand and write:
random.permutation.df2 <- data.frame(pebble = df1$pebble, bucket = rep(NA,length(df1$pebble)))
for(bucket in unique(df1$bucket)){
N <- length( random.permutation.df2$bucket[is.na(random.permutation.df2$bucket) &
!random.permutation.df2$pebble %in% df2$pebble[df2$bucket == bucket] ] )
random.permutation.df2$bucket[is.na(random.permutation.df2$bucket) &
!random.permutation.df2$pebble %in% df2$pebble[df2$bucket == bucket] ] <-
sample(c(rep(bucket,sum(df1$bucket == bucket)),rep(NA,N-sum(df1$bucket == bucket))))
}
The idea is to sample the authorised peeble for each bucket: those that are not in df2, and those that are not already filled. You sample then a vector of the good length, choosing between NAs (for the following buckets values) and the value in the loop, and voilà.
Now easier to read with data.table
library(data.table)
random.permutation.df2 <- setDT(random.permutation.df2)
df2 <- setDT(df2)
for( bucketi in unique(df1$bucket)){
random.permutation.df2[is.na(bucket) & !pebble %in% df2[bucket == bucketi, pebble],
bucket := sample(c(rep(bucketi,sum(df1$bucket == bucket)),rep(NA,.N-sum(df1$bucket == bucket))))]
}
it has the two conditions
> colSums(table(df1))
A B C D E F G H I J
4 7 13 14 12 11 11 10 9 9
> colSums(table(random.permutation.df2))
A B C D E F G H I J
4 7 13 14 12 11 11 10 9 9
To verify that there isn't any contradiction with df2
> df2
pebble bucket
1: 37 D
2: 95 H
3: 90 C
4: 80 C
5: 31 D
6: 84 G
7: 76 I
8: 57 H
9: 7 E
10: 39 A
> random.permutation.df2[pebble %in% df2$pebble,.(pebble,bucket)]
pebble bucket
1: 7 D
2: 31 H
3: 37 J
4: 39 F
5: 57 B
6: 76 E
7: 80 F
8: 84 B
9: 90 H
10: 95 D
Here a brute force approach where one simply tries long enough until a valid solution is found:
set.seed(123)
df1 <- data.frame(pebble = 1:100,
bucket = sample(LETTERS[1:10], 100, T),
stringsAsFactors = F)
df2 <- data.frame(pebble = sample(1:100, 10),
bucket = sample(LETTERS[1:10], 10, T),
stringsAsFactors = F)
random.permutation.df1 <- data.frame(pebble = df1$pebble, bucket = sample(df1$bucket))
Random permutation does not match the condition, so try new ones:
merge(random.permutation.df1, df2)
#> pebble bucket
#> 1 60 J
while(TRUE) {
random.permutation.df1 <- data.frame(pebble = df1$pebble, bucket = sample(df1$bucket))
if(nrow(merge(random.permutation.df1, df2)) == 0)
break;
}
New permutation matches the condition:
merge(random.permutation.df1, df2)
#> [1] pebble bucket
#> <0 Zeilen> (oder row.names mit Länge 0)
colSums(table(random.permutation.df1))
#> A B C D E F G H I J
#> 7 12 11 9 14 7 11 11 11 7
colSums(table(df1))
#> A B C D E F G H I J
#> 7 12 11 9 14 7 11 11 11 7

Order factor levels in order of appearance in data set

I have a survey in which a unique ID must be assigned to questions. Some questions appear multiple times. This means that there is an extra layer of questions. In the sample data below only the first layer is included.
Question: how do I assign a unique index by order of appearance? The solution provided here works alphabetically. I can order the factors, but this defeats the purpose of doing it in R [there are many questions to sort].
library(data.table)
dt = data.table(question = c("C", "C", "A", "B", "B", "D"),
value = c(10,20,30,40,20,30))
dt[, idx := as.numeric(as.factor(question))]
gives:
question value idx
# 1: C 10 3
# 2: C 20 3
# 3: A 30 1
# 4: B 40 2
# 5: B 20 2
# 6: D 30 4
# but required is:
dt[, idx.required := c(1, 1, 2, 3, 3, 4)]
I think the data.table way to do this will be
dt[, idx := .GRP, by = question]
## question value idx
## 1: C 10 1
## 2: C 20 1
## 3: A 30 2
## 4: B 40 3
## 5: B 20 3
## 6: D 30 4
You could respecify the factor levels:
dt[, idx := as.numeric(factor(question, levels=unique(question)))]
# question value idx
# 1: C 10 1
# 2: C 20 1
# 3: A 30 2
# 4: B 40 3
# 5: B 20 3
# 6: D 30 4

change data.frame column into rows in R

A <- c(1,6)
B <- c(2,7)
C <- c(3,8)
D <- c(4,9)
E <- c(5,0)
df <- data.frame(A,B,C,D,E)
df
A B C D E
1 1 2 3 4 5
2 6 7 8 9 0
I would like to have this:
df
1 2
A 1 6
B 2 7
C 3 8
D 4 9
E 5 0
If your dataframe is truly in that format, then all of your vectors will be character vectors. Or, you basically have a character matrix and you could do this:
data.frame(t(df))
It would be better, though, to just define it the way you want it from the get-go
df <- data.frame(c('A','B','C','D','E'),
c(1, 2, 3, 4, 5),
c(6, 7, 8, 9, 0))
You could also do this
df <- data.frame(LETTERS[1:5], 1:5, c(6:9, 0))
If you wanted to give the columns names, you could do this
df <- data.frame(L = LETTERS[1:5], N1 = 1:5, N2 = c(6:9, 0))
Sometimes, if I use read.DIF of Excel data the data gets transposed. Is that how you got the original data in? If so, you can call
read.DIF(filename, transpose = T)
to get the data in the correct orientation.
I really recommend data.table approach without manual steps becauce they are error-prone
A <- c(1,6)
B <- c(2,7)
C <- c(3,8)
D <- c(4,9)
E <- c(5,0)
df <- data.frame(A,B,C,D,E)
df
library('data.table')
dat.m <- melt(as.data.table(df, keep.rownames = "Vars"), id.vars = "Vars") # https://stackoverflow.com/a/44128640/54964
dat.m
Output
A B C D E
1 1 2 3 4 5
2 6 7 8 9 0
Vars variable value
1: 1 A 1
2: 2 A 6
3: 1 B 2
4: 2 B 7
5: 1 C 3
6: 2 C 8
7: 1 D 4
8: 2 D 9
9: 1 E 5
10: 2 E 0
R: 3.4.0 (backports)
OS: Debian 8.7

Resources