Sample rows of data for iteratively smaller samples - r

I have the following dataset:
group<- c(rep(1, 200), rep(2, 200), rep(3, 200), rep(4, 200), rep(5, 200), rep(6, 200))
injection<- c(rep(1, 200), rep(0, 600), rep(1, 400))
art_light<- c(rep(1, 400), rep(0, 600), rep(1, 200))
seasonal_light<- c(rep(1, 600), rep(0, 200), rep(1, 200), rep(1, 200))
## generate data frame for species 1-3; where Species B is unaffected by all of these things and A and C are subtly different
## species is a random effect and so results are more generalizable
dat1 <- data.frame(group, injection,art_light,seasonal_light)
dat1$species = "A"
dat2 <- data.frame(group, injection,art_light,seasonal_light)
dat2$species = "B"
dat3 <- data.frame(group, injection,art_light,seasonal_light)
dat3$species = "C"
#################################
# Simulated Response Variables #
#################################
alpha = 1
beta1 = 10
beta2 = 2
beta3 = 20
beta4 = 10
e1= rnorm(1200, 5, sd=1)
e2 = rlnorm(1200)
e = rcauchy(1200)
e3 = floor(runif(1200, min = 0, max = 20))
e4 = rpois(1200, lambda = 4)
e5 = rlnorm(1200)
dat1$lh<-alpha + beta1*injection + beta2*art_light +
beta3*seasonal_light + beta4*injection*seasonal_light + e1
dat1$hb<-alpha + beta1*injection + e2
dat2$lh<- e
dat2$hb<- alpha + beta3*injection + e3
dat3$lh<-alpha + beta1*injection + beta2*art_light +
beta3*seasonal_light +e4
dat3$hb<-alpha + beta2*injection + e5
dat <- do.call("rbind", list(dat1, dat2, dat3))
I want to randomly sample rows within each group with no replacement. Sampling will be done iteratively such that in the first iteration of sampling, the number of rows sampled are the number of rows in group (n) and in the next iteration the number of sampled rows would be n-1.
These results should be combined into a data frame. Each subsample will be distinguished by a new variable sample_num that represents the number of rows sampled. Below is an example of the result data frame for the first 4 samples. The actual result should continue for all possible subsample sizes down to sampling of 10 rows.
samp1<-
dat %>%
group_by(group) %>%
sample_n(size = 600)
samp1$sample_num<-600
samp2<-
dat %>%
group_by(group) %>%
sample_n(size = 599)
samp2$sample_num<-599
samp3<-
dat %>%
group_by(group) %>%
sample_n(size = 598)
samp3$sample_num<-598
samp4<-
dat %>%
group_by(group) %>%
sample_n(size = 597)
samp4$sample_num<-597
samp_dat<- rbind(samp1,samp2,samp3,samp4)

You can do:
library(dplyr)
library(purrr)
res <- map_df(set_names(600:10),
~ dat %>%
group_by(group) %>%
sample_n(size = .x),
.id = "sample_num")
Giving:
# A tibble: 1,081,530 x 8
# Groups: group [6]
sample_num group injection art_light seasonal_light species lh hb
<chr> <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
1 600 1 1 1 1 B -403. 38
2 600 1 1 1 1 A 49.3 12.5
3 600 1 1 1 1 B 0.961 31
4 600 1 1 1 1 A 48.7 12.1
5 600 1 1 1 1 B -0.691 23
6 600 1 1 1 1 A 47.7 11.4
7 600 1 1 1 1 C 37 3.55
8 600 1 1 1 1 B -0.327 22
9 600 1 1 1 1 B -7.71 32
10 600 1 1 1 1 B 0.153 36
# ... with 1,081,520 more rows

data.table is incredibly fast at this sort of thing
library(data.table)
rbindlist(lapply(600:10, \(x) setDT(dat)[,.SD[sample(1:.N,x)], by=.(group)][,sample:=x]))
Output:
group injection art_light seasonal_light species lh hb sample
1: 1 1 1 1 C 35.000000 3.196606 600
2: 1 1 1 1 A 46.424639 12.210558 600
3: 1 1 1 1 C 33.000000 5.303823 600
4: 1 1 1 1 A 47.316622 11.814838 600
5: 1 1 1 1 C 39.000000 3.769120 600
---
1081526: 6 1 1 1 A 47.249496 11.360076 10
1081527: 6 1 1 1 B -3.188948 29.000000 10
1081528: 6 1 1 1 A 47.263460 12.062339 10
1081529: 6 1 1 1 C 38.000000 3.307954 10
1081530: 6 1 1 1 B -2.760421 35.000000 10

Related

How to create a count table in R?

I have a dataframe like below, which contains the body lengths and the observed counts of species A and B at each station
> set.seed(10)
> df <- data.frame(
+ species = c(rep("A",4), rep("B",4)),
+ station = rep(1:2, 4),
+ length = round(rnorm(8, 15, 2)),
+ count = round(rnorm(8, 5, 2))
+ )
> df
species station length count
1 A 1 15 2
2 A 2 15 4
3 A 1 12 7
4 A 2 14 7
5 B 1 16 5
6 B 2 16 7
7 B 1 13 6
8 B 2 14 5
What I want to do is to transform this into a table with 2cm bins like this. But how can I get this?
> cnt_table <- data.frame(
+ species = c("A","A","B","B"),
+ station = c(1,2,1,2),
+ L11_12 = c(0,0,0,0),
+ L13_14 = c(7,7,6,5),
+ L15_16 = c(2,4,5,7),
+ L17_18 = c(0,0,0,0),
+ L19_20 = c(0,0,0,0)
+ )
> cnt_table
species station L11_12 L13_14 L15_16 L17_18 L19_20
1 A 1 0 7 2 0 0
2 A 2 0 7 4 0 0
3 B 1 0 6 5 0 0
4 B 2 0 5 7 0 0
First, you need to use cut to bin the lengths. Then you can use complete to fill the missing counts with 0. Then, group_by species, station and bin and use summarize to add the counts per group. Last, use pivot_wider to make the bins column labels.
Note 1: The result differs from your expected output, but I think you have a typo.
Note 2: I don't know if teh grouping and summing is necessary. In your example it's not, but logically I would include it.
library(tidyverse)
set.seed(10)
df <- data.frame(
species = c(rep("A",4), rep("B",4)),
station = rep(1:2, 4),
length = round(rnorm(8, 15, 2)),
count = round(rnorm(8, 5, 2))
)
df
#---------------------
df %>%
mutate(length = cut(length,
breaks = seq(10.5, 20.5, by = 2),
labels = c("L11_12", "L13_14", "L15_16", "L17_18", "L19_20"))) %>%
complete(species, station, length, fill = list(count = 0)) %>%
group_by(species, station, length) %>%
summarize(count = sum(count)) %>%
pivot_wider(names_from = length, values_from = count)
#---------------------
# A tibble: 4 x 7
# Groups: species, station [4]
species station L11_12 L13_14 L15_16 L17_18 L19_20
<chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 1 7 0 2 0 0
2 A 2 0 7 4 0 0
3 B 1 0 6 5 0 0
4 B 2 0 5 7 0 0
a data.table approach that should scale well
library(data.table)
# create alookup table with categories
lookup <- data.table(
name = paste0("L", seq(11, 19, 2), "_", seq(12,20, 2)),
from = seq(11, 19, 2),
to = seq(12,20, 2)
)
lookup
# join with a non-equi join
setDT(df)[lookup, name := i.name, on = .(length >= from, length <= to)]
# spreak to wide
dcast(df, species + station ~ name, value.var = "count", fun.aggregate = sum)
# species station L11_12 L13_14 L15_16
# 1: A 1 7 0 2
# 2: A 2 0 7 4
# 3: B 1 0 6 5
# 4: B 2 0 5 7
You can do something like:
library(dplyr)
library(tidyr)
# Create the bins
df$bin <- cut(df$length, breaks = seq(from = min(df$length-1), to = max(df$length)+1, by = 2))
# Get the data frame into shape
df |>
mutate(
bin_start = substr(bin, 2,3),
bin_end = as.numeric(substr(bin, 5,6))-1,
bin_name = paste0("L", bin_start, "_", bin_end)) |>
group_by(species, station, bin_name) |>
summarise(n = n()) |>
pivot_wider(
names_from = bin_name,
values_from = n,
values_fill = list(n = 0)
)
# A tibble: 4 x 5
# Groups: species, station [4]
# species station L11_12 L13_14 L15_16
# <chr> <int> <int> <int> <int>
# 1 A 1 1 1 0
# 2 A 2 0 2 0
# 3 B 1 1 0 1
# 4 B 2 0 1 1
Define your desired range r, i.e. 11:20 as shown in OP (length should be even!). Then match df$lengh with a helper-matrix m to get bins, calculate aggregated sums and reshape to wide format, e.g. using reshape2::dcast.
r <- 11:20; m <- matrix(r, ncol=2, byrow=TRUE)
transform(df, bin=factor(sapply(df$length, \(z) which(rowSums(z == m) > 0)),
levels=seq_along(x), labels=apply(m, 1, paste, collapse='_'))) |>
aggregate(count ~ bin + species + station, sum) |>
reshape2::dcast(species + station ~ bin, value.var='count', drop=FALSE, fill=0)
# species station 11_12 13_14 15_16 17_18 19_20
# 1 A 1 7 0 2 0 0
# 2 A 2 0 7 4 0 0
# 3 B 1 0 6 5 0 0
# 4 B 2 0 5 7 0 0

transform count table into disaggregated table of observations

I have data in the form of a count table of successes and trials, but for modeling I need these data in a disaggregated trial-level table.
How do I get from this:
dplyr::tibble(
user_id = c(1,2),
success = c(3,4),
trials = c(9, 10)
)
To this:
dplyr::tibble(
user_id = c(rep(1, 9), rep(2, 10)),
success = c(rep(1, 3),rep(0, 6), rep(1, 4), rep(0, 6))
)
We can uncount based on the 'trials', then grouped by 'user_id', change the 'success' to binary by creating a logical condition with row_number
library(dplyr)
library(tidyr)
df1 %>%
uncount(trials) %>%
group_by(user_id) %>%
mutate(success = +(row_number() <= first(success))) %>%
ungroup
# A tibble: 19 x 2
# user_id success
# <dbl> <int>
# 1 1 1
# 2 1 1
# 3 1 1
# 4 1 0
# 5 1 0
# 6 1 0
# 7 1 0
# 8 1 0
# 9 1 0
#10 2 1
#11 2 1
#12 2 1
#13 2 1
#14 2 0
#15 2 0
#16 2 0
#17 2 0
#18 2 0
#19 2 0
Or with base R using Map and stack
stack(setNames(Map(function(x, y) rep(1:0, c(x, y)),
df1$success, df1$trials - df1$success), df1$user_id))[2:1]

R: refer to var in different list element on a looped condition

I've got a dataset with a number of vars (t01-t05 in a dummy example but many more in the real dataset). I calculate pred variable as a proportion of target == 1/n() per all group-level combinations (5th element in the ns_by_group_list). However, if the total number of people in that combination (s var) less than 6, I need to use the pred value from the equivalent t01-t04 combination (4th element of ns_by_group_list). If this one is less than 6, then from t01-t03 combinations (3rd element of ns_by_group_list), etc. The final output should look like ns_by_group_list[[5]] but with pred values coming from different ns_by_group_list list elements.
I was thinking of renaming pred and s vars in different list elements to pred1, pred2, .. pred5 and then pulling it all together to one data.frame, then create a long case_when statement... But surely there's a better/more elegant way to do it?
library(tibble)
library(dplyr)
library(purrr)
library(stringr)
library(tidyr)
## functions ####
create_t_labels <- function(n) {
paste0('t', str_pad(1:n, 2, 'left', '0'))
}
ns_by_group <- function(group_vars) {
input %>%
group_by_at(.vars = vars(group_vars)) %>%
summarise(n = n()) %>% # total number of people in each group
ungroup() %>%
spread(key = target, value = n) %>%
mutate(`0` = replace_na(`0`, 0),
n = replace_na(`1`, 0),
s = n + `0`,
pred = round(n/s, 3)
) %>%
select(-c(`1`, `0`))
}
### input data ####
set.seed(1)
input <- tibble(
target = sample(0:1, 50, replace = TRUE),
t01 = sample(1:3, 50, replace = TRUE),
t02 = rep(1:2, each = 25),
t03 = rep(1:5, each = 10),
t04 = rep(1, 50),
t05 = rep(1:2, each = 25)
)
## calculations ####
group_combo_list <- map(1:5, create_t_labels)
group_combo_list <- map(group_combo_list, function(x) c(x, 'target'))
ns_by_group_list <- map(group_combo_list, ns_by_group)
Recursively joining and replacing:
reduce(
ns_by_group_list,
~ {
left_join(.y, .x, by = grep("^t\\d+$", names(.x), value = TRUE),
suffix = c("", ".replacement")) %>%
mutate(pred = if_else(s < 6, pred.replacement, pred),
s = if_else(s < 6, s.replacement, s)) %>%
select(-ends_with(".replacement"))
},
.dir = "backward"
)
# # A tibble: 16 x 8
# t01 t02 t03 t04 t05 n s pred
# <int> <int> <int> <dbl> <int> <dbl> <dbl> <dbl>
# 1 1 1 1 1 1 1 16 0.562
# 2 1 1 2 1 1 1 16 0.562
# 3 1 2 3 1 2 2 12 0.583
# 4 1 2 4 1 2 4 6 0.667
# 5 1 2 5 1 2 1 12 0.583
# 6 2 1 1 1 1 3 13 0.385
# 7 2 1 2 1 1 2 6 0.333
# 8 2 1 3 1 1 0 13 0.385
# 9 2 2 4 1 2 1 6 0.5
# 10 2 2 5 1 2 2 6 0.5
# 11 3 1 1 1 1 0 8 0.125
# 12 3 1 2 1 1 1 8 0.125
# 13 3 1 3 1 1 0 8 0.125
# 14 3 2 3 1 2 0 7 0.714
# 15 3 2 4 1 2 1 7 0.714
# 16 3 2 5 1 2 4 7 0.714

check tables exceeding certain values and count number of times exceed respective threshold by respective id and label

I have a dataframe df
df <- data.frame(id =c(1,2,1,4,1,5,6),
label=c("a","b", "a", "a","a", "e", "a"),
color = c("g","a","g","g","a","a","a"),
threshold = c(12, 10, 12, 12, 12, 35, 40),
value =c(32.1,0,15.0,10,1,50,45),stringsAsFactors = F
)
Threshold value is based on the label
I should get a table below like this by considering each id,with respective label how many times exceeding its threshold by the value
Color is independent in consideration for calculating the exceed values
I tried like this
final_df <- df %>%
mutate(check = if_else(value > threshold, 1, 0)) %>%
group_by(id, label) %>%
summarise(exceed = sum(check))
But instead of getting with respective id i have got the number of total in exceed
With base R only, use aggregate.
aggregate(seq.int(nrow(df)) ~ id + label, df, function(i) sum(df[i, 4] < df[i, 5]))
# id label seq.int(nrow(df))
#1 1 a 2
#2 4 a 0
#3 6 a 1
#4 2 b 0
#5 5 e 1
In order to match the expected output posted in the question, it will take a little extra work.
exceed <- seq.int(nrow(df))
agg <- aggregate(exceed ~ id + label, df, function(i) sum(df[i, 4] < df[i, 5]))
res <- merge(df[1:3], agg)
unique(res)
# id label color exceed
#1 1 a g 2
#3 1 a a 2
#4 2 b a 0
#5 4 a g 0
#6 5 e a 1
#7 6 a a 1
By a small modification of your code:
df %>%
group_by(id, label) %>%
mutate(check = if_else(value > threshold, 1, 0)) %>%
summarise(exceed = sum(check)) %>%
group_by(id, label)
id label exceed
<dbl> <chr> <dbl>
1 1 a 2
2 2 b 0
3 4 a 0
4 5 e 1
5 6 a 1
To match the expected output more closely:
df %>%
group_by(id, label) %>%
mutate(exceed = sum(if_else(value > threshold, 1, 0))) %>%
group_by(id, label, color) %>%
filter(row_number() == 1)
id label color threshold value exceed
<dbl> <chr> <chr> <dbl> <dbl> <dbl>
1 1 a g 12 32.1 2
2 2 b a 10 0 0
3 4 a g 12 10 0
4 1 a a 12 1 2
5 5 e a 35 50 1
6 6 a a 40 45 1
library(dplyr)
df %>%
group_by(id, label) %>%
mutate(exceed = sum(value > threshold)) %>%
slice(1)
id label color threshold value exceed
<dbl> <chr> <chr> <dbl> <dbl> <int>
1 1 a g 12 32.1 2
2 2 b a 10 0 0
3 4 a g 12 10 0
4 5 e a 35 50 1
5 6 a a 40 45 1
If you like the output to contain a separate row for each combination, of ID, label and color, just add a new group_by before the slice function:
df %>%
group_by(id, label) %>%
mutate(exceed = sum(value > threshold)) %>%
group_by(id, label, color) %>%
slice(1)
id label color threshold value exceed
<dbl> <chr> <chr> <dbl> <dbl> <int>
1 1 a a 12 1 2
2 1 a g 12 32.1 2
3 2 b a 10 0 0
4 4 a g 12 10 0
5 5 e a 35 50 1
6 6 a a 40 45 1
A little change in your code
final_df <- df %>% mutate(check = if_else(value > threshold, 1, 0)) %>% group_by(id, label) %>% filter(check==1)
unique(final_df$id)
We could use table and merge :
table_ <- table(subset(df,value>threshold, c("id","label")))
df2 <- merge(unique(df[c("id","label","color")]),table_,all.x=TRUE)
df2$Freq[is.na(df2$Freq)] <- 0
# id label color Freq
# 1 1 a g 2
# 2 1 a a 2
# 3 2 b a 0
# 4 4 a g 0
# 5 5 e a 1
# 6 6 a a 1

bind_rows to each group of tibble

Consider the following two tibbles:
library(tidyverse)
a <- tibble(time = c(-1, 0), value = c(100, 200))
b <- tibble(id = rep(letters[1:2], each = 3), time = rep(1:3, 2), value = 1:6)
So a and b have the same columns and b has an additional column called id.
I want to do the following: group b by id and then add tibble a on top of each group.
So the output should look like this:
# A tibble: 10 x 3
id time value
<chr> <int> <int>
1 a -1 100
2 a 0 200
3 a 1 1
4 a 2 2
5 a 3 3
6 b -1 100
7 b 0 200
8 b 1 4
9 b 2 5
10 b 3 6
Of course there are multiple workarounds to achieve this (like loops for example). But in my case I have a large number of IDs and a very large number of columns.
I would be thankful if anyone could point me towards the direction of a solution within the tidyverse.
Thank you
We can expand the data frame a with id from b and then bind_rows them together.
library(tidyverse)
a2 <- expand(a, id = b$id, nesting(time, value))
b2 <- bind_rows(a2, b) %>% arrange(id, time)
b2
# # A tibble: 10 x 3
# id time value
# <chr> <dbl> <dbl>
# 1 a -1 100
# 2 a 0 200
# 3 a 1 1
# 4 a 2 2
# 5 a 3 3
# 6 b -1 100
# 7 b 0 200
# 8 b 1 4
# 9 b 2 5
# 10 b 3 6
split from base R will divide a data frame into a list of subsets based on an index.
b %>%
split(b[["id"]]) %>%
lapply(bind_rows, a) %>%
lapply(select, -"id") %>%
bind_rows(.id = "id")
# # A tibble: 10 x 3
# id time value
# <chr> <dbl> <dbl>
# 1 a 1 1
# 2 a 2 2
# 3 a 3 3
# 4 a -1 100
# 5 a 0 200
# 6 b 1 4
# 7 b 2 5
# 8 b 3 6
# 9 b -1 100
# 10 b 0 200
An idea (via base R) is to split your data frame and create a new one with id + the other data frame and rbind, i.e.
df = do.call(rbind, lapply(split(b, b$id), function(i)rbind(data.frame(id = i$id[1], a), i)))
which gives
id time value
a.1 a -1 100
a.2 a 0 200
a.3 a 1 1
a.4 a 2 2
a.5 a 3 3
b.1 b -1 100
b.2 b 0 200
b.3 b 1 4
b.4 b 2 5
b.5 b 3 6
NOTE: You can remove the rownames by simply calling rownames(df) <- NULL
We can nest and add the relevant rows to each nested item :
library(tidyverse)
b %>%
nest(-id) %>%
mutate(data= map(data,~bind_rows(a,.x))) %>%
unnest
# # A tibble: 10 x 3
# id time value
# <chr> <dbl> <dbl>
# 1 a -1 100
# 2 a 0 200
# 3 a 1 1
# 4 a 2 2
# 5 a 3 3
# 6 b -1 100
# 7 b 0 200
# 8 b 1 4
# 9 b 2 5
# 10 b 3 6
Maybe not the most efficient way, but easy to follow:
library(tidyverse)
a <- tibble(time = c(-1, 0), value = c(100, 200))
b <- tibble(id = rep(letters[1:2], each = 3), time = rep(1:3, 2), value =
1:6)
a.a <- a %>% add_column(id = rep("a",length(a)))
a.b <- a %>% add_column(id = rep("b",length(a)))
joint <- bind_rows(b,a.a,a.b)
(joint <- arrange(joint,id))

Resources