Sample values by group with conditions in R

I have grouped data and I want to create a new variable, value, that takes the value 0 or 1.
Every group needs at least one observation where value == 1, but no group can have more than 2 observations where value == 1.
Ideally, no more than 25% of groups should have only one observation where value == 1.
library(tidyverse)
set.seed(1)
# sample can break the rules
tibble(group = c(rep("A", 3),
                 rep("B", 6),
                 rep("C", 4),
                 rep("D", 5))) %>%
  group_by(group) %>%
  mutate(value = sample(c(0, 1), n(), replace = TRUE, prob = c(0.8, 0.2)))
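For reference, here is a quick sanity check of the three rules (a hedged sketch; it assumes the sampled tibble above has been saved as df, a name used only for illustration):
df %>%
  count(group, wt = value, name = "n_ones") %>%   # number of 1s per group
  summarise(every_group_has_a_one = all(n_ones >= 1),
            no_group_over_two     = all(n_ones <= 2),
            share_with_only_one   = mean(n_ones == 1))  # ideally <= 0.25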

One solution would be to list your unique group labels and shuffle them (here I get the unique group labels via nest). Then, depending on whether the group falls in the first 25% of rows of the nested data frame, you can assign either a) a random number between 1 and 2, or b) always 2. Finally, you can use the assigned number to define how many 0s and 1s should be sampled for each group, and then unnest the result. (The code below assumes the example tibble above has been stored as df.)
set.seed(0)
result <- df %>%
  nest(data = -group) %>%
  .[sample(1:nrow(.), nrow(.)), ] %>% # shuffle the group order
  mutate(
    value_count = ifelse(row_number() / n() <= 0.25, sample(1:2, n(), replace = TRUE), 2)
  ) %>%
  rowwise() %>%
  mutate(
    count = nrow(data),
    value = list(sample(c(rep(1, value_count), rep(0, count - value_count)), count))
  ) %>%
  unnest(value) %>%
  select(-data, -value_count, -count)
group value
<chr> <dbl>
1 B 0
2 B 0
3 B 0
4 B 0
5 B 1
6 B 0
7 A 1
8 A 1
9 A 0
10 D 1
11 D 0
12 D 1
13 D 0
14 D 0
15 C 1
16 C 0
17 C 0
18 C 1

Looks like I was beaten to the punch, but here's another way to do it:
library(tidyverse)
set.seed(1)
# sample can break the rules
x <- tibble(group = c(rep("A", 3),
                      rep("B", 6),
                      rep("C", 4),
                      rep("D", 5)))

# Make all 'var' = 1, then set all but the first row of each group to 0.
xx <- x %>%
  group_by(group) %>%
  mutate(var = row_number()) %>%
  mutate(var = ifelse(var == 1, 1, 0))

pct_with_two <- .75 # percentage of groups with two 1's
samp_size <- floor(length(unique(xx$group)) * pct_with_two) # round down to whole number
addl_one <- sample(unique(xx$group), size = samp_size, replace = FALSE)

xx %>%
  mutate(var2 = case_when(
    group %in% addl_one & row_number() == 2 ~ 1,
    TRUE ~ 0)) %>%
  mutate(var = var + var2) %>%
  select(-var2)
#> # A tibble: 18 x 2
#> # Groups: group [4]
#> group var
#> <chr> <dbl>
#> 1 A 1
#> 2 A 1
#> 3 A 0
#> 4 B 1
#> 5 B 0
#> 6 B 0
#> 7 B 0
#> 8 B 0
#> 9 B 0
#> 10 C 1
#> 11 C 1
#> 12 C 0
#> 13 C 0
#> 14 D 1
#> 15 D 1
#> 16 D 0
#> 17 D 0
#> 18 D 0
Created on 2022-03-11 by the reprex package (v0.3.0)
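If the final pipeline above is saved (say as out, an assumed name), a quick tally shows how many groups ended up with one versus two 1s, which is one way to check the pct_with_two setting:
out %>%
  summarise(ones = sum(var)) %>%  # 1s per group (the data are still grouped by `group`)
  count(ones)                     # how many groups got one vs. two 1s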

Related

Create countdown and counter before and after a certain value

I want to create a variable that counts down a given number of steps before a certain value and counts up a given number of steps after it.
In the example below, I want counters before and after grp == "b". When grp == "b" the value should be 0, and outside the counter windows it should be 100.
To do this I can use the case_when function as follows:
library(dplyr)
n <- 10; k <- 3
test_df <- data.frame(id = 1:(k*n), grp = rep(letters[1:k], each = n))
before = 5; after = 3; index = 2
test_df2 <- test_df %>%
  mutate(before_after =
    case_when(
      grp == letters[index] ~ 0,
      dplyr::lag(grp, 1) == letters[index] ~ 1,
      dplyr::lag(grp, 2) == letters[index] ~ 2,
      dplyr::lag(grp, 3) == letters[index] ~ 3,
      dplyr::lead(grp, 1) == letters[index] ~ -1,
      dplyr::lead(grp, 2) == letters[index] ~ -2,
      dplyr::lead(grp, 3) == letters[index] ~ -3,
      dplyr::lead(grp, 4) == letters[index] ~ -4,
      dplyr::lead(grp, 5) == letters[index] ~ -5,
      TRUE ~ 100
    )
  )
The problem is the variables "before" and "after" are not static! So I should make a function to do it as follows:
b_a<-function(before=15,after=6,others=100,grp_f=letters[2]){
}
How to make a function to do it?
Edit:
I created a function, but if there is a better and easier solution please feel free to add an answer:
b_a <- function(test_df, before = 15, after = 6, others = 100, grp_f = letters[2]) {
  ltr <- grp_f
  test_df2 <- test_df %>%
    group_by(grp) %>%
    dplyr::mutate(fr = ifelse(grp == ltr, dplyr::first(id), 0)) %>%
    ungroup() %>%
    dplyr::mutate(fr = max(fr)) %>%
    mutate(fr = -(fr - id)) %>%
    mutate(fr = ifelse(fr < 0, fr, 0))
  test_df2 <- test_df2 %>%
    group_by(grp) %>%
    dplyr::mutate(lst = ifelse(grp == ltr, dplyr::last(id), 0)) %>%
    ungroup() %>%
    dplyr::mutate(lst = max(lst)) %>%
    mutate(lst = -(lst - id)) %>%
    mutate(lst = ifelse(lst > 0, lst, 0))
  test_df2 <- test_df2 %>%
    ungroup() %>%
    mutate(before_after = lst + fr)
  # cap the counters at the requested window; everything else becomes `others`
  test_df2 <- test_df2 %>%
    mutate(before_after = ifelse(before_after >= -before &
                                   before_after <= after, before_after, others))
  test_df2
}
This may help:
test_df3 <- test_df %>%
  group_by(grp) %>%
  mutate(is_group_b = ifelse(grp == letters[index], 1, 0)) %>%
  ungroup() %>%
  mutate(nr = row_number()) %>%
  mutate(nr = nr * is_group_b) %>%
  mutate(max_nr = max(nr)) %>%
  mutate(nr = ifelse(nr == 0, NA, nr)) %>%
  mutate(min_nr = min(nr, na.rm = TRUE)) %>%
  mutate(nr = ifelse(is.na(nr), 0, nr)) %>%
  mutate(vl = (row_number() - min_nr)) %>%
  mutate(vl = vl * (1 - is_group_b)) %>%
  mutate(vl = if_else(vl > 0, row_number() - max_nr, vl))
Try this:
library(dplyr)
test_df %>%
  group_by(grp) %>%
  mutate(value = ifelse(grp == letters[index - 1] & row_number() <= 5, 100, (before:1) * -1),
         value1 = ifelse(grp == letters[index + 1] & row_number() > 3, 100, 1:after)) %>%
  mutate(value = case_when(grp == letters[index] ~ 0,
                           grp == letters[index - 1] ~ value,
                           grp == letters[index + 1] ~ value1),
         .keep = "unused") %>%
  print(n = 50)
id grp value
<int> <chr> <dbl>
1 1 a 100
2 2 a 100
3 3 a 100
4 4 a 100
5 5 a 100
6 6 a -5
7 7 a -4
8 8 a -3
9 9 a -2
10 10 a -1
11 11 b 0
12 12 b 0
13 13 b 0
14 14 b 0
15 15 b 0
16 16 b 0
17 17 b 0
18 18 b 0
19 19 b 0
20 20 b 0
21 21 c 1
22 22 c 2
23 23 c 3
24 24 c 100
25 25 c 100
26 26 c 100
27 27 c 100
28 28 c 100
29 29 c 100
30 30 c 100
Set result column to 100. Get indices of the focal group. Create replacement values (countdown - zeros - countup) of given lengths. Replace result at relevant indices.
d = data.frame(id = 1:12, grp = rep(letters[1:3], each = 4))
before = 2
after = 3
g = "b"
d$x = 100
i = which(d$grp == g)
v = c(-before:-1, rep(0, length(i)), seq(after))
d$x = replace(d$x, i[1] - before + 0:(length(v) - 1), v)
d
# id grp x
# 1 1 a 100
# 2 2 a 100
# 3 3 a -2
# 4 4 a -1
# 5 5 b 0
# 6 6 b 0
# 7 7 b 0
# 8 8 b 0
# 9 9 c 1
# 10 10 c 2
# 11 11 c 3
# 12 12 c 100
Depending on the data, you may want to add a check so that indices of before- and after-sequences are kept within the bounds of the data, e.g. using %in%:
d = data.frame(id = 1:8, grp = rep(letters[1:3], c(1, 5, 2)))
before = 2
after = 3
g = "b"
d$x = 100
i = which(d$grp == g)
v = c(-before:-1, rep(0, length(i)), seq(after))
i2 = i[1] - before + 0:(length(v) - 1)
ok = i2 %in% seq_len(nrow(d)) # <~ check if indices are within range of data
d$x = replace(d$x, i2[ok], v[ok])
d
# id grp x
# 1 1 a -1 # leading countdown truncated
# 2 2 b 0
# 3 3 b 0
# 4 4 b 0
# 5 5 b 0
# 6 6 b 0
# 7 7 c 1
# 8 8 c 2 # trailing counter truncated
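For completeness, the base R approach above drops naturally into the parameterised function the asker requested; here is a hedged sketch (argument names follow the question, and it assumes the focal group occupies a contiguous block of rows):
b_a <- function(df, before = 15, after = 6, others = 100, grp_f = letters[2]) {
  x <- rep(others, nrow(df))                             # default value everywhere
  i <- which(df$grp == grp_f)                            # rows of the focal group
  v <- c(-before:-1, rep(0, length(i)), seq_len(after))  # countdown, zeros, countup
  i2 <- i[1] - before + 0:(length(v) - 1)                # positions those values occupy
  ok <- i2 %in% seq_len(nrow(df))                        # keep indices inside the data
  df$before_after <- replace(x, i2[ok], v[ok])
  df
}
b_a(test_df, before = 5, after = 3, grp_f = "b")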

Is there a way to select the first row within a group with different conditions in dplyr?

I want to select a row for each group created by variable a. It should be the row with the highest value for variable c, but if variable b is TRUE, then the row with b = TRUE and maximum c within that group should be selected.
I have the following code:
set.seed(42)
a <- rep(1:3, each = 3)
b <- sample(c(0, 1), size = 9, replace = TRUE)
c <- sample(1:9, size = 9, replace = FALSE)
df <- data.frame(a = a,
                 b = b,
                 c = c)
df %>%
  group_by(a) %>%
  filter(b == 1) %>%
  arrange(desc(c), .by_group = TRUE) %>%
  summarise_all(function(x) x[1]) -> df1

df %>%
  group_by(a) %>%
  filter(all(b != 1)) %>%
  arrange(desc(c), .by_group = TRUE) %>%
  summarise_all(function(x) x[1]) -> df2

df3 <- rbind(df1, df2)
This works, but I wonder if there is a simpler way to achieve the same.
You could filter the values for groups and then do your summarize.
df %>%
  group_by(a) %>%
  filter(all(b == 0) | b == 1) %>%
  summarize(b = first(b), c = max(c))
# a b c
# <int> <dbl> <int>
# 1 1 0 8
# 2 2 1 5
# 3 3 1 9
So we only keep the rows of a group where b == 1, or all of its rows if every b in the group is 0.
We can do it with ifelse inside summarise and without the need to filter b values.
set.seed(42)
a <- rep(1:3, each = 3)
b <- sample(c(0, 1), size = 9, replace = TRUE)
cc <- sample(1:9, size = 9, replace = FALSE)
df <- data.frame(a = a,
                 b = b,
                 cc = cc)

df |>
  group_by(a) |>
  # compute `teste` before overwriting `b`, so the original 0/1 values are used
  summarise(teste = ifelse(any(b == 1), max(cc[b == 1]), max(cc)),
            b = max(b))
Also, avoid naming an object c in R: it masks the base function c().
library(data.table)
setDT(df)
# select the maximum c value, grouped by a and b
# then negative order by b (so rows with b == 1 get on top),
# and select the first row of each a-group
df[df[, .I[c == max(c)], by = .(a,b)]$V1][order(a,-b), .SD[1], by = a]
library(dplyr)
df %>%
  group_by(a) %>%
  arrange(desc(b), desc(c), .by_group = TRUE) %>%
  slice_head(n = 1) %>%
  ungroup()
#> # A tibble: 3 × 3
#> a b c
#> <int> <dbl> <int>
#> 1 1 0 8
#> 2 2 1 5
#> 3 3 1 9
Input data:
set.seed(42)
a <- rep(1:3, each = 3)
b <- sample(c(0, 1), size = 9, replace = TRUE)
c <- sample(1:9, size = 9, replace = FALSE)
df <- data.frame(a = a,
                 b = b,
                 c = c)
df
#> a b c
#> 1 1 0 8
#> 2 1 0 7
#> 3 1 0 4
#> 4 2 0 1
#> 5 2 1 5
#> 6 2 1 2
#> 7 3 1 9
#> 8 3 1 3
#> 9 3 0 6
Created on 2023-01-30 with reprex v2.0.2
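With dplyr >= 1.1.0, slice_max() can also rank on several columns at once, which condenses the arrange() + slice_head() pattern above into one call; a hedged sketch:
library(dplyr)
df %>%
  group_by(a) %>%
  slice_max(order_by = tibble(b, c), n = 1, with_ties = FALSE) %>%  # highest b, then highest c
  ungroup()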

How to create a count table in R?

I have a dataframe like below, which contains the body lengths and the observed counts of species A and B at each station
> set.seed(10)
> df <- data.frame(
+ species = c(rep("A",4), rep("B",4)),
+ station = rep(1:2, 4),
+ length = round(rnorm(8, 15, 2)),
+ count = round(rnorm(8, 5, 2))
+ )
> df
species station length count
1 A 1 15 2
2 A 2 15 4
3 A 1 12 7
4 A 2 14 7
5 B 1 16 5
6 B 2 16 7
7 B 1 13 6
8 B 2 14 5
What I want to do is to transform this into a table with 2cm bins like this. But how can I get this?
> cnt_table <- data.frame(
+ species = c("A","A","B","B"),
+ station = c(1,2,1,2),
+ L11_12 = c(0,0,0,0),
+ L13_14 = c(7,7,6,5),
+ L15_16 = c(2,4,5,7),
+ L17_18 = c(0,0,0,0),
+ L19_20 = c(0,0,0,0)
+ )
> cnt_table
species station L11_12 L13_14 L15_16 L17_18 L19_20
1 A 1 0 7 2 0 0
2 A 2 0 7 4 0 0
3 B 1 0 6 5 0 0
4 B 2 0 5 7 0 0
First, you need to use cut to bin the lengths. Then you can use complete to fill the missing counts with 0. Then, group_by species, station and bin and use summarize to add the counts per group. Last, use pivot_wider to make the bins column labels.
Note 1: The result differs from your expected output, but I think you have a typo there.
Note 2: I don't know if the grouping and summing are necessary. In your example they are not, but logically I would include them.
library(tidyverse)
set.seed(10)
df <- data.frame(
  species = c(rep("A", 4), rep("B", 4)),
  station = rep(1:2, 4),
  length = round(rnorm(8, 15, 2)),
  count = round(rnorm(8, 5, 2))
)
df
#---------------------
df %>%
  mutate(length = cut(length,
                      breaks = seq(10.5, 20.5, by = 2),
                      labels = c("L11_12", "L13_14", "L15_16", "L17_18", "L19_20"))) %>%
  complete(species, station, length, fill = list(count = 0)) %>%
  group_by(species, station, length) %>%
  summarize(count = sum(count)) %>%
  pivot_wider(names_from = length, values_from = count)
#---------------------
#---------------------
# A tibble: 4 x 7
# Groups: species, station [4]
species station L11_12 L13_14 L15_16 L17_18 L19_20
<chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 1 7 0 2 0 0
2 A 2 0 7 4 0 0
3 B 1 0 6 5 0 0
4 B 2 0 5 7 0 0
A data.table approach that should scale well:
library(data.table)
# create a lookup table with categories
lookup <- data.table(
  name = paste0("L", seq(11, 19, 2), "_", seq(12, 20, 2)),
  from = seq(11, 19, 2),
  to = seq(12, 20, 2)
)
lookup
# join with a non-equi join
setDT(df)[lookup, name := i.name, on = .(length >= from, length <= to)]
# spread to wide format
dcast(df, species + station ~ name, value.var = "count", fun.aggregate = sum)
# species station L11_12 L13_14 L15_16
# 1: A 1 7 0 2
# 2: A 2 0 7 4
# 3: B 1 0 6 5
# 4: B 2 0 5 7
You can do something like:
library(dplyr)
library(tidyr)
# Create the bins
df$bin <- cut(df$length, breaks = seq(from = min(df$length - 1), to = max(df$length) + 1, by = 2))
# Get the data frame into shape
df |>
  mutate(
    bin_start = substr(bin, 2, 3),
    bin_end = as.numeric(substr(bin, 5, 6)) - 1,
    bin_name = paste0("L", bin_start, "_", bin_end)) |>
  group_by(species, station, bin_name) |>
  summarise(n = n()) |>
  pivot_wider(
    names_from = bin_name,
    values_from = n,
    values_fill = list(n = 0)
  )
# A tibble: 4 x 5
# Groups: species, station [4]
# species station L11_12 L13_14 L15_16
# <chr> <int> <int> <int> <int>
# 1 A 1 1 1 0
# 2 A 2 0 2 0
# 3 B 1 1 0 1
# 4 B 2 0 1 1
Define your desired range r, i.e. 11:20 as shown in the OP (its length should be even!). Then match df$length against a helper matrix m to get the bins, calculate aggregated sums, and reshape to wide format, e.g. using reshape2::dcast.
r <- 11:20; m <- matrix(r, ncol = 2, byrow = TRUE)
transform(df, bin = factor(sapply(df$length, \(z) which(rowSums(z == m) > 0)),
                           levels = seq_len(nrow(m)),
                           labels = apply(m, 1, paste, collapse = '_'))) |>
  aggregate(count ~ bin + species + station, sum) |>
  reshape2::dcast(species + station ~ bin, value.var = 'count', drop = FALSE, fill = 0)
# species station 11_12 13_14 15_16 17_18 19_20
# 1 A 1 7 0 2 0 0
# 2 A 2 0 7 4 0 0
# 3 B 1 0 6 5 0 0
# 4 B 2 0 5 7 0 0
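For completeness, a compact base R alternative (a hedged sketch; it reuses the breaks and labels from the tidyverse answer above and needs R >= 3.4 for tapply()'s default argument):
bins <- cut(df$length,
            breaks = seq(10.5, 20.5, by = 2),
            labels = c("L11_12", "L13_14", "L15_16", "L17_18", "L19_20"))
# sum the counts per species-station combination and bin, filling empty cells with 0
with(df, tapply(count, list(paste(species, station), bins), sum, default = 0))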

Check values against thresholds and count how many times each id and label exceeds its threshold

I have a dataframe df
df <- data.frame(id =c(1,2,1,4,1,5,6),
label=c("a","b", "a", "a","a", "e", "a"),
color = c("g","a","g","g","a","a","a"),
threshold = c(12, 10, 12, 12, 12, 35, 40),
value =c(32.1,0,15.0,10,1,50,45),stringsAsFactors = F
)
The threshold depends on the label.
For each id and its label, I want a table counting how many times the value exceeds its threshold.
Color plays no role in calculating the exceed counts.
I tried this:
final_df <- df %>%
  mutate(check = if_else(value > threshold, 1, 0)) %>%
  group_by(id, label) %>%
  summarise(exceed = sum(check))
But instead of getting the count for each respective id, I got the overall total in exceed.
With base R only, use aggregate.
aggregate(seq.int(nrow(df)) ~ id + label, df, function(i) sum(df[i, 4] < df[i, 5]))
# id label seq.int(nrow(df))
#1 1 a 2
#2 4 a 0
#3 6 a 1
#4 2 b 0
#5 5 e 1
In order to match the expected output posted in the question, it will take a little extra work.
exceed <- seq.int(nrow(df))
agg <- aggregate(exceed ~ id + label, df, function(i) sum(df[i, 4] < df[i, 5]))
res <- merge(df[1:3], agg)
unique(res)
# id label color exceed
#1 1 a g 2
#3 1 a a 2
#4 2 b a 0
#5 4 a g 0
#6 5 e a 1
#7 6 a a 1
By a small modification of your code:
df %>%
  group_by(id, label) %>%
  mutate(check = if_else(value > threshold, 1, 0)) %>%
  summarise(exceed = sum(check)) %>%
  group_by(id, label)
id label exceed
<dbl> <chr> <dbl>
1 1 a 2
2 2 b 0
3 4 a 0
4 5 e 1
5 6 a 1
To match the expected output more closely:
df %>%
  group_by(id, label) %>%
  mutate(exceed = sum(if_else(value > threshold, 1, 0))) %>%
  group_by(id, label, color) %>%
  filter(row_number() == 1)
id label color threshold value exceed
<dbl> <chr> <chr> <dbl> <dbl> <dbl>
1 1 a g 12 32.1 2
2 2 b a 10 0 0
3 4 a g 12 10 0
4 1 a a 12 1 2
5 5 e a 35 50 1
6 6 a a 40 45 1
library(dplyr)
df %>%
  group_by(id, label) %>%
  mutate(exceed = sum(value > threshold)) %>%
  slice(1)
id label color threshold value exceed
<dbl> <chr> <chr> <dbl> <dbl> <int>
1 1 a g 12 32.1 2
2 2 b a 10 0 0
3 4 a g 12 10 0
4 5 e a 35 50 1
5 6 a a 40 45 1
If you would like the output to contain a separate row for each combination of id, label and color, just add a new group_by before the slice function:
df %>%
  group_by(id, label) %>%
  mutate(exceed = sum(value > threshold)) %>%
  group_by(id, label, color) %>%
  slice(1)
id label color threshold value exceed
<dbl> <chr> <chr> <dbl> <dbl> <int>
1 1 a a 12 1 2
2 1 a g 12 32.1 2
3 2 b a 10 0 0
4 4 a g 12 10 0
5 5 e a 35 50 1
6 6 a a 40 45 1
A little change in your code:
final_df <- df %>%
  mutate(check = if_else(value > threshold, 1, 0)) %>%
  group_by(id, label) %>%
  filter(check == 1)
unique(final_df$id)
We could use table and merge:
table_ <- table(subset(df,value>threshold, c("id","label")))
df2 <- merge(unique(df[c("id","label","color")]),table_,all.x=TRUE)
df2$Freq[is.na(df2$Freq)] <- 0
# id label color Freq
# 1 1 a g 2
# 2 1 a a 2
# 3 2 b a 0
# 4 4 a g 0
# 5 5 e a 1
# 6 6 a a 1
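With dplyr >= 1.1.0, the same per-group count can also be written without an explicit group_by(), using the .by argument (a hedged sketch):
library(dplyr)
df %>%
  summarise(exceed = sum(value > threshold), .by = c(id, label))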

Operations between groups with dplyr

I have a data frame, shown below, that I would like to group by grp and index, using group a as a reference for some simple calculations: I want to subtract group a's values from the values of the other groups.
df <- data.frame(grp = rep(letters[1:3], each = 2),
                 index = rep(1:2, times = 3),
                 value = seq(10, 60, length.out = 6))
df
## grp index value
## 1 a 1 10
## 2 a 2 20
## 3 b 1 30
## 4 b 2 40
## 5 c 1 50
## 6 c 2 60
The desired output would be:
## grp index value
## 1 b 1 20
## 2 b 2 20
## 3 c 1 40
## 4 c 2 40
My guess is it will be something close to:
group_by(df, grp, index) %>%
  mutate(diff = value - value[grp == "a"])
Ideally I would like to do it using dplyr.
Regards, Philippe
We can filter for 'grp' that are not 'a' and then do the difference within mutate.
df %>%
  filter(grp != "a") %>%
  mutate(value = value - df$value[df$grp == "a"])
Or another option would be a join:
df %>%
  filter(grp != "a") %>%
  left_join(., subset(df, grp == "a", select = -1), by = "index") %>%
  mutate(value = value.x - value.y) %>%
  select(1, 2, 5)
# grp index value
#1 b 1 20
#2 b 2 20
#3 c 1 40
#4 c 2 40
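Incidentally, the guess in the question is nearly there: grouping by index alone (so each group contains exactly one reference row from grp "a") makes the subscripted lookup work. A hedged sketch:
library(dplyr)
df %>%
  group_by(index) %>%
  mutate(value = value - value[grp == "a"]) %>%  # one "a" row per index group
  ungroup() %>%
  filter(grp != "a")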
