I have a data frame like the one below, which contains the body lengths and the observed counts of species A and B at each station:
> set.seed(10)
> df <- data.frame(
+ species = c(rep("A",4), rep("B",4)),
+ station = rep(1:2, 4),
+ length = round(rnorm(8, 15, 2)),
+ count = round(rnorm(8, 5, 2))
+ )
> df
species station length count
1 A 1 15 2
2 A 2 15 4
3 A 1 12 7
4 A 2 14 7
5 B 1 16 5
6 B 2 16 7
7 B 1 13 6
8 B 2 14 5
What I want is to transform this into a table of counts in 2 cm length bins, like the one below. How can I do this?
> cnt_table <- data.frame(
+ species = c("A","A","B","B"),
+ station = c(1,2,1,2),
+ L11_12 = c(0,0,0,0),
+ L13_14 = c(7,7,6,5),
+ L15_16 = c(2,4,5,7),
+ L17_18 = c(0,0,0,0),
+ L19_20 = c(0,0,0,0)
+ )
> cnt_table
species station L11_12 L13_14 L15_16 L17_18 L19_20
1 A 1 0 7 2 0 0
2 A 2 0 7 4 0 0
3 B 1 0 6 5 0 0
4 B 2 0 5 7 0 0
First, use cut to bin the lengths. Then use complete to fill the missing counts with 0. Next, group_by species, station, and bin, and use summarize to add up the counts per group. Last, use pivot_wider to turn the bins into column labels.
Note 1: The result differs from your expected output, but I think your expected output contains a typo.
Note 2: I don't know if the grouping and summing are necessary. In your example they're not, but logically I would include them.
library(tidyverse)
set.seed(10)
df <- data.frame(
species = c(rep("A",4), rep("B",4)),
station = rep(1:2, 4),
length = round(rnorm(8, 15, 2)),
count = round(rnorm(8, 5, 2))
)
df
#---------------------
df %>%
  mutate(length = cut(length,
                      breaks = seq(10.5, 20.5, by = 2),
                      labels = c("L11_12", "L13_14", "L15_16", "L17_18", "L19_20"))) %>%
  complete(species, station, length, fill = list(count = 0)) %>%
  group_by(species, station, length) %>%
  summarize(count = sum(count)) %>%
  pivot_wider(names_from = length, values_from = count)
#---------------------
# A tibble: 4 x 7
# Groups: species, station [4]
species station L11_12 L13_14 L15_16 L17_18 L19_20
<chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 1 7 0 2 0 0
2 A 2 0 7 4 0 0
3 B 1 0 6 5 0 0
4 B 2 0 5 7 0 0
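As the # Groups line shows, the summarized result stays grouped. If that's unwanted, a minimal variation (assuming dplyr >= 1.0.0, which added the .groups argument to summarize) drops the grouping in the same step:
# same pipeline, dropping the grouping as part of the summary
df %>%
  mutate(length = cut(length,
                      breaks = seq(10.5, 20.5, by = 2),
                      labels = c("L11_12", "L13_14", "L15_16", "L17_18", "L19_20"))) %>%
  complete(species, station, length, fill = list(count = 0)) %>%
  group_by(species, station, length) %>%
  summarize(count = sum(count), .groups = "drop") %>%
  pivot_wider(names_from = length, values_from = count)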
A data.table approach that should scale well:
library(data.table)
# create a lookup table with the bin categories
lookup <- data.table(
  name = paste0("L", seq(11, 19, 2), "_", seq(12, 20, 2)),
  from = seq(11, 19, 2),
  to = seq(12, 20, 2)
)
lookup
# join with a non-equi join
setDT(df)[lookup, name := i.name, on = .(length >= from, length <= to)]
# spread to wide format
dcast(df, species + station ~ name, value.var = "count", fun.aggregate = sum)
# species station L11_12 L13_14 L15_16
# 1: A 1 7 0 2
# 2: A 2 0 7 4
# 3: B 1 0 6 5
# 4: B 2 0 5 7
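Note that dcast only produced columns for bins that actually occur in the data. If you also want the empty L17_18 and L19_20 columns, one option (a sketch, not the only way) is to make name a factor over all lookup categories and keep unused levels with drop = FALSE; empty cells are then filled by fun.aggregate on an empty vector, so the sums show up as 0:
# keep empty bins: factor over all lookup levels, then don't drop unused ones
df[, name := factor(name, levels = lookup$name)]
dcast(df, species + station ~ name, value.var = "count",
      fun.aggregate = sum, drop = FALSE)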
You can do something like:
library(dplyr)
library(tidyr)
# Create the bins
df$bin <- cut(df$length,
              breaks = seq(from = min(df$length) - 1, to = max(df$length) + 1, by = 2))
# Get the data frame into shape
df |>
  mutate(
    bin_start = substr(bin, 2, 3),
    bin_end = as.numeric(substr(bin, 5, 6)) - 1,
    bin_name = paste0("L", bin_start, "_", bin_end)) |>
  group_by(species, station, bin_name) |>
  summarise(n = n()) |> # counts rows; use n = sum(count) to total the count column instead
  pivot_wider(
    names_from = bin_name,
    values_from = n,
    values_fill = list(n = 0)
  )
# A tibble: 4 x 5
# Groups: species, station [4]
# species station L11_12 L13_14 L15_16
# <chr> <int> <int> <int> <int>
# 1 A 1 1 1 0
# 2 A 2 0 2 0
# 3 B 1 1 0 1
# 4 B 2 0 1 1
Define your desired range r, i.e. 11:20 as shown in the OP (its length should be even!). Then match df$length against a helper matrix m to get the bins, calculate aggregated sums, and reshape to wide format, e.g. using reshape2::dcast.
r <- 11:20; m <- matrix(r, ncol=2, byrow=TRUE)
transform(df, bin=factor(sapply(df$length, \(z) which(rowSums(z == m) > 0)),
                         levels=seq_len(nrow(m)), labels=apply(m, 1, paste, collapse='_'))) |>
aggregate(count ~ bin + species + station, sum) |>
reshape2::dcast(species + station ~ bin, value.var='count', drop=FALSE, fill=0)
# species station 11_12 13_14 15_16 17_18 19_20
# 1 A 1 7 0 2 0 0
# 2 A 2 0 7 4 0 0
# 3 B 1 0 6 5 0 0
# 4 B 2 0 5 7 0 0
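For comparison, here is a compact base-R sketch in the same spirit (my own variation, assuming integer lengths within 11-20): findInterval does the binning and xtabs does the summing, and the empty bins are kept because bin is a factor.
# bin into left-closed 2 cm intervals over 11..20, then cross-tabulate summed counts
brks <- seq(11, 21, by = 2)
labs <- paste0("L", head(brks, -1), "_", head(brks, -1) + 1)
df$bin <- factor(labs[findInterval(df$length, brks)], levels = labs)
df$grp <- paste(df$species, df$station)
xtabs(count ~ grp + bin, df)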
I have the following dataset:
group <- c(rep(1, 200), rep(2, 200), rep(3, 200), rep(4, 200), rep(5, 200), rep(6, 200))
injection <- c(rep(1, 200), rep(0, 600), rep(1, 400))
art_light <- c(rep(1, 400), rep(0, 600), rep(1, 200))
seasonal_light <- c(rep(1, 600), rep(0, 200), rep(1, 200), rep(1, 200))
## generate data frame for species 1-3; where Species B is unaffected by all of these things and A and C are subtly different
## species is a random effect and so results are more generalizable
dat1 <- data.frame(group, injection, art_light, seasonal_light)
dat1$species <- "A"
dat2 <- data.frame(group, injection, art_light, seasonal_light)
dat2$species <- "B"
dat3 <- data.frame(group, injection, art_light, seasonal_light)
dat3$species <- "C"
#################################
# Simulated Response Variables #
#################################
alpha <- 1
beta1 <- 10
beta2 <- 2
beta3 <- 20
beta4 <- 10
e1 <- rnorm(1200, 5, sd = 1)
e2 <- rlnorm(1200)
e <- rcauchy(1200)
e3 <- floor(runif(1200, min = 0, max = 20))
e4 <- rpois(1200, lambda = 4)
e5 <- rlnorm(1200)
dat1$lh <- alpha + beta1*injection + beta2*art_light +
  beta3*seasonal_light + beta4*injection*seasonal_light + e1
dat1$hb <- alpha + beta1*injection + e2
dat2$lh <- e
dat2$hb <- alpha + beta3*injection + e3
dat3$lh <- alpha + beta1*injection + beta2*art_light +
  beta3*seasonal_light + e4
dat3$hb <- alpha + beta2*injection + e5
dat <- do.call("rbind", list(dat1, dat2, dat3))
I want to randomly sample rows within each group without replacement. Sampling is done iteratively: in the first iteration, the number of rows sampled is the number of rows in each group (n); in the next iteration it is n - 1; and so on.
These results should be combined into a data frame. Each subsample is distinguished by a new variable sample_num that gives the number of rows sampled. Below is an example of the resulting data frame for the first 4 samples. The actual result should continue for all possible subsample sizes down to subsamples of 10 rows.
samp1 <- dat %>%
  group_by(group) %>%
  sample_n(size = 600)
samp1$sample_num <- 600
samp2 <- dat %>%
  group_by(group) %>%
  sample_n(size = 599)
samp2$sample_num <- 599
samp3 <- dat %>%
  group_by(group) %>%
  sample_n(size = 598)
samp3$sample_num <- 598
samp4 <- dat %>%
  group_by(group) %>%
  sample_n(size = 597)
samp4$sample_num <- 597
samp_dat <- rbind(samp1, samp2, samp3, samp4)
You can do:
library(dplyr)
library(purrr)
res <- map_df(set_names(600:10),
              ~ dat %>%
                group_by(group) %>%
                sample_n(size = .x),
              .id = "sample_num")
Giving:
# A tibble: 1,081,530 x 8
# Groups: group [6]
sample_num group injection art_light seasonal_light species lh hb
<chr> <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
1 600 1 1 1 1 B -403. 38
2 600 1 1 1 1 A 49.3 12.5
3 600 1 1 1 1 B 0.961 31
4 600 1 1 1 1 A 48.7 12.1
5 600 1 1 1 1 B -0.691 23
6 600 1 1 1 1 A 47.7 11.4
7 600 1 1 1 1 C 37 3.55
8 600 1 1 1 1 B -0.327 22
9 600 1 1 1 1 B -7.71 32
10 600 1 1 1 1 B 0.153 36
# ... with 1,081,520 more rows
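One small follow-up: the .id argument always produces a character column, which is why sample_num prints as <chr> above. Convert it if you need it numeric downstream:
# .id columns are character; convert for numeric use
res$sample_num <- as.integer(res$sample_num)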
data.table is incredibly fast at this sort of thing:
library(data.table)
rbindlist(lapply(600:10, \(x) setDT(dat)[, .SD[sample(1:.N, x)], by = .(group)][, sample := x]))
Output:
group injection art_light seasonal_light species lh hb sample
1: 1 1 1 1 C 35.000000 3.196606 600
2: 1 1 1 1 A 46.424639 12.210558 600
3: 1 1 1 1 C 33.000000 5.303823 600
4: 1 1 1 1 A 47.316622 11.814838 600
5: 1 1 1 1 C 39.000000 3.769120 600
---
1081526: 6 1 1 1 A 47.249496 11.360076 10
1081527: 6 1 1 1 B -3.188948 29.000000 10
1081528: 6 1 1 1 A 47.263460 12.062339 10
1081529: 6 1 1 1 C 38.000000 3.307954 10
1081530: 6 1 1 1 B -2.760421 35.000000 10
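One caveat: setDT(dat) converts dat to a data.table by reference, so the call above also alters the original object. A minor variation that converts a copy once, up front:
# convert once, outside the loop, so `dat` itself is left untouched
DT <- as.data.table(dat)
rbindlist(lapply(600:10, \(x) DT[, .SD[sample(.N, x)], by = group][, sample := x]))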
I have grouped data and I want to create a new variable value that will take the value 0 or 1.
Every group needs at least one observation where value==1.
But groups cannot have more than 2 observations where value==1.
Ideally I can set it so no more than 25% of groups only have one observation where value==1.
library(tidyverse)
set.seed(1)
# sample can break the rules
tibble(group = c(rep("A", 3),
                 rep("B", 6),
                 rep("C", 4),
                 rep("D", 5))) %>%
  group_by(group) %>%
  mutate(value = sample(c(0, 1), n(), replace = TRUE, prob = c(0.8, 0.2)))
One solution would be to create a listing of your unique group labels and shuffle those (here I get the unique group labels via nest). Then depending on whether the group is in the first 25% of rows of the data frame, you can assign either a) a random number between 1 and 2, or b) always 2. Finally, you can use the assigned number to define how 0s and 1s should be sampled for each group, and then unnest the result.
set.seed(0)
# `df` below is the tibble of groups from the question
result <- df %>%
  nest(data = -group) %>%
  .[sample(1:nrow(.), nrow(.)), ] %>% # shuffle the group order
  mutate(
    value_count = ifelse(row_number() / n() <= 0.25, sample(1:2, n(), replace = T), 2)
  ) %>%
  rowwise() %>%
  mutate(
    count = nrow(data),
    value = list(sample(c(rep(1, value_count), rep(0, count - value_count)), count))
  ) %>%
  unnest(value) %>%
  select(-data, -value_count, -count)
group value
<chr> <dbl>
1 B 0
2 B 0
3 B 0
4 B 0
5 B 1
6 B 0
7 A 1
8 A 1
9 A 0
10 D 1
11 D 0
12 D 1
13 D 0
14 D 0
15 C 1
16 C 0
17 C 0
18 C 1
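Whichever variant you use, the constraints are easy to verify directly (a quick sanity check on the result above):
# every group should contain 1 or 2 ones, and at most 25% of groups exactly one
result %>%
  group_by(group) %>%
  summarise(ones = sum(value)) %>%
  summarise(all_ok = all(ones %in% 1:2),
            share_with_one = mean(ones == 1))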
Looks like I was beaten to the punch, but here's another way to do it:
library(tidyverse)
set.seed(1)
# sample can break the rules
x <- tibble(group = c(rep("A", 3),
rep("B", 6),
rep("C", 4),
rep("D", 5)))
# Set var = 1 for the first row of each group and 0 otherwise
xx <- x %>%
  group_by(group) %>%
  mutate(var = row_number()) %>%
  mutate(var = ifelse(var == 1, 1, 0))
pct_with_two <- .75 # percentage of groups with two 1's
samp_size <- floor(length(unique(xx$group)) * pct_with_two) #round down to whole number
addl_one <- sample(unique(xx$group), size = samp_size, replace = F)
xx %>%
  mutate(var2 = case_when(
    group %in% addl_one & row_number() == 2 ~ 1,
    TRUE ~ 0)) %>%
  mutate(var = var + var2) %>%
  select(-var2)
#> # A tibble: 18 x 2
#> # Groups: group [4]
#> group var
#> <chr> <dbl>
#> 1 A 1
#> 2 A 1
#> 3 A 0
#> 4 B 1
#> 5 B 0
#> 6 B 0
#> 7 B 0
#> 8 B 0
#> 9 B 0
#> 10 C 1
#> 11 C 1
#> 12 C 0
#> 13 C 0
#> 14 D 1
#> 15 D 1
#> 16 D 0
#> 17 D 0
#> 18 D 0
Created on 2022-03-11 by the reprex package (v0.3.0)
I have the following data frame ordered by name and time.
set.seed(100)
df <- data.frame(name = c(rep('x', 6), rep('y', 4)),
                 time = c(rep(1, 2), rep(2, 3), 3, 1, 2, 3, 4),
                 score = c(0, sample(1:10, 3), 0, sample(1:10, 2), 0, sample(1:10, 2)))
> df
name time score
1 x 1 0
2 x 1 4
3 x 2 3
4 x 2 5
5 x 2 0
6 x 3 1
7 y 1 5
8 y 2 0
9 y 3 5
10 y 4 8
In df$score there are zeros followed by an unknown number of actual values (e.g. df[1:4, ]), and sometimes df$name changes between two rows with df$score == 0 (e.g. df[6:7, ]).
I want to change df$time where df$score != 0. Specifically, I want to assign the time value of the closest preceding row with df$score == 0, provided df$name matches.
The following code gives the correct output, but my data has millions of rows, so this solution is very inefficient.
score_0 <- append(which(df$score == 0), dim(df)[1] + 1)
for(i in 1:(length(score_0) - 1)) {
df$time[score_0[i]:(score_0[i + 1] - 1)] <-
ifelse(df$name[score_0[i]:(score_0[i + 1] - 1)] == df$name[score_0[i]],
df$time[score_0[i]],
df$time[score_0[i]:(score_0[i + 1] - 1)])
}
> df
name time score
1 x 1 0
2 x 1 4
3 x 1 3
4 x 1 5
5 x 2 0
6 x 2 1
7 y 1 5
8 y 2 0
9 y 2 5
10 y 2 8
Here score_0 gives the indices where df$score == 0. We see that df$time[2:4] are now all equal to 1, and that of df$time[6:7] only the first changed, because the second has df$name == 'y' while the closest preceding row with df$score == 0 has df$name == 'x'. The last two rows have also changed correctly.
You can do it like this:
library(dplyr)
df %>%
  group_by(name) %>%
  mutate(ID = cumsum(score == 0)) %>% # new run ID at every zero
  group_by(name, ID) %>%
  mutate(time = head(time, 1)) %>% # propagate each run's first time
  ungroup() %>%
  select(name, time, score) %>%
  as.data.frame()
# name time score
# 1 x 1 0
# 2 x 1 8
# 3 x 1 10
# 4 x 1 6
# 5 x 2 0
# 6 x 2 5
# 7 y 1 4
# 8 y 2 0
# 9 y 2 5
# 10 y 2 9
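The key step is cumsum(score == 0): every zero starts a new run ID, so each zero is grouped together with the non-zero scores that follow it. A quick illustration on the scores of name 'x' from the question:
# each 0 opens a new run; the following non-zeros inherit its ID
cumsum(c(0, 4, 3, 5, 0, 1) == 0)
#> [1] 1 1 1 1 2 2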
Solution using dplyr and data.table:
library(data.table)
library(dplyr)
df %>%
  mutate(
    chck = score == 0,
    chck_rl = ifelse(score == 0, lead(rleid(chck)), rleid(chck))) %>%
  group_by(name, chck_rl) %>%
  mutate(time = first(time)) %>%
  ungroup() %>%
  select(-chck_rl, -chck)
Output:
# A tibble: 10 x 3
name time score
<chr> <dbl> <int>
1 x 1 0
2 x 1 2
3 x 1 9
4 x 1 7
5 x 2 0
6 x 2 1
7 y 1 8
8 y 2 0
9 y 2 2
10 y 2 3
Solution only using data.table:
library(data.table)
setDT(df)[, chck_rl := ifelse(score == 0, shift(rleid(score == 0), type = "lead"),
rleid(score == 0))][, time := first(time), by = .(name, chck_rl)][, chck_rl := NULL]
Output:
name time score
1: x 1 0
2: x 1 2
3: x 1 9
4: x 1 7
5: x 2 0
6: x 2 1
7: y 1 8
8: y 2 0
9: y 2 2
10: y 2 3
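Since the motivation was millions of rows, it is worth timing both approaches on your real data rather than trusting intuition. A minimal benchmarking sketch (assuming the bench package is installed; results depend entirely on your data):
library(bench)
dplyr_way <- function(d) {
  d %>%
    group_by(name) %>%
    mutate(ID = cumsum(score == 0)) %>%
    group_by(name, ID) %>%
    mutate(time = head(time, 1)) %>%
    ungroup() %>%
    select(name, time, score)
}
dt_way <- function(d) {
  d <- as.data.table(d)
  d[, chck_rl := ifelse(score == 0, shift(rleid(score == 0), type = "lead"),
                        rleid(score == 0))]
  d[, time := first(time), by = .(name, chck_rl)][, chck_rl := NULL]
}
bench::mark(dplyr_way(df), dt_way(df), check = FALSE) # check = FALSE: outputs differ in class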
I have a data frame df:
df <- data.frame(id = c(1, 2, 1, 4, 1, 5, 6),
                 label = c("a", "b", "a", "a", "a", "e", "a"),
                 color = c("g", "a", "g", "g", "a", "a", "a"),
                 threshold = c(12, 10, 12, 12, 12, 35, 40),
                 value = c(32.1, 0, 15.0, 10, 1, 50, 45),
                 stringsAsFactors = FALSE)
The threshold depends on the label.
For each id and its label, I want to count how many times value exceeds its threshold.
Color plays no role in this calculation.
I tried this:
final_df <- df %>%
  mutate(check = if_else(value > threshold, 1, 0)) %>%
  group_by(id, label) %>%
  summarise(exceed = sum(check))
But instead of getting the counts per id, I got overall totals in exceed.
With base R only, use aggregate.
# columns 4 and 5 are threshold and value
aggregate(seq.int(nrow(df)) ~ id + label, df, function(i) sum(df[i, 4] < df[i, 5]))
# id label seq.int(nrow(df))
#1 1 a 2
#2 4 a 0
#3 6 a 1
#4 2 b 0
#5 5 e 1
In order to match the expected output posted in the question, it will take a little extra work.
exceed <- seq.int(nrow(df))
agg <- aggregate(exceed ~ id + label, df, function(i) sum(df[i, 4] < df[i, 5]))
res <- merge(df[1:3], agg)
unique(res)
# id label color exceed
#1 1 a g 2
#3 1 a a 2
#4 2 b a 0
#5 4 a g 0
#6 5 e a 1
#7 6 a a 1
With a small modification of your code:
df %>%
  group_by(id, label) %>%
  mutate(check = if_else(value > threshold, 1, 0)) %>%
  summarise(exceed = sum(check))
id label exceed
<dbl> <chr> <dbl>
1 1 a 2
2 2 b 0
3 4 a 0
4 5 e 1
5 6 a 1
To match the expected output more closely:
df %>%
  group_by(id, label) %>%
  mutate(exceed = sum(if_else(value > threshold, 1, 0))) %>%
  group_by(id, label, color) %>%
  filter(row_number() == 1)
id label color threshold value exceed
<dbl> <chr> <chr> <dbl> <dbl> <dbl>
1 1 a g 12 32.1 2
2 2 b a 10 0 0
3 4 a g 12 10 0
4 1 a a 12 1 2
5 5 e a 35 50 1
6 6 a a 40 45 1
library(dplyr)
df %>%
  group_by(id, label) %>%
  mutate(exceed = sum(value > threshold)) %>%
  slice(1)
id label color threshold value exceed
<dbl> <chr> <chr> <dbl> <dbl> <int>
1 1 a g 12 32.1 2
2 2 b a 10 0 0
3 4 a g 12 10 0
4 5 e a 35 50 1
5 6 a a 40 45 1
If you'd like the output to contain a separate row for each combination of id, label, and color, just add a new group_by before the slice function:
df %>%
  group_by(id, label) %>%
  mutate(exceed = sum(value > threshold)) %>%
  group_by(id, label, color) %>%
  slice(1)
id label color threshold value exceed
<dbl> <chr> <chr> <dbl> <dbl> <int>
1 1 a a 12 1 2
2 1 a g 12 32.1 2
3 2 b a 10 0 0
4 4 a g 12 10 0
5 5 e a 35 50 1
6 6 a a 40 45 1
A little change in your code:
final_df <- df %>%
  mutate(check = if_else(value > threshold, 1, 0)) %>%
  group_by(id, label) %>%
  filter(check == 1)
unique(final_df$id) # ids that exceed their threshold at least once
We could use table and merge:
table_ <- table(subset(df, value > threshold, c("id", "label")))
df2 <- merge(unique(df[c("id", "label", "color")]), table_, all.x = TRUE)
df2$Freq[is.na(df2$Freq)] <- 0
# id label color Freq
# 1 1 a g 2
# 2 1 a a 2
# 3 2 b a 0
# 4 4 a g 0
# 5 5 e a 1
# 6 6 a a 1
I have a data frame containing location (loc), distance within location (dist), value (cumRate), and year (year) columns.
I would like to compare rates between distances, identify which one is higher, and find the year when the rate in one zone becomes higher than in another, as shown below (in year 2, the distance "100" was higher than the distance "npr").
This seems easy, but I don't really know where to start... Thank you for your suggestions!
Dummy data:
loc <- rep(c("a", "b"), each = 6)
dist <- rep(c("npr", "100", "npr", "100"), each = 3)
cumRate <- c(0, 0, 4, 0, 1, 2, 0, 0, 1, 3, 5, 7)
year <- rep(c(1, 2, 3), 4)
df <- data.frame(loc, dist, cumRate, year)
loc dist cumRate year
1 a npr 0 1
2 a npr 0 2
3 a npr 4 3
4 a 100 0 1
5 a 100 1 2
6 a 100 2 3
7 b npr 0 1
8 b npr 0 2
9 b npr 1 3
10 b 100 3 1
11 b 100 5 2
12 b 100 7 3
Plot the data:
library(ggplot2)
windows() # opens a plotting device on Windows; omit on other platforms
ggplot(df, aes(x = year,
               y = cumRate,
               fill = dist,
               colour = dist)) +
  geom_line() +
  theme_bw() +
  facet_grid(. ~ loc)
Desired output
outDf
loc dist year
a 100 2
b 100 1
Here's a method without spreading:
library(dplyr)
df %>%
  group_by(loc, year) %>%
  filter(max(cumRate) != min(cumRate)) %>%
  arrange(loc, year, desc(cumRate)) %>%
  group_by(loc) %>%
  slice(1)
# # A tibble: 2 x 4
# # Groups: loc [2]
# loc dist cumRate year
# <fctr> <fctr> <dbl> <dbl>
# 1 a 100 1 2
# 2 b 100 3 1
First we remove years where there is no variation in cumRate, then we sort the data by location, year, and descending cumRate, and take the first row within each location.
I think you'd need to pivot the dist column to wide format:
library(dplyr)
library(tidyr)
df %>%
  spread(dist, cumRate) %>%
  mutate(higher_dist = case_when(
    `100` > npr ~ '100',
    npr > `100` ~ 'npr',
    TRUE ~ 'equal')
  ) %>%
  filter(npr != `100`) %>%
  group_by(loc) %>%
  arrange(year) %>%
  slice(1)
loc year `100` npr higher_dist
<fct> <dbl> <dbl> <dbl> <chr>
1 a 2 1 0 100
2 b 1 3 0 100
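As a side note, spread is superseded in current tidyr; the same reshaping step can be written with pivot_wider (a drop-in sketch for the pipeline above):
df %>%
  pivot_wider(names_from = dist, values_from = cumRate) %>%
  mutate(higher_dist = case_when(
    `100` > npr ~ '100',
    npr > `100` ~ 'npr',
    TRUE ~ 'equal')) %>%
  filter(npr != `100`) %>%
  group_by(loc) %>%
  arrange(year) %>%
  slice(1)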