I have a dataset with dead bird records from field observers.
Death.Date Observer Species Bird.ID
1 03/08/2021 DA MF FC10682
2 15/08/2021 AG MF FC10698
3 12/01/2022 DA MF FC20957
4 09/02/2022 DA MF FC10708
I want to produce a dataset with the number of unique Bird.ID values per month, so I can build a graph from it ("unique" because observers occasionally enter the same bird twice by mistake).
The output in this case would be:
Month Number of dead
08/2021 2
01/2022 1
02/2022 1
The idea is to use the distinct function, but by month (knowing the value is in dd/mm/yyyy date format).
In case your Death.Date column is of character type, first transform it to Date type with dmy()
Change the format to month and year
group_by() and summarise()
library(dplyr)
library(lubridate) # in case your Date is in character format
df %>%
mutate(Death.Date = dmy(Death.Date)) %>% # you may not need this line
mutate(Month = format(Death.Date, "%m/%Y")) %>% # Death.Date is already Date after dmy()
group_by(Month) %>%
summarise(`Number of dead` = n_distinct(Bird.ID)) # counts unique birds, ignoring duplicate entries
Month `Number of dead`
<chr> <int>
1 01/2022 1
2 02/2022 1
3 08/2021 2
For completeness, this can be achieved using aggregate without any additional packages:
df <- data.frame(
Death.Date = c("3/8/2021", "15/08/2021", "12/1/2022", "9/2/2022"),
Observer = c("DA", "AG", "DA", "DA"),
Species = c("MF", "MF", "MF", "MF"),
Bird.ID = c("FC10682", "FC10698", "FC20957", "FC10708")
)
aggregate.data.frame(
x = df["Bird.ID"],
by = list(death_month = format(as.Date(df$Death.Date, "%d/%m/%Y"), "%m/%Y")),
FUN = function(x) {length(unique(x))}
)
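On the sample data this should return something like:
  death_month Bird.ID
1     01/2022       1
2     02/2022       1
3     08/2021       2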
Notes
The anonymous function function(x) {length(unique(x))} provides the count of the unique values.
The format(as.Date(df$Death.Date, "%d/%m/%Y"), "%m/%Y") call ensures that the month/year string is produced.
data.table solution
library(data.table)
library(lubridate)
# Reproducible example with a duplicated bird
deadbirds <- data.table::data.table(Death.Date = c("03/08/2021", "15/08/2021", "12/01/2022", "09/02/2022", "03/08/2021"),
Observer = c("DA", "AG", "DA", "DA", "DA"),
Species = c("MF", "MF", "MF" , "MF", "MF"),
Bird.ID = c("FC10682", "FC10698", "FC20957", "FC10708", "FC10682"))
# Clean dataset - option 1: drop all fully duplicated rows
deadbirds <- base::unique(deadbirds)
# Clean dataset - option 2: keep only the first row per bird (useful when duplicates differ in irrelevant columns)
deadbirds <- deadbirds[
j = .SD[1],
by = c("Bird.ID")
]
# Death.Date as date
deadbirds <- deadbirds[
j = Death.Date := lubridate::dmy(Death.Date)
]
# Create month.Death.Date
deadbirds <- deadbirds[
  j = month.Death.Date := format(Death.Date, "%m/%Y") # zero-padded, e.g. "08/2021"
]
# Count by month
deadbirds <- deadbirds[
  j = .(`Number of dead` = .N),
  by = month.Death.Date]
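For comparison, a more compact sketch that counts unique birds per month in one pass with uniqueN(), assuming deadbirds as freshly created above (Death.Date still character):
library(data.table)
library(lubridate)
# parse the date, bucket by zero-padded month/year, count distinct Bird.ID
deadbirds[, .(`Number of dead` = uniqueN(Bird.ID)),
          by = .(month.Death.Date = format(dmy(Death.Date), "%m/%Y"))]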
A possible solution, based on tidyverse, lubridate and zoo::as.yearmon:
library(tidyverse)
library(lubridate)
library(zoo)
df <- data.frame(
Death.Date = c("3/8/2021", "15/08/2021", "12/1/2022", "9/2/2022"),
Observer = c("DA", "AG", "DA", "DA"),
Species = c("MF", "MF", "MF", "MF"),
Bird.ID = c("FC10682", "FC10698", "FC20957", "FC10708")
)
df %>%
group_by(date = as.yearmon(dmy(Death.Date))) %>%
summarise(nDead = n_distinct(Bird.ID), .groups = "drop")
#> # A tibble: 3 x 2
#> date nDead
#> <yearmon> <int>
#> 1 Aug 2021 2
#> 2 Jan 2022 1
#> 3 Feb 2022 1
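Since the end goal is a graph, here is a minimal ggplot2 sketch on top of that summary (the bar chart is just one choice; as.Date() turns each yearmon into the first day of its month, which keeps the months in order):
library(ggplot2)
df %>%
  group_by(date = as.yearmon(dmy(Death.Date))) %>%
  summarise(nDead = n_distinct(Bird.ID), .groups = "drop") %>%
  ggplot(aes(x = as.Date(date), y = nDead)) +
  geom_col() +
  labs(x = "Month", y = "Number of dead")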
You could use:
as.data.frame(table(format(as.Date(df$Death.Date,'%d/%m/%Y'), '%m/%Y')))
# Var1 Freq
# 1 01/2022 1
# 2 02/2022 1
# 3 08/2021 2
data:
df <- data.frame(
Death.Date = c("3/8/2021", "15/08/2021", "12/1/2022", "9/2/2022"),
Observer = c("DA", "AG", "DA", "DA"),
Species = c("MF", "MF", "MF", "MF"),
Bird.ID = c("FC10682", "FC10698", "FC20957", "FC10708")
)
Related
I have the following example dataset for a biology project.
I want to compute the growth rate of Number between 2nd January and 4th January.
That is rate = (number_at_0104 - number_at_0102)/(number_at_0102)
(In tidyverse if possible)
a <- c("Date", "Specie", "Number")
b <- c("2020-01-01", "Dog", "3")
c <- c("2020-01-02", "Dog", "4")
d <- c("2020-01-03", "Dog", "5")
e <- c("2020-01-04", "Dog", "6")
f <- c("2020-01-01", "Cat", "3")
g <- c("2020-01-02", "Cat", "7")
h <- c("2020-01-03", "Cat", "8")
i <- c("2020-01-04", "Cat", "10")
df <- as.data.frame(rbind(b, c, d, e, f, g, h, i))
names(df) <- a
df$Date <- as.Date(df$Date)
df$Number <- as.integer(df$Number)
I want to calculate a rate of growth. I know this has been treated already, but I'm not sure whether I can apply it here.
Usually we use the lag() function, but I have some questions.
Can we tell the lag function which lag to use (e.g. not the previous period but 4 periods before)?
My dataset is much bigger, and for some species (say cat) I want to compute the growth rate between 20 Feb and 3 March, and for others (say dog) between 4 April and 5 May. How can I do it?
Thank you in advance,
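As a quick aside on the lag() question: dplyr's lag() does accept an n argument, so you can look any number of periods back. A sketch on the example data (n = 2 compares 4th January against 2nd January within each species):
library(dplyr)
df %>%
  group_by(Specie) %>%
  mutate(rate = (Number - lag(Number, n = 2)) / lag(Number, n = 2))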
using dplyr
start <- as.Date("2020-01-02")
end <- as.Date("2020-01-04")
df %>%
filter(Date >= start & Date <= end) %>% # plain comparisons; %between% is data.table, not dplyr
arrange(Date, Species) %>%
group_by(Species) %>%
summarise(Growth = (last(Number) - first(Number)) / first(Number))
output
Species Growth
<fct> <dbl>
1 Cat 0.25
2 Dog 0.5
data
Note that my test data is already stored as dates and numeric values.
df <- data.frame(
Date = rep(seq.Date(as.Date("2020-01-01"), as.Date("2020-01-04"), "days"), 2),
Species = c(rep("Dog", 4), rep("Cat", 4)),
Number = 3:10
)
If you want different lookups for every species, you can do something like this: define your lookups, and the output will be a tibble with Species, Growth, and the period it was taken from.
lookups <- list(
c("Species" = "Dog", "start" = "2020-01-01", "end" = "2020-01-04"),
c("Species" = "Cat", "start" = "2020-01-02", "end" = "2020-01-04")
)
bind_rows(lapply(lookups, function(species) {
df %>%
filter(Species == species["Species"] & Date >= as.Date(species["start"]) & Date <= as.Date(species["end"])) %>%
arrange(Date, Species) %>%
group_by(Species) %>%
summarise(
Growth = (last(Number) - first(Number)) / first(Number),
Start = species["start"],
End = species["end"]
)
}))
# # A tibble: 2 x 4
# Species Growth Start End
# <chr> <dbl> <chr> <chr>
# 1 Dog 1 2020-01-01 2020-01-04
# 2 Cat 0.25 2020-01-02 2020-01-04
Since you want to have different input for different species and different start and end dates, I have created a function to do the job.
Function compute_rate_in_species()
x is a dataframe
species is Species in x, which can be supplied as a vector (e.g. c("Dog", "Cat"))
start_date and end_date are self-explanatory
DF is a logical value indicating whether you want a dataframe as output. If FALSE, a named vector will be the output
If you would like to have different start_date and end_date for different species, you'll have to run the function separately using your desired input.
library(tidyverse)
compute_rate_in_species <- function(x, species, start_date, end_date, DF = TRUE) {
  x <- x %>%
    filter(Species %in% species & Date %in% as.Date(c(start_date, end_date))) %>%
    arrange(Date) %>% # make sure first()/last() follow chronological order
    group_by(Species) %>%
    summarize(Rate = (last(Number) - first(Number)) / first(Number))
  if (DF) {
return(x)
} else {
x_vec <- setNames(x$Rate, x$Species)
return(x_vec)
}
}
Output
# DF = F
compute_rate_in_species(df, c("Dog", "Cat"), "2020-01-01", "2020-01-03", DF = F)
Cat Dog
1.6666667 0.6666667
# DF = T
compute_rate_in_species(df, c("Dog", "Cat"), "2020-01-01", "2020-01-03", DF = T)
# A tibble: 2 x 2
Species Rate
<chr> <dbl>
1 Cat 1.67
2 Dog 0.667
You can use -
library(dplyr)
start_date <- as.Date("2020-01-02")
end_date <- as.Date("2020-01-04")
df %>%
group_by(Specie) %>%
summarise(growth_rate = (Number[match(end_date, Date)] -
Number[match(start_date, Date)])/
Number[match(start_date, Date)])
# Specie growth_rate
# <chr> <dbl>
#1 Cat 0.429
#2 Dog 0.5
You can replace the start_date and end_date with the dates of your choice.
Or maybe a bit verbose but clearer answer would be -
df %>%
group_by(Specie) %>%
summarise(num_end = Number[match(end_date, Date)],
num_start = Number[match(start_date, Date)],
growth_rate = (num_end - num_start)/num_start)
# Specie num_end num_start growth_rate
# <chr> <int> <int> <dbl>
#1 Cat 10 7 0.429
#2 Dog 6 4 0.5
I've tried to create a minimal example, sorry.
Is there a way to speed up the process?
My procedures table has 4M rows. It has been processing for 15 hours and has populated only 1.5M rows.
Maybe using mutate, I don't know.
library(tidyverse)
library(lubridate)
frequencies <- tibble(
id = 1:3,
date_hour_initial = c(
dmy_hms('01/01/2020 13:00:00'),
dmy_hms('01/01/2020 15:00:00'),
dmy_hms('02/01/2020 20:00:00')
),
date_hour_final= c(
dmy_hms('01/01/2020 18:00:00'),
dmy_hms('01/01/2020 22:00:00'),
dmy_hms('03/01/2020 05:00:00')
),
id_person = c("1", "2", "2"),
type_service = c("1", "3", "4")
) %>%
mutate(
intervalo = interval(
date_hour_initial,
date_hour_final
)
)
procedures <- tibble(
id = 1:3,
date_hour = c(
dmy_hms('01/01/2020 17:00:00'),
dmy_hms('01/01/2020 22:00:00'),
dmy_hms('03/01/2020 03:00:00')
),
id_person = c("1", "1", "2")
)
procedures$type_service <- vector(
"character",
nrow(procedures)
)
for(i in 1:nrow(procedures)) {
frequencies %>%
filter(
procedures$date_hour[i] %within% intervalo,
id_person == procedures$id_person[i]
) %>% pull(type_service) %>% unique() -> response
if(length(response) == 1){
procedures$type_service[i] <- response
} else {
procedures$type_service[i] <- NA_character_
}
}
Here's a dplyr solution without using loops:
library(tidyverse)
left_join(frequencies, procedures, by = "id_person") %>%
mutate(type_service = ifelse(date_hour %within% intervalo, type_service.x, NA)) %>%
select(id = id.y, date_hour, id_person, type_service) %>%
group_by(id) %>%
arrange(type_service) %>%
filter(!duplicated(id)) %>%
ungroup() %>%
arrange(id)
#> # A tibble: 3 x 4
#> id date_hour id_person type_service
#> <int> <dttm> <chr> <chr>
#> 1 1 2020-01-01 17:00:00 1 1
#> 2 2 2020-01-01 22:00:00 1 NA
#> 3 3 2020-01-03 03:00:00 2 4
Here is an option using non-equi join in data.table:
procedures[, type_service :=
frequencies[procedures, on=.(id_person, date_hour_initial<=date_hour, date_hour_final>=date_hour),
by=.EACHI, if (length(x.type_service) == 1L) x.type_service]$V1
]
output:
id date_hour id_person type_service
1: 1 2020-01-01 17:00:00 1 1
2: 2 2020-01-01 22:00:00 1 <NA>
3: 3 2020-01-03 03:00:00 2 4
data:
library(data.table)
frequencies <- data.table(id = 1:3,
date_hour_initial = as.POSIXct(c('01/01/2020 13:00:00','01/01/2020 15:00:00','02/01/2020 20:00:00'), format="%d/%m/%Y %T"),
date_hour_final= as.POSIXct(c('01/01/2020 18:00:00','01/01/2020 22:00:00','03/01/2020 05:00:00'), format="%d/%m/%Y %T"),
id_person = c("1", "2", "2"),
type_service = c("1", "3", "4"))
procedures <- data.table(id = 1:3,
date_hour = as.POSIXct(c('01/01/2020 17:00:00','01/01/2020 22:00:00','03/01/2020 03:00:00'), format="%d/%m/%Y %T"),
id_person = c("1", "1", "2"))
My guess is that this will take around a minute for 4 million rows.
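If you want to sanity-check that estimate before running the real thing, here is a rough timing sketch on synthetic data (sizes and distributions are made up):
library(data.table)
set.seed(42)
n_freq <- 5e5; n_proc <- 4e6
frequencies <- data.table(
  id_person = as.character(sample(1e4, n_freq, replace = TRUE)),
  date_hour_initial = as.POSIXct("2020-01-01", tz = "UTC") + runif(n_freq, 0, 3e7)
)
frequencies[, date_hour_final := date_hour_initial + runif(n_freq, 3600, 36000)]
frequencies[, type_service := as.character(sample(4, n_freq, replace = TRUE))]
procedures <- data.table(
  id = seq_len(n_proc),
  id_person = as.character(sample(1e4, n_proc, replace = TRUE)),
  date_hour = as.POSIXct("2020-01-01", tz = "UTC") + runif(n_proc, 0, 3e7)
)
# time the non-equi join from above on the synthetic tables
system.time(
  procedures[, type_service :=
    frequencies[procedures,
                on = .(id_person, date_hour_initial <= date_hour, date_hour_final >= date_hour),
                by = .EACHI, if (length(x.type_service) == 1L) x.type_service]$V1]
)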
Here is a solution using the fuzzyjoin package. The first step is to split the frequencies and procedures data frames by person id; this divides the large problem into many smaller ones. I did not add any error checking to ensure the person ids match between the two data frames.
Once the data frames are split, loop through each person id and use fuzzy_left_join() to match date_hour in procedures against the interval in frequencies:
library(lubridate)
library(dplyr)
#divide and conquer
#split the data frame down to list by person_id
sfreq<-split(frequencies, frequencies$id_person)
sprocedures <- split(procedures, procedures$id_person)
library(fuzzyjoin)
#define function for the matching
matfun<-function(x, y){
x %within% y
}
#define empty answer list
answer<-list()
# loop through all of the split groups
for (id in names(sfreq)) {
  print(id)
  # perform a fuzzy join matching date_hour in procedures against the interval in frequencies
  answer[[id]] <- fuzzy_left_join(sprocedures[[id]], sfreq[[id]], by = c("date_hour" = "intervalo"), match_fun = matfun)
}
#Combine all of the subsets into the final answer
finalanswer<-bind_rows(answer)
I'm quite new to R and I'm facing a problem which I guess is quite easy to fix but I couldn't find the answer.
I have a dataframe called clg where basically I have 3 columns date, X1, X2.
X1 and X2 are name of country teams. X1 and X2 have the same list of countries.
I'm simply trying to count the frequency of each country in the two columns as a total.
So far, I've only been able to count the frequencies in the X1 column, but I haven't found a way to sum both columns.
clt <- as_tibble(na.omit(count(clg, clg$X1)))
I would like to get a data frame where in the first columns I have unique countries, and in the second column the sum of occurrences in X1 + X2.
You can use unlist() and table() to get the overall counts. Wrapping the result in data.frame() gives the desired two-column output.
clg <- data.frame(date=1:3,
X1=c("nor", "swe", "alg"),
X2=c("swe", "alg", "jpn"))
data.frame(table(unlist(clg[c("X1", "X2")])))
# Var1 Freq
# 1 alg 2
# 2 nor 1
# 3 swe 2
# 4 jpn 1
With tidyverse, we can gather into 'long' format and then do the count
library(tidyverse)
gather(clg, key, Var1, -date) %>%
count(Var1)
# A tibble: 4 x 2
# Var1 n
# <chr> <int>
#1 alg 2
#2 jpn 1
#3 nor 1
#4 swe 2
data
clg <- structure(list(date = 1:3, X1 = structure(c(2L, 3L, 1L), .Label = c("alg",
"nor", "swe"), class = "factor"), X2 = structure(c(3L, 1L, 2L
), .Label = c("alg", "jpn", "swe"), class = "factor")),
class = "data.frame", row.names = c(NA,
-3L))
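With current tidyr, gather() is superseded; the same idea works with pivot_longer() — a sketch, assuming the same clg:
library(dplyr)
library(tidyr)
clg %>%
  pivot_longer(c(X1, X2), values_to = "Var1") %>%
  count(Var1)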
You can reach your goal in two steps. First, count the occurrences of each country in each column separately. Then join the two data frames together and calculate the total sum.
X1_sum <- clg %>%
  dplyr::group_by(X1) %>%
  dplyr::summarize(n_x1 = n())
X2_sum <- clg %>%
  dplyr::group_by(X2) %>%
  dplyr::summarize(n_x2 = n())
final_summary <- X1_sum %>%
  # merging the counts by country name
  dplyr::left_join(., X2_sum, by = c("X1" = "X2")) %>%
  dplyr::mutate(n_sum = n_x1 + n_x2)
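One caveat: left_join() keeps only countries that appear in X1. A hedged variant that also counts countries appearing only in X2, using full_join() and coalesce():
final_summary <- X1_sum %>%
  dplyr::full_join(X2_sum, by = c("X1" = "X2")) %>%
  dplyr::mutate(n_sum = dplyr::coalesce(n_x1, 0L) + dplyr::coalesce(n_x2, 0L))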
I am trying to do a count of rows that fall on and between two dates (minimum and maximum) per group. The only caveat is each group has a different pair of dates. See example below.
This is my raw dataset.
raw <- data.frame ("Group" = c("A", "B", "A", "A", "B"), "Date" = c("2017-01-01", "2017-02-02", "2017-09-01", "2017-12-31", "2017-05-09"))
I would like it to return this...
clean <- data.frame ("Group" = c("A", "B"), "Min" = c("2017-01-01", "2017-02-02"), "Max" = c("2017-12-31", "2017-05-09"), "Count" = c(3, 2))
How would I be able to do this? The min and max variables are not crucial, but I would definitely like to know how to do the count variable. Thank you!
Is the date range given, or do you want to calculate it from the data as well? If the latter, this should do it:
require(tidyverse)
raw %>%
mutate(Date = as.Date(Date)) %>%
group_by(Group) %>%
summarise(min_date = min(Date), max_date = max(Date), count = n())
Output:
# A tibble: 2 x 4
Group min_date max_date count
<fct> <date> <date> <int>
1 A 2017-01-01 2017-12-31 3
2 B 2017-02-02 2017-05-09 2
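If instead the date ranges are given rather than derived, one sketch is to hold them in their own small table (the ranges data frame below is hypothetical) and join before counting:
library(dplyr)
ranges <- data.frame(Group = c("A", "B"),
                     Min = as.Date(c("2017-01-01", "2017-02-02")),
                     Max = as.Date(c("2017-12-31", "2017-05-09")))
raw %>%
  mutate(Date = as.Date(Date)) %>%
  inner_join(ranges, by = "Group") %>%
  filter(Date >= Min, Date <= Max) %>%
  group_by(Group, Min, Max) %>%
  summarise(Count = n(), .groups = "drop")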
I have a large dataset with 3 columns: Name, Country, and Sales.
I'd like to sum the Sales column by Names that are both identical and occur consecutively. Then I'd like to remove all rows but the first occurrence of a series, replacing the value of Sales with the series sum.
For example:
Name,Country,Sales
A,V,100
A,W,100
B,X,100
B,Y,100
A,Z,100
Would be reduced to:
Name,Country,Sales
A,V,200
B,X,200
A,Z,100
Anyone got any idea how to do this?
Your data
df <- structure(list(Name = c("A", "A", "B"), Country = c("X", "Y",
"Z"), Sales = c(100L, 100L, 100L)), .Names = c("Name", "Country",
"Sales"), row.names = c(NA, -3L), class = c("data.table", "data.frame"
))
dplyr solution
library(dplyr)
library(data.table)
ans <- df %>%
group_by(rleid(Name)) %>%
summarise(Name = unique(Name), Sales=sum(Sales)) %>%
select(-1)
Output
Name Sales
<chr> <int>
1 A 200
2 B 100
Alternative example
newdf <- rbind(df, data.frame(Name=c("A","A","B","B"),
Country=c("A","B","C","D"),
Sales=c(100,100,100,100)))
ans <- newdf %>%
group_by(rleid(Name)) %>%
summarise(Name = unique(Name), Sales=sum(Sales)) %>%
select(-1)
Output
Name Sales
<fctr> <dbl>
1 A 200
2 B 100
3 A 200
4 B 200
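If you'd rather not load data.table just for rleid(), a plain-dplyr sketch builds the run id with cumsum() (shown on the original df):
library(dplyr)
df %>%
  group_by(run = cumsum(Name != lag(Name, default = first(Name)))) %>%
  summarise(Name = first(Name), Sales = sum(Sales), .groups = "drop") %>%
  select(-run)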
Here's another solution using sqldf:
library(data.table)
df <- fread("Name,Country,Sales
A,V,100
A,W,100
B,X,100
B,Y,100
A,Z,100")
df$rle = rleid(df$Name)
library(sqldf)
sqldf("select min(rowid) as row_names,
Name,
Country,
sum(Sales) as Sales
from df group by rle", row.names = TRUE)
# Name Country Sales
# 1 A V 200
# 3 B X 200
# 5 A Z 100
row.names = TRUE looks for a column named row_names and treats it as row names, so min(rowid) will not show up as a new column when set as row_names.
Try this:
require(dplyr)
df %>%
group_by(Series=rleid(Name)) %>%
mutate(Sales = sum(Sales)) %>%
filter(row_number() == 1)
Output:
Name Country Sales Series
1 A V 200 1
2 B X 200 2
3 A Z 100 3
Sample data:
require(data.table)
df <- fread("Name,Country,Sales
A,V,100
A,W,100
B,X,100
B,Y,100
A,Z,100")