How to compile the number of observations that follow a specific pattern? - r

I have a dataset with three variables (DateTime, Transmitter, and timediff). The timediff column is the time difference between subsequent detections of a transmitter. I want to know how many times the time differences followed a specific pattern. Here is a sample of my data.
> dput(Example)
structure(list(DateTime = structure(c(1501117802, 1501117805,
1501117853, 1501117857, 1501117913, 1501117917, 1501186253, 1501186254,
1501186363, 1501186365, 1501186541, 1501186542, 1501186550, 1501186590,
1501186591, 1501186644, 1501186646, 1501186737, 1501186739, 1501187151
), class = c("POSIXct", "POSIXt"), tzone = "GMT"), Transmitter = c(30767L,
30767L, 30767L, 30767L, 30767L, 30767L, 30767L, 30767L, 30767L,
30767L, 30767L, 30767L, 30767L, 30767L, 30767L, 30767L, 30767L,
30767L, 30767L, 30767L), timediff = c(44, 3, 48, 4, 56, 4, 50,
1, 42, 2, 56, 1, 8, 40, 1, 53, 2, 37, 2, 42)), row.names = c(NA,
20L), class = "data.frame")
So looking at the time difference column, I want to know how many times there is a single timediff < 8seconds, how many times there are two subsequent timediff < 8 seconds, how many times there are three subsequent timediff < 8 seconds, and so on.
Example: In the given dataset, a single timediff <8 seconds happens 7 times while two subsequent timediffs < 8 seconds happens twice.
A "single timediff" = 44, 3 , 48
A "double timediff" = 56, 1, 8, 40
In terms of an output, I'd be looking for something like this...
> dput(output)
structure(list(ID = 30767, Single = 7, Double = 2), class = "data.frame", row.names = c(NA,
-1L))
Thanks for the help!

One dplyr possibility could be:
# Count how many runs of consecutive short gaps (timediff <= 8) have each
# length: one output row per run length, with the number of such runs in `n`.
df %>%
  mutate(
    cond = timediff <= 8,
    # Run id: consecutive rows with the same `cond` value share one id.
    rleid = with(rle(cond), rep(seq_along(lengths), lengths))
  ) %>%
  group_by(rleid) %>%
  # Length of the run that each row belongs to.
  mutate(n_timediff = n()) %>%
  # Keep a single representative row per run, and only runs of short gaps.
  filter(cond, row_number() == 1) %>%
  ungroup() %>%
  count(n_timediff)
n_timediff n
<int> <int>
1 1 8
2 2 1
Considering there could be more values in "Transmitter", you can do (this requires also tidyr):
# Per transmitter, count runs of consecutive short gaps (timediff <= 8) by run
# length, then widen so each run length gets its own column
# (timediff_1, timediff_2, ...).
df %>%
  mutate(cond = timediff <= 8) %>%
  # Run ids are computed over the whole table; grouping by Transmitter as well
  # splits any run that would otherwise span two transmitters.
  group_by(
    Transmitter,
    rleid = with(rle(cond), rep(seq_along(lengths), lengths))
  ) %>%
  add_count(rleid, name = "n_timediff") %>%
  # Keep one row per run, and only runs of short gaps.
  filter(cond & row_number() == 1) %>%
  ungroup() %>%
  count(Transmitter, n_timediff) %>%
  # pivot_wider() supersedes spread(). names_sort = TRUE keeps the columns
  # ordered by run length rather than by first appearance, and missing
  # transmitter/run-length combinations are left as NA.
  pivot_wider(
    names_from = n_timediff,
    values_from = n,
    names_prefix = "timediff_",
    names_sort = TRUE
  )
Transmitter timediff_1 timediff_2
<int> <int> <int>
1 30767 8 1

Related

Summarize values every 3 days based on date column

Following is the dput() output of DT. I would like to sum the values every 3 days, starting from the MIN of DATE, group by ID.
structure(list(ID = c("pqr", "abc", "ort", "kkg", "ssc", "ccv",
"xyz", "xyz", "xyz"), DATE = c("2022-06-07", "2022-06-24", "2022-06-02",
"2022-06-01", "2022-06-16", "2022-06-07", "2022-06-11", "2022-06-13", "2022-06-27"
), READING_IN = c(150, 2800, 600, 500, 1395.94, 500, 800, 179, 200
), READING_OUT = c(150, 2800, 600, 500, 1400, 501.4, 371.34,
556.47, 462.75)), class = "data.frame", row.names = c(NA, -9L))
Following is an unsuccessful attempt.
DT$DATE = as.Date(DT$DATE, format = "%Y-%m-%d")
DT1 = DT %>%
group_by(ID, group = cut(as.Date(DT$DATE, format = "%Y-%m-%d"), '3 days')) %>%
summarise(date_range = paste(min(DATE), min(DATE) + 2, sep = ' to '),
sum_in = sum(READING_IN),
sum_out = sum(READING_OUT), .groups = 'drop') %>%
select(-group)
Result:
structure(list(ID = c("abc", "ccv", "kkg", "ort", "pqr", "ssc",
"xyz", "xyz", "xyz"), date_range = c("2022-06-24-2022-06-26",
"2022-06-07-2022-06-09", "2022-06-01-2022-06-03", "2022-06-02-2022-06-04",
"2022-06-07-2022-06-09", "2022-06-16-2022-06-18", "2022-06-11-2022-06-13",
"2022-06-13-2022-06-15", "2022-06-27-2022-06-29"), sum_in = c(2800,
500, 500, 600, 150, 1395.94, 800, 179, 200), sum_out = c(2800,
501.4, 500, 600, 150, 1400, 371.34, 556.47, 462.75)), row.names = c(NA,
-9L), class = c("tbl_df", "tbl", "data.frame"))
Desired Output for ID = xyz:
ID
DATE
READING_IN
READING_OUT
xyz
2022-06-11 to 2022-06-13
979
927.81
xyz
2022-06-27 to 2022-06-29
200
462.75
I understand the issue here is the entry on 2022-06-13, an entry which should be aggregated in 2022-06-11 + 2 window. Is there any way to sum the values every 3 days aligned to desired output format?
I believe you were tricked by some group_by() details:
Computations are always done on the ungrouped data frame. To perform computations on the grouped data, you need to use a separate mutate() step before the group_by().
With extra mutate() + group_by() step it seems to behave like described:
library(tibble)
library(dplyr)
DT %>%
mutate(DATE = as.Date(DATE, format = "%Y-%m-%d")) %>%
group_by(ID) %>%
mutate(date_group = cut(DATE, '3 days')) %>%
group_by(ID, date_group) %>%
summarise(date_range = paste(min(DATE), min(DATE) + 2, sep = ' to '),
sum_in = num(sum(READING_IN), digits = 2),
sum_out = num(sum(READING_OUT),digits = 2), .groups = 'drop') %>%
select(-date_group)
#> # A tibble: 8 × 4
#> ID date_range sum_in sum_out
#> <chr> <chr> <num:.2!> <num:.2!>
#> 1 abc 2022-06-24 to 2022-06-26 2800.00 2800.00
#> 2 ccv 2022-06-07 to 2022-06-09 500.00 501.40
#> 3 kkg 2022-06-01 to 2022-06-03 500.00 500.00
#> 4 ort 2022-06-02 to 2022-06-04 600.00 600.00
#> 5 pqr 2022-06-07 to 2022-06-09 150.00 150.00
#> 6 ssc 2022-06-16 to 2022-06-18 1395.94 1400.00
#> 7 xyz 2022-06-11 to 2022-06-13 979.00 927.81
#> 8 xyz 2022-06-27 to 2022-06-29 200.00 462.75
Input:
DT <- structure(list(
ID = c(
"pqr", "abc", "ort", "kkg", "ssc", "ccv", "xyz", "xyz", "xyz"
),
DATE = c(
"2022-06-07", "2022-06-24", "2022-06-02", "2022-06-01", "2022-06-16",
"2022-06-07", "2022-06-11", "2022-06-13", "2022-06-27"
),
READING_IN = c(150, 2800, 600, 500, 1395.94, 500, 800, 179, 200),
READING_OUT = c(150, 2800, 600, 500, 1400, 501.4, 371.34, 556.47, 462.75)
), class = "data.frame", row.names = c(NA, -9L))
Created on 2023-01-18 with reprex v2.0.2

How to add a new column based on a few other variables

I am new to R and am having trouble creating a new variable using conditions from already existing variables. I have a dataset that has a few columns: Name, Month, Binary for Gender, and Price. I want to create a new variable, Price2, that will:
make the price charged 20 if [the month is 6-9(Jun-Sept) and Gender is 0]
make the price charged 30 if [the month is 6-9(Jun-Sept) and Gender is 1]
make the price charged 0 if [the month is 1-5 (Jan-May) or the month is 10-12 (Oct-Dec)]
--
structure(list(Name = c("ADI", "SLI", "SKL", "SNK", "SIIEL", "DJD"), Mon = c(1, 2, 3, 4, 5, 6), Gender = c(1, NA, NA, NA, 1, NA), Price = c(23, 34, 32, 64, 23, 34)), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"))
Using case_when() from the dplyr package:
mydf$newprice <- dplyr::case_when(
mydf$Mon >= 6 & mydf$Mon <= 9 & mydf$Gender == 0 ~ 20,
mydf$Mon >= 6 & mydf$Mon <= 9 & mydf$Gender == 1 ~ 30,
mydf$Mon < 6 | mydf$Mon > 9 ~ 0)

How to do countif in R based on dates

I have this data in my excel files, and it has so much data to count if I do it in Excel. I want to count how many days in 1 month have a value of more than 50.
I'd like to turn it into something like :
Could someone help me to solve this?
Another option is count with as.yearmon from zoo - filter the rows where 'Value' is greater than 50, then use count after converting to yearmon class with as.yearmon
library(dplyr)
library(zoo)
df %>%
filter(Value > 50) %>%
count(month_year = as.yearmon(Date))
-output
month_year n
1 Jan 2010 3
2 Feb 2010 1
data
df <- structure(list(Date = structure(c(14610, 14611, 14612, 14618,
14618, 14624, 14641), class = "Date"), Value = c(27, 35, 78,
88, 57, 48, 99)), class = "data.frame", row.names = c(NA, -7L
))
Suppose your data is given by
df <- data.frame(Date = as.Date(c("1/1/2010", "1/2/2010", "1/3/2010", "1/9/2010", "1/9/2010", "1/15/2010", "2/1/2010"), "%m/%d/%Y"),
Value = c(27, 35, 78, 88, 57, 48, 99))
To count your specific values you could use
library(dplyr)
df %>%
group_by(month_year = format(Date, "%m-%y")) %>%
summarise(count = sum(Value > 50))
which returns
# A tibble: 2 x 2
month_year count
<chr> <int>
1 01-10 3
2 02-10 1
Note: Your Date column has to contain dates (as in as.Date).

Sliding windows: compare series with all series before/after

I'm fairly new to rolling windows. I'm looking to calculate a function that compares, say, a correlation between a window in the data vs. all windows before/after of the same size. Assume no gaps. I'd like to use a tidyverse-esque approach such as tsibble and/or Davis Vaughan's slider.
# Example data: ten consecutive daily sales values with a POSIXct (UTC) index.
df <- structure(
  list(
    sales = c(2, 4, 6, 2, 8, 10, 9, 3, 5, 2),
    index = structure(
      c(
        1567123200, 1567209600, 1567296000, 1567382400, 1567468800,
        1567555200, 1567641600, 1567728000, 1567814400, 1567900800
      ),
      class = c("POSIXct", "POSIXt"),
      tzone = "UTC"
    )
  ),
  row.names = c(NA, -10L),
  # The class attribute must be a character vector built with c(); a bare
  # parenthesised list of strings does not parse.
  class = c("tbl_df", "tbl", "data.frame")
)
Suppose I want to calculate the Pearson correlation between the first 3 days of the series vs. all previous 3 days windows:
We could create a grouping index with gl for every 3 rows after removing the first 3 rows, then do the cor between the first 3 and each of the blocks of 'sales'
library(dplyr)
n <- 3
df %>%
slice(-seq_len(n)) %>%
group_by(grp = as.integer(gl(n(), n, n()))) %>%
filter(n() == n) %>%
summarise(cor = cor(df$sales[seq_len(n)], sales))
-output
# A tibble: 2 x 2
# grp cor
# <int> <dbl>
#1 1 0.961
#2 2 -0.655
data
df <- data.frame(sales = c(2, 4, 6, 2, 8, 10, 9, 3, 5, 2),
index = seq(as.Date("2019-08-30"), length.out = 10, by = '1 day'))

R: new column based whether categorical levels of another column are the same or different from each other

I am having a problem creating a new column in a data frame, where the column content is defined by whether the levels of a factor in a different column are the same or different, which in turn depends on another two columns.
Basically, I have a bunch of cows with different IDs that can have different parities. The quarter is the udder quarter affected by the disease, and I would like to create a new column with a result that is based on whether the quarters are the same, different, or occurring only once. Any help would be appreciated. Code for an abbreviated data frame is below. The new column is the one I would like to achieve.
AnimalID <- c(10,10,10,10,12,12,12,12,14)
Parity <- c(8,8,9,9,4,4,4,4,2)
Udder_quarter <- c("LH","LH","RH","RH","LH","RH","LF","RF","RF")
new_column <- c("same quarter","same quarter","different quarter","different quarter","different quarter","different quarter","different quarter","different quarter","one quarter")
quarters<- data.frame(AnimalID,Parity,Udder_quarter,new_column)
structure(list(HerdAnimalID = c(100165, 100165, 100327, 100327,
100450, 100450), Parity = c(6, 6, 5, 5, 3, 3), no_parities = c(1,
1, 1, 1, 1, 1), case = c("1pathogen_lact", "1pathogen_lact",
"1pathogen_lact", "1pathogen_lact", "1pathogen_lact", "1pathogen_lact"
), FARM = c(1, 1, 1, 1, 1, 1), `CASE NO` = c("101", "101", "638",
"638", "593", "593"), MASTDATE = structure(c(1085529600, 1087689600,
1097884800, 1101254400, 1106092800, 1106784000), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), QRT = c("LF", "LF", "RH", "LF", "LH",
"LH"), MastitisDiagnosis = c("Corynebacterium spp", "Corynebacterium spp",
"S. uberis", "S. uberis", "Bacillus spp", "Bacillus spp"), PrevCalvDate =
structure(c(1075334400,
1075334400, 1096156800, 1096156800, 1091145600, 1091145600), class =
c("POSIXct",
"POSIXt"), tzone = "UTC")), .Names = c("HerdAnimalID", "Parity",
"no_parities", "case", "FARM", "CASE NO", "MASTDATE", "QRT",
"MastitisDiagnosis", "PrevCalvDate"), row.names = c(NA, -6L), class =
c("tbl_df",
"tbl", "data.frame"))
library(dplyr)
# Label each record by how the affected udder quarters compare within an
# animal/parity combination: "one quarter", "same quarter" or
# "different quarter".
quarters %>%
  group_by(AnimalID) %>%
  # Animals with a single record overall are flagged first; every other row
  # stays NA until the per-parity comparison below.
  mutate(new_column = ifelse(n() == 1, "one quarter", NA)) %>%
  # .add = TRUE keeps the existing AnimalID grouping. The older `add` argument
  # is deprecated, and TRUE is safer than the reassignable shorthand T.
  group_by(Parity, .add = TRUE) %>%
  mutate(new_column = ifelse(
    length(unique(Udder_quarter)) == 1 & is.na(new_column),
    "same quarter",
    ifelse(
      length(unique(Udder_quarter)) > 1,
      "different quarter",
      new_column
    )
  )) %>%
  data.frame()
Output is:
AnimalID Parity Udder_quarter new_column
1 10 8 LH same quarter
2 10 8 LH same quarter
3 10 9 RH same quarter
4 10 9 RH same quarter
5 12 4 LH different quarter
6 12 4 RH different quarter
7 12 4 LF different quarter
8 12 4 RF different quarter
9 14 2 RF one quarter
Sample data:
quarters <- structure(list(AnimalID = c(10, 10, 10, 10, 12, 12, 12, 12, 14
), Parity = c(8, 8, 9, 9, 4, 4, 4, 4, 2), Udder_quarter = structure(c(2L,
2L, 4L, 4L, 2L, 4L, 1L, 3L, 3L), .Label = c("LF", "LH", "RF",
"RH"), class = "factor")), .Names = c("AnimalID", "Parity", "Udder_quarter"
), row.names = c(NA, -9L), class = "data.frame")
I would use ave to do that:
# Describe how the values within one group relate to each other.
# Returns "one" for a single value, "same" when every value equals the first
# one, and "different" otherwise.
f <- function(x) {
  if (length(x) == 1) {
    return("one")
  }
  all_match <- all(x == x[1])
  if (all_match) "same" else "different"
}
ave(Udder_quarter, interaction(AnimalID, Parity), FUN=f)
# [1] "same" "same" "same" "same" "different"
# [6] "different" "different" "different" "one"

Resources