The function works fine on a df containing 1,000 to 20,000 cases, but anything larger takes hours (5+ hours), and right now I have a df that is 57,635,985 observations long.
Suppose a df like this:
d <- structure(list(
  ReviewType = c("Review", "Review", "Review", "Correction", "Correction",
                 "Review", "Review", "Review", "Review", "Review",
                 "Correction", "Correction", "Deficiency", "Correction",
                 "Correction", "Correction", "Deficiency", "Deficiency",
                 "Correction", "Correction", "Deficiency", "Correction"),
  Submissiondate = c("2020-08-29 04:32:00", "2020-08-28 04:31:00",
                     "2020-08-26 04:31:00", "2020-08-25 04:31:00",
                     "2020-08-24 04:31:00", "2020-08-23 04:31:00",
                     "2020-08-22 04:31:00", "2020-08-21 04:31:00",
                     "2020-08-20 04:31:00", "2020-08-19 04:31:00",
                     "2020-09-27 04:31:00", "2020-09-27 03:52:59",
                     "2020-09-28 17:30:00", "2020-09-29 14:01:00",
                     "2020-09-05 03:00:00", "2020-09-05 03:51:00",
                     "2020-09-03 23:59:49", "2020-09-02 00:03:54",
                     "2020-09-01 00:04:48", "2020-10-01 04:31:00",
                     "2020-10-11 04:31:00", "2020-10-21 04:31:00"),
  CaseNo = c("124", "123", "125", "121", "121", "125", "123", "123", "123",
             "123", "123", "123", "123", "125", "123", "123", "123", "124",
             "123", "127", "127", "127")),
  class = "data.frame", row.names = c(NA, -22L))
library(dplyr)
library(lubridate)  # for wday()

d <- d %>% arrange(CaseNo, Submissiondate)
The code below tries to determine, for each case number, whether the status was "Correction" at each week's end; as long as the status stays at "Correction", the case is counted in the stats for all subsequent weeks until the status changes to anything else:
d <- d %>%
  mutate(Submissiondate = as.Date(Submissiondate),
         weekday = wday(Submissiondate),
         week.end = Submissiondate + 7 - weekday)  # the Saturday ending each week
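As a quick sanity check of the week.end arithmetic (a toy date of my own, not part of the original data): wday() returns 1 for Sunday through 7 for Saturday, so each date maps to the Saturday that closes its week.
x <- as.Date("2020-08-26")  # a Wednesday, so wday(x) == 4
x + 7 - wday(x)             # "2020-08-29", the following Saturday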
# 1. End state for each case and week
EndStates <- d %>%
  group_by(CaseNo, week.end) %>%
  summarize(WeekEndState = last(ReviewType)) %>%
  ungroup()

# 2. Get unique values of CaseNo/week.end/ReviewType
chk <- d %>%
  select(CaseNo, week.end, ReviewType) %>%
  distinct()

# 3. Add the EndStates and count if the week had a correction AND
#    the week ended as a Correction
chk <- left_join(chk, EndStates, by = c("CaseNo", "week.end"))

cor_df <- as.data.frame(matrix(ncol = length(unique(chk$CaseNo)) + 1,
                               nrow = length(unique(chk$week.end))))
names(cor_df) <- c("week.end", unique(chk$CaseNo))
cor_df$week.end <- unique(chk$week.end)
for (i in 1:nrow(cor_df)) {
  for (j in 2:ncol(cor_df)) {
    # rows of chk belonging to this case
    this_CaseNo_idx <- chk$CaseNo == strtoi(names(cor_df)[j])
    # this case's most recent week on record, up to and including week i
    idx <- this_CaseNo_idx &
      chk$week.end == max(chk$week.end[chk$week.end <= cor_df$week.end[i] &
                                         this_CaseNo_idx])
    if (sum(idx) < 1) {
      cor_df[i, j] <- 0
    } else {
      cor_df[i, j] <- max(ifelse(chk$WeekEndState[idx] == "Correction" &
                                   chk$ReviewType[idx] == "Correction",
                                 1, 0))
    }
  }
}
cor_df$asw <- rowSums(cor_df[, 2:ncol(cor_df)])
cor_df <- cor_df[order(cor_df$week.end), ]
data.frame(week.end = cor_df$week.end, cor_df$asw)
Expected output:
week.end cor_df.asw
1 2020-08-22 0
2 2020-08-29 1
3 2020-09-05 2
4 2020-10-03 3
5 2020-10-17 2
6 2020-10-24 3
Any guidance appreciated.
I can get you there starting with the EndStates data frame, though I'm not sure whether it will be much faster. Because dplyr operates on whole columns at once (rather than sequentially down a column), I still needed a while() loop to fill in some of the missing weeks. Perhaps a better dplyr person will provide an alternative.
library(dplyr)
library(tidyr)
cor_df2 <- EndStates %>%
  mutate(count = as.numeric(WeekEndState == "Correction")) %>%
  select(-WeekEndState) %>%
  pivot_wider(id_cols = "week.end", names_from = "CaseNo", values_from = "count") %>%
  arrange(week.end) %>%
  # give the earliest week an explicit 0 wherever a case has no record yet
  mutate(across(-week.end,
                function(x) case_when(is.na(x) & week.end == min(week.end) ~ 0,
                                      TRUE ~ x)))

# carry each case's last known state forward, one week per pass
while (any(is.na(cor_df2))) {
  cor_df2 <- cor_df2 %>%
    mutate(across(-week.end,
                  function(x) case_when(is.na(x) ~ lag(x), TRUE ~ x)))
}

cor_df2 <- cor_df2 %>%
  mutate(asw = rowSums(.[-1])) %>%
  select(week.end, asw)
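As an aside (my addition, not part of the original answer): if the while() loop proves slow on the large data, tidyr's fill() should do the same last-observation-carried-forward fill in a single pass, assuming tidyr >= 1.0:
cor_df2 <- cor_df2 %>%
  arrange(week.end) %>%
  fill(everything(), .direction = "down")  # carry last non-NA value down each column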
I have this type of data:
df <- data.frame(
Utt = c(rep("oh", 10), rep("ah", 10)),
name = rep(LETTERS[1:2], 10),
value = c(0.5,2,2,2,2,1,0,1,3.5,1,
2.2,2.3,1.9,0.1,0.3,1.8,3,4,3.5,2)
)
I need to know whether, within each group of Utt and name, there are continuous value increases, and how large these increases are.
EDIT: I've cobbled together this code, which produces the right result but seems convoluted:
library(dplyr)
library(stringr)      # str_c()
library(data.table)   # rleid()

df %>%
  # order by name:
  arrange(name) %>%
  group_by(name, Utt) %>%
  mutate(
    # is there an increase from one value to the next?
    is_increase = ifelse(lag(value) < value, value, NA),
    # what's the difference between these values?
    diff = is_increase - lag(value)) %>%
  # group contiguous runs of increases together:
  group_by(name, Utt, grp = rleid(!is.na(diff))) %>%
  # sum the contiguous values:
  summarise(increase_size = sum(diff, na.rm = TRUE)) %>%
  # remove 0 values:
  filter(!increase_size == 0) %>%
  # put same-group increase_sizes in the same row:
  summarise(increase_size = str_c(increase_size, collapse = ', '))
# A tibble: 3 x 3
# Groups: name [2]
name Utt increase_size
<chr> <chr> <chr>
1 A ah 3.2
2 A oh 1.5, 3.5
3 B ah 3.9
NOTE: Ideally, the expected outcome would be:
1 A ah 3.2
2 A oh 1.5, 3.5
3 B ah 3.9
4 B oh NA
Is there a better (i.e., more concise, more clever) dplyr solution?
Use this function to find what you want.
f <- function(x) {
ind <- which(x > lag(x))
if (length(ind) == 0) {
return(NA)
}
ind2 <- ind[which(lead(ind, default = max(ind) + 2) - ind > 1)]
ind1 <- ind[which(ind - lag(ind, default = min(ind) - 2) > 1)] - 1
return(paste0(x[ind2] - x[ind1], collapse = ", "))
}
And use the function in summarise:
df %>% group_by(name, Utt) %>% summarise(increase = f(value))
Using the tidyverse, my solution was similar to yours. One possible modification is to subset your columns before summing instead of filtering. This keeps all combinations of name and Utt and allows increase_size to end up as NA. Since the increase_size column is of character type, you can convert an empty string to NA.
library(data.table)
library(tidyverse)
df %>%
  arrange(name) %>%
  group_by(name, Utt) %>%
  mutate(diff = c(0, diff(value))) %>%
  group_by(grp = rleid(diff < 0), .add = TRUE) %>%
  summarise(increase_size = sum(diff[diff > 0], na.rm = TRUE)) %>%
  group_by(name, Utt) %>%
  summarise(increase_size = toString(increase_size[increase_size > 0])) %>%
  mutate(increase_size = na_if(increase_size, ""))
Output
name Utt increase_size
<chr> <chr> <chr>
1 A ah 3.2
2 A oh 1.5, 3.5
3 B ah 3.9
4 B oh NA
I have a data frame with COVID data and I'm trying to make a column calculating the number of recovered people based off of the number of positive tests.
My data has a location, a date, and the number of tests administered/positive results/negative results each day. Here's a few lines using one location as an example (the real data has several months worth of dates):
loc date tests pos neg active
spot1 2020-04-10 1 1 0 5
spot1 2020-04-11 2 1 1 6
spot1 2020-04-12 0 0 0 6
spot1 2020-04-13 11 1 10 7
I want to make a new column that cumulatively counts each positive test in each location 14 days after it is recorded. On 2020-04-24, the 5 active cases are no longer active, so I want a recovered column showing 5. For each date, I want the newly inactive cases to be added.
My first thought was to try it in a loop:
df1 <- df %>%
  mutate(date = as.Date(date)) %>%
  group_by(loc) %>%
  mutate(rec = for (i in 1:nrow(df)) {
    # getting number of new cases
    x <- df$pos[i]
    # add 14 days to the date
    d <- df$date + 14
    df$rec <- sum(x)
  })
As you can see, I'm not the best at writing for loops. That gives me a bunch of numbers, but they bear very little meaningful relationship to the data.
I also tried it with map_dbl:
df1 <- df %>%
  mutate(date = as.Date(date)) %>%
  group_by(loc) %>%
  mutate(rec = map_dbl(date, ~ sum(pos[(date <= . + 14) & date >= .])))
This resulted in the same number repeated down the entire rec column.
Any suggestions? (Sorry for the lengthy explanation, just want to make sure this all makes sense)
Your sample data shows that:
- the dates are continuous, with rows present even on days with 0 tests (12 April);
- the active column already looks like a cumulative sum.
Therefore I think you can simply use the lag function with n = 14.
Example code:
df %>% group_by(loc) %>% mutate(recovered = lag(active, 14)) %>% ungroup()
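One caveat (my addition): lag(active, 14) leaves the first 14 rows per location as NA. If you would rather see 0 there, lag() accepts a default argument:
df %>%
  group_by(loc) %>%
  mutate(recovered = lag(active, 14, default = 0)) %>%
  ungroup()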
You could use aggregate to sum the specific column, applying cut to set a 14-day time frame for each sum:
df <- data.frame(loc = rep("spot1", 30),
                 date = seq(as.Date('2020-04-01'), as.Date('2020-04-30'), by = 1),
                 test = 1:30,
                 positive = 1:30,
                 active = 1:30)

output <- aggregate(positive ~ cut(date, "14 days"), df, sum)
output
Console output:
cut(date, "14 days") positive
1 2020-04-01 105
2 2020-04-15 301
3 2020-04-29 59
My solution:
library(dplyr)

date_seq <- seq(as.Date("2020/04/01"), by = "day", length.out = 30)
pos <- rpois(n = 60, lambda = 10)

mydf <- data.frame(loc = c(rep('loc1', 30), rep('loc2', 30)),
                   date = date_seq,
                   pos = pos)
head(mydf)

# sum of positives for `myloc` over the `daysBack` days ending at `max`
getPosSum <- function(max, tbl, myloc, daysBack = 14) {
  max.Date <- as.Date(max)
  sum(tbl %>%
        filter(date >= max.Date - (daysBack - 1) &
                 date <= max.Date & loc == myloc) %>%
        select(pos))
}

result <- mydf %>%
  group_by(date, loc) %>%
  mutate(rec = getPosSum(max = date, tbl = mydf, myloc = loc))
library(tidyverse)
library(lubridate)

data %>%
  mutate(date = as_date(date),
         cut = cut(date, '14 days')) %>%
  group_by(loc) %>%
  arrange(cut) %>%
  mutate(cum_pos = accumulate(pos, `+`))  # accumulate(pos, sum) should also work
As a general rule of thumb, avoid loops, and especially avoid loops inside mutate; that won't work. Instead of map_dbl you should check out purrr::accumulate. R's base library has specialized functions for this, such as cumsum and cummin, but I find their behavior less predictable than purrr's.
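As a quick illustration (a toy vector of my own, not the poster's data), both approaches agree on a simple running total:
library(purrr)

pos <- c(1, 2, 0, 3, 1)
cumsum(pos)           # 1 3 3 6 7
accumulate(pos, `+`)  # 1 3 3 6 7, the same running total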
I want to create jack-knife data partitions for the data frame below, with the partitions to be used in caret::train (like the ones caret::groupKFold() produces). However, the catch is that I want to restrict the test points to, say, Time greater than 16 days, while using the remainder of the data as the training set.
df <- data.frame(Effect = seq(from = 0.05, to = 1, by = 0.05),
                 Time = 1:20)
The reason I want to do this is that I am only really interested in how well the model is predicting the upper bound, as this is the region of interest. I feel like there is a way to do this with the caret::groupKFold() function but I am not sure how. Any help would be greatly appreciated.
An example of what each CV fold would comprise:
TrainSet1 <- subset(df, Time != 16)
TestSet1 <- subset(df, Time == 16)
TrainSet2 <- subset(df, Time != 17)
TestSet2 <- subset(df, Time == 17)
TrainSet3 <- subset(df, Time != 18)
TestSet3 <- subset(df, Time == 18)
TrainSet4 <- subset(df, Time != 19)
TestSet4 <- subset(df, Time == 19)
TrainSet5 <- subset(df, Time != 20)
TestSet5 <- subset(df, Time == 20)
Ideally this would be in the format that the caret::groupKFold function outputs, so that the folds can be fed into the caret::train function:
CVFolds <- caret::groupKFold(df$Time)
CVFolds
Thanks in advance!
For customized folds I find built-in functions are usually not flexible enough, so I usually produce them using the tidyverse. One approach to your problem would be:
library(tidyverse)
df %>%
  mutate(id = row_number()) %>%  # use the row number as a column called id
  filter(Time > 15) %>%          # filter Time as per your need
  split(.$Time) %>%              # split df into a list by Time
  map(~ .x %>% select(id))       # select row numbers for each list element
An example with two rows per Time:
df <- data.frame(Effect = seq(from = 0.025, to = 1, by = 0.025),
                 Time = rep(1:20, each = 2))

df %>%
  mutate(id = row_number()) %>%
  filter(Time > 15) %>%
  split(.$Time) %>%
  map(~ .x %>% select(id)) -> test_folds
test_folds
#output
$`16`
id
1 31
2 32
$`17`
id
3 33
4 34
$`18`
id
5 35
6 36
$`19`
id
7 37
8 38
$`20`
id
9 39
10 40
And with an unequal number of rows per Time:
df <- data.frame(Effect = seq(from = 0.55, to = 1, by = 0.05),
                 Time = c(rep(1, 5), rep(2, 3), rep(3, 2)))

df %>%
  mutate(id = row_number()) %>%
  filter(Time > 1) %>%
  split(.$Time) %>%
  map(~ .x %>% select(id))
$`2`
id
1 6
2 7
3 8
$`3`
id
4 9
5 10
Now you can define these hold out folds inside trainControl with the argument indexOut.
EDIT: to get output similar to caret::groupKFold, one can:
df %>%
  mutate(id = row_number()) %>%
  filter(Time > 1) %>%
  split(.$Time) %>%
  map(~ .x %>%
        select(id) %>%
        unlist %>%
        unname) %>%
  unname
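To feed these into caret::train, something like the following should work (a sketch, untested; cv_folds is a name I'm introducing here for the unnamed list produced above, and trainControl's index/indexOut arguments expect lists of row numbers used for fitting and held out, respectively):
library(caret)

# training rows for each resample = all rows not in that hold-out fold
# (assumption: cv_folds holds the unnamed list of test-row indices built above)
train_folds <- lapply(cv_folds, function(idx) setdiff(seq_len(nrow(df)), idx))

ctrl <- trainControl(method = "cv",
                     index = train_folds,  # rows used for model fitting
                     indexOut = cv_folds)  # rows held out for assessment

# fit <- train(Effect ~ Time, data = df, method = "lm", trControl = ctrl)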
Starting with data containing multiple observations for each group, like this:
set.seed(1)
my.df <- data.frame(
timepoint = rep(c(0, 1, 2), each= 3),
counts = round(rnorm(9, 50, 10), 0)
)
> my.df
timepoint counts
1 0 44
2 0 52
3 0 42
4 1 66
5 1 53
6 1 42
7 2 55
8 2 57
9 2 56
To perform a summary calculation at each timepoint relative to timepoint == 0, for each group I need to pass a vector of counts for timepoint == 0 and a vector of counts for that group (e.g. timepoint == 1) to an arbitrary function, e.g.
NonsenseFunction <- function(x, y){
(mean(x) - mean(y)) / (1 - mean(y))
}
I can get the required output from this table, either with dplyr:
library(dplyr)
my.df %>%
  group_by(timepoint) %>%
  mutate(rep = paste0("r", 1:n())) %>%
  left_join(x = ., y = filter(., timepoint == 0), by = "rep") %>%
  group_by(timepoint.x) %>%
  summarise(result = NonsenseFunction(counts.x, counts.y))
or data.table:
library(data.table)
my.dt <- data.table(my.df)
my.dt[, rep := paste0("r", 1:length(counts)), by = timepoint]
merge(my.dt, my.dt[timepoint == 0], by = "rep", all = TRUE)[
, NonsenseFunction(counts.x, counts.y), by = timepoint.x]
This only works if the number of observations is the same across groups. In any case, the observations aren't really matched, so using the temporary rep variable seems hacky.
For a more general case, where I need to pass vectors of the baseline values and the group's values to an arbitrary (more complicated) function, is there an idiomatic data.table or dplyr way of doing so with a grouped operation for all groups?
Here's the straightforward data.table approach (writing f for your NonsenseFunction):
my.dt[, f(counts, my.dt[timepoint==0, counts]), by=timepoint]
This probably grabs my.dt[timepoint==0, counts] again and again, once per group. You could instead save that value ahead of time:
v = my.dt[timepoint==0, counts]
my.dt[, f(counts, v), by=timepoint]
... or if you don't want to add v to the environment, maybe
with(list(v = my.dt[timepoint==0, counts]),
my.dt[, f(counts, v), by=timepoint]
)
You could pass, as the second argument, the vector from your group of interest as a constant:
my.df %>%
group_by(timepoint) %>%
mutate(response = NonsenseFunction(counts, my.df$counts[my.df$timepoint == 0]))
Or, if you want to create it beforehand:
constant <- my.df$counts[my.df$timepoint == 0]

my.df %>%
  group_by(timepoint) %>%
  mutate(response = NonsenseFunction(counts, constant))
You can try:
library(dplyr)
my.df %>%
  mutate(new = mean(counts[timepoint == 0])) %>%
  group_by(timepoint) %>%
  summarise(result = NonsenseFunction(counts, new))
# A tibble: 3 × 2
# timepoint result
# <dbl> <dbl>
#1 0 0.0000000
#2 1 0.1398601
#3 2 0.2097902
I'm receiving an error, and I believe the root cause is that not all combinations of my grouping variables have values.
Data can be downloaded here: https://opendata.miamidade.gov/311/311-Service-Requests-Miami-Dade-County/dj6j-qg5t
What I want is a function that takes a nested grouping, detects all of the holes, and populates them with zeros. Let's take the following code sample:
d <- rDSamp %>%
  FilterDateRange("Ticket.Created.Date...Time", "1/1/2013", "12/31/2013") %>%
  group_by(Ticket.Created.Date...Time, Case.Owner) %>%
  summarise(count = n()) %>%
  arrange(Ticket.Created.Date...Time)
After the summarise, I need a function that goes through every date and, if a case owner does not appear on that date, creates a row for that owner with a count of 0.
Here is the code to get to this point:
library("ggvis")
library("magrittr")
library("dplyr")
library("tidyr")
library("shiny")
library("checkpoint")
checkpoint("2016-03-29")
rData <- read.csv("C:\\data\\Miami_311.csv",
header=TRUE,
sep=",")
rDSamp <- rData[sample(1:length(rData$Case.Owner), 1000),]
rDSamp <- rData %>%
  subset(Case.Owner == "Animal_Services" |
           Case.Owner == "Waste_Management" |
           Case.Owner == "Community_Information_and_Outreach" |
           Case.Owner == "Waste_Management")
rDSamp$Case.Owner = factor(rDSamp$Case.Owner)
# Convert to a known date-time format
rDSamp$Ticket.Created.Date...Time <-
  rDSamp$Ticket.Created.Date...Time %>%
  as.POSIXct(format = "%m/%d/%Y") %>%
  as.character()
FilterDateRange <- function(data, feature, minDate, maxDate) {
  minDate <- minDate %>%
    as.POSIXct(format = "%m/%d/%Y") %>%
    as.character()
  maxDate <- maxDate %>%
    as.POSIXct(format = "%m/%d/%Y") %>%
    as.character()
  result <- subset(data, data[feature] <= maxDate)
  subset(result, result[feature] >= minDate)
}
d <- rDSamp %>%
  FilterDateRange("Ticket.Created.Date...Time", "1/1/2013", "12/31/2013") %>%
  group_by(Ticket.Created.Date...Time, Case.Owner) %>%
  summarise(count = n()) %>%
  arrange(Ticket.Created.Date...Time)
Finally, I'm trying to use ggvis layer_smooths, and it reports NAs introduced by coercion; my assumption is that the holes in the data are causing this.
I found one solution, but I'm looking for a more generic one...
FillDataHolesWithZeros <- function(input) {
  # dates where fewer owners appear than there are owner levels
  countZero <- input %>%
    group_by(Ticket.Created.Date...Time) %>%
    summarise(count = n()) %>%
    filter(count < length(levels(input$Case.Owner)))
  for (i in 1:nrow(countZero)) {
    date <- countZero[i, ]$Ticket.Created.Date...Time
    departments <- input %>% filter(Ticket.Created.Date...Time == date)
    myLevels <- levels(input$Case.Owner)
    # drop the owners that do appear on this date
    for (j in 1:nrow(departments)) {
      owner <- departments[j, ]$Case.Owner
      myLevels <- myLevels[myLevels != owner]
    }
    print(paste(i, ":", myLevels))
    # append a zero-count row for each missing owner
    for (k in 1:length(myLevels)) {
      input <- input %>% rbind(data.frame(
        Ticket.Created.Date...Time = date,
        Case.Owner = myLevels[k],
        count = 0
      ))
    }
  }
  return(input)
}
Try, for example, the following.
DATA
(For the future: please show reproducible data and a concrete problem.)
Date <- c(rep("2016-01-01", 2), rep("2016-01-02", 3), rep("2016-01-03", 4))
CaseOwner <- c(letters[1:2], letters[1:3], letters[1:4])
CallCount <- 1:9
dat1 <- data.frame(Date, CaseOwner, CallCount)
Group, then complete the missing rows:
library(dplyr)
library(tidyr)
dat1 %>%
  group_by(Date, CaseOwner) %>%
  summarize(cnt = max(CallCount)) %>%
  complete(CaseOwner, fill = list(cnt = 0))
result
Source: local data frame [12 x 3]
Date CaseOwner cnt
(fctr) (fctr) (dbl)
1 2016-01-01 a 1
2 2016-01-01 b 2
3 2016-01-01 c 0
4 2016-01-01 d 0
5 2016-01-02 a 3
6 2016-01-02 b 4
7 2016-01-02 c 5
8 2016-01-02 d 0
9 2016-01-03 a 6
10 2016-01-03 b 7
11 2016-01-03 c 8
12 2016-01-03 d 9
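Applied to your own summarised data, the same idea would look roughly like this (untested against your CSV; the column names are taken from your code, and complete() crosses both variables so that every date/owner pair gets a row):
library(tidyr)

d <- rDSamp %>%
  FilterDateRange("Ticket.Created.Date...Time", "1/1/2013", "12/31/2013") %>%
  group_by(Ticket.Created.Date...Time, Case.Owner) %>%
  summarise(count = n()) %>%
  ungroup() %>%
  complete(Ticket.Created.Date...Time, Case.Owner, fill = list(count = 0)) %>%
  arrange(Ticket.Created.Date...Time)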
additional
1) %in% looks prettier than a chain of |:
rDSamp = rData %>%
subset(
Case.Owner == "Animal_Services" |
Case.Owner == "Waste_Management" |
Case.Owner == "Community_Information_and_Outreach" |
Case.Owner == "Waste_Management")
can be changed to
rDSamp <- rData[rData$Case.Owner %in% c("Animal_Services",
                                        "Waste_Management",
                                        "Community_Information_and_Outreach"), ]
2) If you want to compare dates, you don't need to convert them to character. With
maxDate = maxDate %>%
  as.POSIXct(format = "%m/%d/%Y") %>%
  as.character()
the comparison data[feature] <= maxDate is done on strings rather than dates (it only happens to work because ISO-formatted date strings sort chronologically).
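A small illustration with toy values of my own, comparing Date objects directly:
minDate <- as.Date("1/1/2013", format = "%m/%d/%Y")
maxDate <- as.Date("12/31/2013", format = "%m/%d/%Y")
d <- as.Date("2013-06-15")

d >= minDate & d <= maxDate  # TRUE, compared as dates, no character round-trip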