Count the number of occurrences in different timelines - r

I have this kind of data.
library(dplyr)
library(tidyverse)
# Sample data: 20 timestamped readings. as.Date() keeps only the calendar day,
# so each of the five dates appears four times. myval contains zeros and NAs.
df <- tibble(
  mydate = as.Date(c(
    "2019-05-11 23:01:00", "2019-05-11 23:02:00",
    "2019-05-11 23:03:00", "2019-05-11 23:04:00",
    "2019-05-12 23:05:00", "2019-05-12 23:06:00",
    "2019-05-12 23:07:00", "2019-05-12 23:08:00",
    "2019-05-13 23:09:00", "2019-05-13 23:10:00",
    "2019-05-13 23:11:00", "2019-05-13 23:12:00",
    "2019-05-14 23:13:00", "2019-05-14 23:14:00",
    "2019-05-14 23:15:00", "2019-05-14 23:16:00",
    "2019-05-15 23:17:00", "2019-05-15 23:18:00",
    "2019-05-15 23:19:00", "2019-05-15 23:20:00"
  )),
  myval = c(
    0, NA, 1500, 1500,
    1500, 1500, NA, 0,
    0, 0, 1100, 1100,
    1100, 0, 200, 200,
    1100, 1100, 1100, 0
  )
)
I want to divide each value by the number of times it appears. But if a value (e.g. 1100) appears, is then interrupted by another number (or NA), and later re-appears, I want each run to be counted separately.
# Treat readings in the closed interval [0, 1] as missing.
df$myval <- replace(df$myval, which(df$myval >= 0 & df$myval <= 1), NA)
# Group by the value itself and divide each value by its group size.
# sum(myval == myval) is the number of rows in the group (NA for the NA group);
# it counts every row holding that value, whether the rows are adjacent or not.
df <- df %>%
  group_by(myval) %>%
  mutate(
    counts = sum(myval == myval),
    result = myval / counts
  )
Right now the result is:
mydate myval counts result
<date> <dbl> <int> <dbl>
1 2019-05-11 NA NA NA
2 2019-05-11 NA NA NA
3 2019-05-11 1500 4 375
4 2019-05-11 1500 4 375
5 2019-05-12 1500 4 375
6 2019-05-12 1500 4 375
7 2019-05-12 NA NA NA
8 2019-05-12 NA NA NA
9 2019-05-13 NA NA NA
10 2019-05-13 NA NA NA
11 2019-05-13 1100 6 183.
12 2019-05-13 1100 6 183.
13 2019-05-14 1100 6 183.
14 2019-05-14 NA NA NA
15 2019-05-14 200 2 100
16 2019-05-14 200 2 100
17 2019-05-15 1100 6 183.
18 2019-05-15 1100 6 183.
19 2019-05-15 1100 6 183.
20 2019-05-15 NA NA NA
But as you can see, the value 1100, which appears in two separate runs, is counted 6 times.
I want to count it 3 times and then again 3 times.
So, for example value 1500 appears 4 times, so I divide 1500/4.
1100 should be divided by 3 and then again by 3.

You can do that using run-length encoding, which records each run of consecutive identical values together with its length (the count restarts whenever the value changes).
# Run-length encode myval: `lengths` is how many times each value repeats
# consecutively and `values` is the repeated value. with() exposes those two
# components by name using base R only, so no package beyond the ones already
# attached is needed (the exposition pipe %$% lives in magrittr and is not made
# available by dplyr or tidyverse).
with(
  rle(df$myval),
  tibble(
    rle = lengths,       # length of each run
    myval = values,      # value repeated in the run
    avg = values / rle   # value divided by its own run length
  )
)
# A tibble: 10 x 3
# rle myval avg
# <int> <dbl> <dbl>
# 1 1 0 0
# 2 1 NA NA
# 3 4 1500 375
# 4 1 NA NA
# 5 3 0 0
# 6 3 1100 367.
# 7 1 0 0
# 8 2 200 100
# 9 3 1100 367.
# 10 1 0 0

Related

Using lag function to find the last value for a specific individual

I'm trying to create a column in my spreadsheet that takes the last recorded value (IC) for a specific individual (by the Datetime column) and populates it into a column (LIC) for the current event.
A sub-sample of my data looks like this (actual dataset has 4949 rows and 37 individuals):
> head(ACdatas.scale)
Date Datetime ID.2 IC LIC
1 2019-05-25 2019-05-25 11:57 139 High NA
2 2019-06-09 2019-06-09 19:42 139 Low NA
3 2019-07-05 2019-07-05 20:12 139 Medium NA
4 2019-07-27 2019-07-27 17:27 152 Low NA
5 2019-08-04 2019-08-04 9:13 152 Medium NA
6 2019-08-04 2019-08-04 16:18 139 Medium NA
I would like to be able to populate the last value from the IC column into the current LIC column for the current event (see below)
> head(ACdatas.scale)
Date Datetime ID.2 IC LIC
1 2019-05-25 2019-05-25 11:57 139 High NA
2 2019-06-09 2019-06-09 19:42 139 Low High
3 2019-07-05 2019-07-05 20:12 139 Medium Low
4 2019-07-27 2019-07-27 17:27 152 Low NA
5 2019-08-04 2019-08-04 9:13 152 Medium Low
6 2019-08-04 2019-08-04 16:18 139 Medium Medium
I've tried the following code:
# Sort the events by individual and time, then within each ID.2 group take the
# previous row's IC as LIC (the first row of every group gets NA).
ACdatas.scale <- ACdatas.scale %>%
  arrange(ID.2, Datetime) %>%
  group_by(ID.2) %>%
  mutate(LIC = lag(IC, n = 1L))
This worked some of the time, but when I checked back through the data, it seemed to have a problem when the date switched, so it could accurately populate the field within the same day, but not when the previous event was on the previous day. Just to make it super confusing, it only had issues with some of the day switches, and not all! Help please!!
Sample data,
# Two individuals ("A", "B") with five consecutive IC readings each.
dat <- data.frame(
  id = rep(c("A", "B"), each = 5),
  IC = c(1:5, 11:15)
)
dplyr
library(dplyr)
# Previous IC within each id; ungroup() so the printed result carries no
# grouping.
dat %>%
  group_by(id) %>%
  mutate(LIC = lag(IC, n = 1L)) %>%
  ungroup()
# # A tibble: 10 x 3
# id IC LIC
# <chr> <int> <int>
# 1 A 1 NA
# 2 A 2 1
# 3 A 3 2
# 4 A 4 3
# 5 A 5 4
# 6 B 11 NA
# 7 B 12 11
# 8 B 13 12
# 9 B 14 13
# 10 B 15 14
data.table
library(data.table)
# Convert dat to a data.table and add LIC by reference as the one-step lag of
# IC within each id; the trailing [] prints the result.
as.data.table(dat)[, LIC := shift(IC, n = 1L, type = "lag"), by = id][]
# id IC LIC
# <char> <int> <int>
# 1: A 1 NA
# 2: A 2 1
# 3: A 3 2
# 4: A 4 3
# 5: A 5 4
# 6: B 11 NA
# 7: B 12 11
# 8: B 13 12
# 9: B 14 13
# 10: B 15 14
base R
# Base R: within each id, shift IC down by one position (NA goes first and the
# last value of the group is dropped).
dat$LIC <- ave(
  dat$IC,
  dat$id,
  FUN = function(z) c(NA, head(z, -1L))
)
dat
# id IC LIC
# 1 A 1 NA
# 2 A 2 1
# 3 A 3 2
# 4 A 4 3
# 5 A 5 4
# 6 B 11 NA
# 7 B 12 11
# 8 B 13 12
# 9 B 14 13
# 10 B 15 14
By using your data:
# The six sample rows from the question, rebuilt as a data.frame with a Date
# column, a POSIXct Datetime column and an all-NA LIC placeholder.
mydat <- structure(
  list(
    Date = structure(
      c(18041, 18056, 18082, 18104, 18112, 18112),
      class = "Date"
    ),
    Datetime = structure(
      c(
        1558760220, 1560084120, 1562332320,
        1564223220, 1564884780, 1564910280
      ),
      class = c("POSIXct", "POSIXt"),
      tzone = ""
    ),
    ID.2 = c(139, 139, 139, 152, 152, 139),
    IC = c("High", "Low", "Medium", "Low", "Medium", "Medium"),
    LIC = rep(NA, 6L)
  ),
  row.names = c(NA, -6L),
  class = "data.frame"
)
# Sort chronologically, then take the previous IC of the same individual.
mydat %>%
  arrange(Datetime) %>%
  group_by(ID.2) %>%
  mutate(LIC = lag(IC))
# A tibble: 6 x 5
# Groups: ID.2 [2]
Date Datetime ID.2 IC LIC
<date> <dttm> <dbl> <chr> <chr>
1 2019-05-25 2019-05-25 11:57:00 139 High NA
2 2019-06-09 2019-06-09 19:42:00 139 Low High
3 2019-07-05 2019-07-05 20:12:00 139 Medium Low
4 2019-07-27 2019-07-27 17:27:00 152 Low NA
5 2019-08-04 2019-08-04 09:13:00 152 Medium Low
6 2019-08-04 2019-08-04 16:18:00 139 Medium Medium

How to format a number as a percentage and limit number of decimal places

The sample data, manipulations, and such are below. My issue concerns the 4th portion, the one that uses the lag function. The desired result would be to have the empprevyearpct (yes, I know it is actually a qtr) value for area 001 and smb 1 show up as 4.35% instead of .04347826. I have been trying to do so using scales but can't figure out how to limit the number of digits to the right of the decimal point and also format the value as a percent.
library(readxl)
library(dplyr)
library(data.table)
library(odbc)
library(DBI)
library(stringr)
# ---- Sample data: 28 worksites (2 areas x 2 quarters x 7 rows) ----
employment <- c(
  1, 45, 125, 130, 165, 260, 600,
  2, 46, 127, 132, 167, 265, 601,
  50, 61, 110, 121, 170, 305, 55,
  603, 66, 112, 123, 172, 310, 604
)
small <- c(
  1, 1, 2, 2, 3, 4, NA,
  1, 1, 2, 2, 3, 4, NA,
  1, 1, 2, 2, 3, 4, NA,
  1, 1, 2, 2, 3, 4, NA
)
area <- c(
  001, 001, 001, 001, 001, 001, 001,
  001, 001, 001, 001, 001, 001, 001,
  003, 003, 003, 003, 003, 003, 003,
  003, 003, 003, 003, 003, 003, 003
)
year <- c(
  2020, 2020, 2020, 2020, 2020, 2020, 2020,
  2020, 2020, 2020, 2020, 2020, 2020, 2020,
  2020, 2020, 2020, 2020, 2020, 2020, 2020,
  2020, 2020, 2020, 2020, 2020, 2020, 2020
)
qtr <- c(
  1, 1, 1, 1, 1, 1, 1,
  2, 2, 2, 2, 2, 2, 2,
  1, 1, 1, 1, 1, 1, 1,
  2, 2, 2, 2, 2, 2, 2
)
smbtest <- data.frame(employment, small, area, year, qtr)

# ---- Size class: first matching bracket wins; 500 and above stays NA ----
smbtest <- smbtest %>%
  mutate(
    smb = case_when(
      employment >= 0 & employment < 100 ~ "1",
      employment >= 0 & employment < 150 ~ "2",
      employment >= 0 & employment < 250 ~ "3",
      employment >= 0 & employment < 500 ~ "4"
    )
  )

# ---- Employment and worksite counts per area/period/size class ----
# .groups = "drop_last" leaves the result grouped by area and period, so the
# cumulative sums below run across size classes within each area/period.
smbsummary2 <- smbtest %>%
  mutate(period = paste0(year, "q", qtr)) %>%
  select(area, period, employment, smb) %>%
  group_by(area, period, smb) %>%
  summarise(
    employment = sum(employment),
    worksites = n(),
    .groups = "drop_last"
  ) %>%
  mutate(
    employment = cumsum(employment),
    worksites = cumsum(worksites)
  )

# ---- Change versus the previous period of the same area and size class ----
smbsummary2 <- smbsummary2 %>%
  group_by(area, smb) %>%
  mutate(
    empprevyear = lag(employment),
    empprevyearpp = employment - empprevyear,
    empprevyearpct = (employment / empprevyear) - 1
  )

# ---- One row per measure, one column per period, rendered with gt ----
# tidyr and gt are not among the packages attached for this script, so their
# functions are called with an explicit namespace.
smblonger2 <- smbsummary2 %>%
  ungroup() %>%
  tidyr::pivot_longer(
    cols = employment:worksites,
    names_to = "measure",
    values_to = "value"
  ) %>%
  group_by(area, measure) %>%
  tidyr::pivot_wider(names_from = period, values_from = value) %>%
  gt::gt()
You can try percent() from scales package. You can also set digits as follows: percent(x, accuracy = 0.01). Note: 0.1 = 1 digit, 0.01 = 2 digits, 0.001 = 3 digits and so on.
You could use scales::percent()
# Change versus the previous period of the same area and size class, with the
# relative change formatted as a percentage string.
smbsummary2 <- smbsummary2 %>%
  group_by(area, smb) %>%
  mutate(
    empprevyear = lag(employment),
    empprevyearpp = employment - empprevyear,
    # accuracy = 0.01 keeps two decimal places in the percentage, so
    # 0.04347826 is rendered as "4.35%".
    empprevyearpct = scales::percent(
      (employment / empprevyear) - 1,
      accuracy = 0.01
    )
  )
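Output (produced with percent()'s default rounding; with accuracy = 0.01 the first value is shown as 4.35%):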
area period smb employment worksites empprevyear empprevyearpp empprevyearpct
<dbl> <chr> <chr> <dbl> <int> <dbl> <dbl> <chr>
1 1 2020q1 1 46 2 NA NA NA
2 1 2020q1 2 301 4 NA NA NA
3 1 2020q1 3 466 5 NA NA NA
4 1 2020q1 4 726 6 NA NA NA
5 1 2020q1 NA 1326 7 NA NA NA
6 1 2020q2 1 48 2 46 2 4%
7 1 2020q2 2 307 4 301 6 2%
8 1 2020q2 3 474 5 466 8 2%
9 1 2020q2 4 739 6 726 13 2%
10 1 2020q2 NA 1340 7 1326 14 1%
11 3 2020q1 1 166 3 NA NA NA
12 3 2020q1 2 397 5 NA NA NA
13 3 2020q1 3 567 6 NA NA NA
14 3 2020q1 4 872 7 NA NA NA
15 3 2020q2 1 66 1 166 -100 -60%
16 3 2020q2 2 301 3 397 -96 -24%
17 3 2020q2 3 473 4 567 -94 -17%
18 3 2020q2 4 783 5 872 -89 -10%
19 3 2020q2 NA 1990 7 NA NA NA

is if na restart count

This is the same as this question, but I want to preserve the date. Please read that first.
library(dplyr)
library(tidyverse)
# Sample data: 20 timestamped readings. as.Date() keeps only the calendar day,
# so each of the five dates appears four times. myval contains zeros and NAs.
df <- tibble(
  mydate = as.Date(c(
    "2019-05-11 23:01:00", "2019-05-11 23:02:00",
    "2019-05-11 23:03:00", "2019-05-11 23:04:00",
    "2019-05-12 23:05:00", "2019-05-12 23:06:00",
    "2019-05-12 23:07:00", "2019-05-12 23:08:00",
    "2019-05-13 23:09:00", "2019-05-13 23:10:00",
    "2019-05-13 23:11:00", "2019-05-13 23:12:00",
    "2019-05-14 23:13:00", "2019-05-14 23:14:00",
    "2019-05-14 23:15:00", "2019-05-14 23:16:00",
    "2019-05-15 23:17:00", "2019-05-15 23:18:00",
    "2019-05-15 23:19:00", "2019-05-15 23:20:00"
  )),
  myval = c(
    0, NA, 1500, 1500,
    1500, 1500, NA, 0,
    0, 0, 1100, 1100,
    1100, 0, 200, 200,
    1100, 1100, 1100, 0
  )
)
# Treat readings in the closed interval [0, 1] as missing.
df$myval <- replace(df$myval, which(df$myval >= 0 & df$myval <= 1), NA)
# Group by the value itself and divide each value by its group size.
# sum(myval == myval) is the number of rows in the group (NA for the NA group);
# it counts every row holding that value, whether the rows are adjacent or not.
df <- df %>%
  group_by(myval) %>%
  mutate(
    counts = sum(myval == myval),
    result = myval / counts
  )
Right now the result is:
mydate myval counts result
<date> <dbl> <int> <dbl>
1 2019-05-11 NA NA NA
2 2019-05-11 NA NA NA
3 2019-05-11 1500 4 375
4 2019-05-11 1500 4 375
5 2019-05-12 1500 4 375
6 2019-05-12 1500 4 375
7 2019-05-12 NA NA NA
8 2019-05-12 NA NA NA
9 2019-05-13 NA NA NA
10 2019-05-13 NA NA NA
11 2019-05-13 1100 6 183.
12 2019-05-13 1100 6 183.
13 2019-05-14 1100 6 183.
14 2019-05-14 NA NA NA
15 2019-05-14 200 2 100
16 2019-05-14 200 2 100
17 2019-05-15 1100 6 183.
18 2019-05-15 1100 6 183.
19 2019-05-15 1100 6 183.
20 2019-05-15 NA NA NA
I want to preserve the above dataframe, with the dates column and the correct result.
I need somehow to restart the counting if after/before a value a NA exists.
So, for 1100 , I must have count 3 two times and not count 6.
You can create groups with data.table rleid :
library(dplyr)
# rleid() gives every run of consecutive identical myval values (NA runs
# included) its own id; grouping on that id makes n() the run length.
df %>%
  group_by(
    grp = data.table::rleid(myval)
  ) %>%
  mutate(
    counts = n(),
    result = myval / counts
  )
# mydate myval grp counts result
# <date> <dbl> <int> <int> <dbl>
# 1 2019-05-11 NA 1 2 NA
# 2 2019-05-11 NA 1 2 NA
# 3 2019-05-11 1500 2 4 375
# 4 2019-05-11 1500 2 4 375
# 5 2019-05-12 1500 2 4 375
# 6 2019-05-12 1500 2 4 375
# 7 2019-05-12 NA 3 4 NA
# 8 2019-05-12 NA 3 4 NA
# 9 2019-05-13 NA 3 4 NA
#10 2019-05-13 NA 3 4 NA
#11 2019-05-13 1100 4 3 367.
#12 2019-05-13 1100 4 3 367.
#13 2019-05-14 1100 4 3 367.
#14 2019-05-14 NA 5 1 NA
#15 2019-05-14 200 6 2 100
#16 2019-05-14 200 6 2 100
#17 2019-05-15 1100 7 3 367.
#18 2019-05-15 1100 7 3 367.
#19 2019-05-15 1100 7 3 367.
#20 2019-05-15 NA 8 1 NA
With data.table
library(data.table)
# Convert df to a data.table by reference, then add the run length and the
# value divided by it, both by reference.
setDT(df)
df[, counts := .N, by = rleid(myval)]
df[, result := myval / counts]

average in previous group at the same place with another column

I have some data and I am dividing the mdo value by the count number of mdo instances in the previous group.
I am calculating the sog avg also.
But I want to calculate the sog avg that takes place to the same instances as the result (mdo/count) value.
library(dplyr)
library(lubridate)
library(purrr)
# Sample data: 25 timestamped rows (as.Date() keeps only the calendar day)
# with an mdo reading that contains zeros and NAs and an sog reading.
df <- tibble(
  mydate = as.Date(c(
    "2019-05-11 23:01:00", "2019-05-11 23:02:00",
    "2019-05-11 23:03:00", "2019-05-11 23:04:00",
    "2019-05-12 23:05:00", "2019-05-12 23:06:00",
    "2019-05-12 23:07:00", "2019-05-12 23:08:00",
    "2019-05-13 23:09:00", "2019-05-13 23:10:00",
    "2019-05-13 23:11:00", "2019-05-13 23:12:00",
    "2019-05-14 23:13:00", "2019-05-14 23:14:00",
    "2019-05-14 23:15:00", "2019-05-14 23:16:00",
    "2019-05-15 23:17:00", "2019-05-15 23:18:00",
    "2019-05-15 23:19:00", "2019-05-15 23:20:00",
    "2019-05-15 23:21:00", "2019-05-15 23:22:00",
    "2019-05-15 23:23:00", "2019-05-15 23:24:00",
    "2019-05-15 23:25:00"
  )),
  mdo = c(
    1500, 1500, 1500, 1500, 1500,
    1500, NA, 0, 0, 0,
    900, 900, NA, NA, 1100,
    1100, 1100, 200, 200, 200,
    200, 1100, 1100, 1100, 0
  ),
  sog = c(
    12, 12, 12, 11, 10,
    9, 2, 8.8, 8.7, 7.8,
    11, 11, 12, 11, 9.54,
    9.8, 10.4, 4, 4, 4.5,
    3.6, 7, 8, 9, 0
  )
)
# Give each run of consecutive identical mdo values its own group id.
df1 <- df %>%
  mutate(grp = data.table::rleid(mdo))

# Attach to every row the size of the previous non-NA group.
df1 <- df1 %>%
  # Keep only non-NA values
  filter(!is.na(mdo)) %>%
  # Count the occurrences of each grp
  count(grp, name = "count") %>%
  # Shift the count so each group sees the previous group's count
  mutate(count = lag(count)) %>%
  # Join with the original data (NA groups come back with count = NA)
  right_join(df1, by = "grp") %>%
  arrange(grp)

# mdo value of the previous group, carried forward over gaps.
# lag() takes no na.rm argument (recent dplyr versions reject unknown
# arguments), so it is called with the vector only.
group_mdo <- df1 %>%
  select(grp, mdo) %>%
  unique() %>%
  mutate(prev_mdo = lag(mdo)) %>%
  select(-mdo) %>%
  tidyr::fill(prev_mdo, .direction = "down")

# result: mdo divided by the previous group's count, or 0 when the previous
# mdo is 0. sog_avg: mean sog of the group with id grp - 1, or NA when the
# previous mdo is 0.
df1 <- df1 %>%
  left_join(group_mdo, by = "grp") %>%
  mutate(result = ifelse(prev_mdo != 0, mdo / count, 0)) %>%
  mutate(
    sog_avg = ifelse(
      prev_mdo != 0,
      map_dbl(.x = grp - 1, ~ mean(sog[grp == .x], na.rm = TRUE)),
      NA
    )
  )
The result right now is:
grp count mydate mdo sog prev_mdo result sog_avg
1 NA 2019-05-11 1500 12 NA NA NA
1 NA 2019-05-11 1500 12 NA NA NA
1 NA 2019-05-11 1500 12 NA NA NA
1 NA 2019-05-11 1500 11 NA NA NA
1 NA 2019-05-12 1500 10 NA NA NA
1 NA 2019-05-12 1500 9 NA NA NA
2 NA 2019-05-12 NA 2 1500 NA 11
3 6 2019-05-12 0 8.8 1500 0 2
3 6 2019-05-13 0 8.7 1500 0 2
3 6 2019-05-13 0 7.8 1500 0 2
4 3 2019-05-13 900 11 0 0 NA
4 3 2019-05-13 900 11 0 0 NA
5 NA 2019-05-14 NA 12 900 NA 11
5 NA 2019-05-14 NA 11 900 NA 11
6 2 2019-05-14 1100 9.54 900 550 11.5
6 2 2019-05-14 1100 9.8 900 550 11.5
6 2 2019-05-15 1100 10.4 900 550 11.5
7 3 2019-05-15 200 4 1100 66.7 9.91
7 3 2019-05-15 200 4 1100 66.7 9.91
7 3 2019-05-15 200 4.5 1100 66.7 9.91
7 3 2019-05-15 200 3.6 1100 66.7 9.91
8 4 2019-05-15 1100 7 200 275 4.03
8 4 2019-05-15 1100 8 200 275 4.03
8 4 2019-05-15 1100 9 200 275 4.03
9 3 2019-05-15 0 0 1100 0 8
My desired result:
grp count mydate mdo sog prev_mdo result sog_avg
1 NA 2019-05-11 1500 12 NA NA NA
1 NA 2019-05-11 1500 12 NA NA NA
1 NA 2019-05-11 1500 12 NA NA NA
1 NA 2019-05-11 1500 11 NA NA NA
1 NA 2019-05-12 1500 10 NA NA NA
1 NA 2019-05-12 1500 9 NA NA NA
2 NA 2019-05-12 NA 2 1500 NA NA
3 6 2019-05-12 0 8.8 1500 0 0
3 6 2019-05-13 0 8.7 1500 0 0
3 6 2019-05-13 0 7.8 1500 0 0
4 3 2019-05-13 900 11 0 0 0
4 3 2019-05-13 900 11 0 0 0
5 NA 2019-05-14 NA 12 900 NA NA
5 NA 2019-05-14 NA 11 900 NA NA
6 2 2019-05-14 1100 9.54 900 550 11
6 2 2019-05-14 1100 9.8 900 550 11
6 2 2019-05-15 1100 10.4 900 550 11
7 3 2019-05-15 200 4 1100 66.7 9.91
7 3 2019-05-15 200 4 1100 66.7 9.91
7 3 2019-05-15 200 4.5 1100 66.7 9.91
7 3 2019-05-15 200 3.6 1100 66.7 9.91
8 4 2019-05-15 1100 7 200 275 4.03
8 4 2019-05-15 1100 8 200 275 4.03
8 4 2019-05-15 1100 9 200 275 4.03
9 3 2019-05-15 0 0 1100 0 0
Where result is zero, sog_avg should be zero, where result is na, sog avg should be na.
And where result is computed using the previous group's counts, sog avg should be computed from that previous group's values.
So, for example:
mdo = 1100 , result is 550 because counts in previous non null group are 2 (mdo value 900).
1100 / 2 = 550 . At this point sog avg should be (11 + 11) / 2 = 11 because counts were 2 in the previous non null group.
Here is a data.table approach. It extensively uses the idea of making groups by using base table or tapply and then lags those results. Note, this answer would fail if mdo is not constant throughout a group.
library(data.table)
# Work on a data.table copy of df.
dt = as.data.table(df)
# Run id: consecutive identical mdo values share one grp.
dt[, grp := rleid(mdo)]
# count: for rows with a non-NA mdo, the number of rows of the previous
# non-NA group. table(grp) gives the rows per group, shift() lags those counts
# by one group, and rep() repeats each lagged count once per row of its group.
dt[!is.na(mdo),
count := {
cnt = table(grp)
rep(shift(cnt), cnt)
}
]
# Put the columns in the order used by the expected output.
setcolorder(dt, c("grp", "count", "mydate", "mdo", "sog"))
# prev_mdo: mdo of the previous group. cumsum(ord) indexes the last row of
# each group, shift() lags those values by one group, rep() expands them back
# to row level, and nafill(..., "locf") carries the last non-NA value forward.
dt[,
prev_mdo := {
ord = table(grp)
nafill(rep(shift(mdo[cumsum(ord)]), ord), "locf")
}
]
# result: mdo divided by the previous group's count, or 0 when the previous
# group's mdo is 0.
dt[, result := fifelse(prev_mdo != 0L, mdo / count, 0)]
# sog_avg: among rows with a non-NA result, the mean sog of the previous
# group (per-group means lagged by one group and expanded to row level).
dt[!is.na(result),
sog_avg := {
mn = tapply(sog, grp, mean)
rep(shift(mn), table(grp))
}]
# Where result is 0 or NA, sog_avg takes the same value as result.
dt[result == 0 | is.na(result), sog_avg := result]
# Print the final table.
dt
#> grp count mydate mdo sog prev_mdo result sog_avg
#> 1: 1 NA 2019-05-11 1500 12.00 NA NA NA
#> 2: 1 NA 2019-05-11 1500 12.00 NA NA NA
#> 3: 1 NA 2019-05-11 1500 12.00 NA NA NA
#> 4: 1 NA 2019-05-11 1500 11.00 NA NA NA
#> 5: 1 NA 2019-05-12 1500 10.00 NA NA NA
#> 6: 1 NA 2019-05-12 1500 9.00 NA NA NA
#> 7: 2 NA 2019-05-12 NA 2.00 1500 NA NA
#> 8: 3 6 2019-05-12 0 8.80 1500 0.00000 0.000000
#> 9: 3 6 2019-05-13 0 8.70 1500 0.00000 0.000000
#> 10: 3 6 2019-05-13 0 7.80 1500 0.00000 0.000000
#> 11: 4 3 2019-05-13 900 11.00 0 0.00000 0.000000
#> 12: 4 3 2019-05-13 900 11.00 0 0.00000 0.000000
#> 13: 5 NA 2019-05-14 NA 12.00 900 NA NA
#> 14: 5 NA 2019-05-14 NA 11.00 900 NA NA
#> 15: 6 2 2019-05-14 1100 9.54 900 550.00000 11.000000
#> 16: 6 2 2019-05-14 1100 9.80 900 550.00000 11.000000
#> 17: 6 2 2019-05-15 1100 10.40 900 550.00000 11.000000
#> 18: 7 3 2019-05-15 200 4.00 1100 66.66667 9.913333
#> 19: 7 3 2019-05-15 200 4.00 1100 66.66667 9.913333
#> 20: 7 3 2019-05-15 200 4.50 1100 66.66667 9.913333
#> 21: 7 3 2019-05-15 200 3.60 1100 66.66667 9.913333
#> 22: 8 4 2019-05-15 1100 7.00 200 275.00000 4.025000
#> 23: 8 4 2019-05-15 1100 8.00 200 275.00000 4.025000
#> 24: 8 4 2019-05-15 1100 9.00 200 275.00000 4.025000
#> 25: 9 3 2019-05-15 0 0.00 1100 0.00000 0.000000
#> grp count mydate mdo sog prev_mdo result sog_avg

count rows from every previous value

This question is the same as here but this time I want to divide every value by the previous count, not itself. So, for the first value (1500) we will have NA because there is no other value before that. Then, we will divide 1100 by 4 because the count of previous value (1500) is 4. Then, we will divide 200 by 3 because the previous value (1100) has count 3. Last, divide 1100 by 2 because 200 has count 2. I tried to use shift/lag but can't succeed!
This is the code that divides every value with its own count.
library(dplyr)
library(tidyverse)
# Sample data: 20 timestamped readings. as.Date() keeps only the calendar day,
# so each of the five dates appears four times. myval contains zeros and NAs.
df <- tibble(
  mydate = as.Date(c(
    "2019-05-11 23:01:00", "2019-05-11 23:02:00",
    "2019-05-11 23:03:00", "2019-05-11 23:04:00",
    "2019-05-12 23:05:00", "2019-05-12 23:06:00",
    "2019-05-12 23:07:00", "2019-05-12 23:08:00",
    "2019-05-13 23:09:00", "2019-05-13 23:10:00",
    "2019-05-13 23:11:00", "2019-05-13 23:12:00",
    "2019-05-14 23:13:00", "2019-05-14 23:14:00",
    "2019-05-14 23:15:00", "2019-05-14 23:16:00",
    "2019-05-15 23:17:00", "2019-05-15 23:18:00",
    "2019-05-15 23:19:00", "2019-05-15 23:20:00"
  )),
  myval = c(
    0, NA, 1500, 1500,
    1500, 1500, NA, 0,
    0, 0, 1100, 1100,
    1100, 0, 200, 200,
    1100, 1100, 1100, 0
  )
)
# Treat readings in the closed interval [0, 1] as missing.
df$myval <- replace(df$myval, which(df$myval >= 0 & df$myval <= 1), NA)
# One group per run of consecutive identical values; n() is the run length
# and each value is divided by the length of its own run.
df <- df %>%
  group_by(
    grp = data.table::rleid(myval)
  ) %>%
  mutate(
    counts = n(),
    result = myval / counts
  )
# mydate myval grp counts result
# <date> <dbl> <int> <int> <dbl>
# 1 2019-05-11 NA 1 2 NA
# 2 2019-05-11 NA 1 2 NA
# 3 2019-05-11 1500 2 4 375
# 4 2019-05-11 1500 2 4 375
# 5 2019-05-12 1500 2 4 375
# 6 2019-05-12 1500 2 4 375
# 7 2019-05-12 NA 3 4 NA
# 8 2019-05-12 NA 3 4 NA
# 9 2019-05-13 NA 3 4 NA
#10 2019-05-13 NA 3 4 NA
#11 2019-05-13 1100 4 3 367.
#12 2019-05-13 1100 4 3 367.
#13 2019-05-14 1100 4 3 367.
#14 2019-05-14 NA 5 1 NA
#15 2019-05-14 200 6 2 100
#16 2019-05-14 200 6 2 100
#17 2019-05-15 1100 7 3 367.
#18 2019-05-15 1100 7 3 367.
#19 2019-05-15 1100 7 3 367.
#20 2019-05-15 NA 8 1 NA
I want to preserve the above dataframe, with the dates column and the correct result.
Here is one way :
library(dplyr)
# Number the runs of consecutive identical myval values.
df1 <- df %>%
  mutate(grp = data.table::rleid(myval))

df1 %>%
  # Drop the NA rows so that only real values are counted.
  filter(!is.na(myval)) %>%
  # Size of each remaining run.
  count(grp, name = "count") %>%
  # Each run now carries the size of the run before it.
  mutate(count = lag(count, n = 1L)) %>%
  # Bring back every original row (NA runs get count = NA).
  right_join(df1, by = "grp") %>%
  # Divide each value by the previous run's size.
  mutate(result = myval / count) %>%
  arrange(grp)
which returns
# A tibble: 20 x 5
# grp count mydate myval result
# <int> <int> <date> <dbl> <dbl>
# 1 1 NA 2019-05-11 NA NA
# 2 1 NA 2019-05-11 NA NA
# 3 2 NA 2019-05-11 1500 NA
# 4 2 NA 2019-05-11 1500 NA
# 5 2 NA 2019-05-12 1500 NA
# 6 2 NA 2019-05-12 1500 NA
# 7 3 NA 2019-05-12 NA NA
# 8 3 NA 2019-05-12 NA NA
# 9 3 NA 2019-05-13 NA NA
#10 3 NA 2019-05-13 NA NA
#11 4 4 2019-05-13 1100 275
#12 4 4 2019-05-13 1100 275
#13 4 4 2019-05-14 1100 275
#14 5 NA 2019-05-14 NA NA
#15 6 3 2019-05-14 200 66.7
#16 6 3 2019-05-14 200 66.7
#17 7 2 2019-05-15 1100 550
#18 7 2 2019-05-15 1100 550
#19 7 2 2019-05-15 1100 550
#20 8 NA 2019-05-15 NA NA

Resources