R - fill value based on conditions in Person Period Format - r

I am struggling to find a simple way to fill values based on two simple conditions.
I am trying to fill the variable working with 1 after the first and last "1" for each dayweek. The example is more telling.
id hours dayweek working
1 1 1 Friday 0
2 1 2 Friday 0
3 1 3 Friday 0
4 1 4 Friday 0
5 1 5 Friday 0
6 1 6 Friday 0
7 1 7 Friday 0
8 1 8 Friday 1
9 1 9 Friday 0
10 1 10 Friday 0
11 1 11 Friday 0
12 1 12 Friday 0
13 1 13 Friday 0
14 1 14 Friday 0
15 1 15 Friday 0
16 1 16 Friday 0
17 1 17 Friday 1
18 1 18 Friday 0
19 1 19 Friday 0
20 1 20 Friday 0
I am trying to do this.
id hours dayweek working
1 1 1 Friday 0
2 1 2 Friday 0
3 1 3 Friday 0
4 1 4 Friday 0
5 1 5 Friday 0
6 1 6 Friday 0
7 1 7 Friday 0
8 1 8 Friday 1
9 1 9 Friday 1
10 1 10 Friday 1
11 1 11 Friday 1
12 1 12 Friday 1
13 1 13 Friday 1
14 1 14 Friday 1
15 1 15 Friday 1
16 1 16 Friday 1
17 1 17 Friday 1
18 1 18 Friday 0
19 1 19 Friday 0
20 1 20 Friday 0
The group_by must be id and dayweek.
Any clue ?
The data
structure(list(id = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("1",
"2", "3"), class = "factor"), hours = 1:20, dayweek = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("Friday", "Monday", "Saturday", "Sunday",
"Thursday", "Tuesday", "Wedesnday"), class = "factor"), working = c(0,
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0)), row.names = c(NA,
20L), class = "data.frame", .Names = c("id", "hours", "dayweek",
"working"))
alternative data of same problem
dt = structure(list(X = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 29L, 30L,
31L, 32L, 33L, 34L, 35L, 36L, 57L, 58L, 59L, 60L, 61L, 62L, 63L,
64L), id = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), hours = c(1L,
2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L,
2L, 3L, 4L, 1L, 2L, 3L, 4L), dayweek = structure(c(1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L), .Label = c("Friday", "Monday", "Saturday",
"Sunday", "Thursday", "Tuesday", "Wedesnday"), class = "factor"),
working = c(0L, 1L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 0L,
0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)), class = "data.frame", row.names = c(NA,
-24L), .Names = c("X", "id", "hours", "dayweek", "working"))

We could use data.table to do this. We convert the 'data.frame' to 'data.table' (setDT(df1)). Grouped by the 'id' and 'dayweek', we get the numeric index of elements in 'working' which is equal to 1 ('tmp') on if there is atleast an 1 value in the group (if(any(working==1))). Get the sequence (:) between first (head(tmp,1)) and last (tail(tmp, 1)) position and wrap it with .I to get the row index ('i1'). Use the index and assign the 'working' elements corresponding to that row as 1.
library(data.table)
i1 <- setDT(df1)[, if(any(working==1)){tmp <- which(working==1)
.I[head(tmp,1):tail(tmp,1)]} , by = .(id, dayweek)]$V1
df1[i1, working:=1L]
df1
# id hours dayweek working
# 1: 1 1 Friday 0
# 2: 1 2 Friday 0
# 3: 1 3 Friday 0
# 4: 1 4 Friday 0
# 5: 1 5 Friday 0
# 6: 1 6 Friday 0
# 7: 1 7 Friday 0
# 8: 1 8 Friday 1
# 9: 1 9 Friday 1
#10: 1 10 Friday 1
#11: 1 11 Friday 1
#12: 1 12 Friday 1
#13: 1 13 Friday 1
#14: 1 14 Friday 1
#15: 1 15 Friday 1
#16: 1 16 Friday 1
#17: 1 17 Friday 1
#18: 1 18 Friday 0
#19: 1 19 Friday 0
#20: 1 20 Friday 0
Or a similar solution using dplyr (as suggested by #David Arenburg) would be to group by 'id', 'dayweek' columns, use the min and max to get the first and last positions where working == 1, and replace those elements in working with 1. If there are no 1 value for a particular group, we can wrap with ifelse to return 0 for those group.
library(dplyr)
df1 %>%
group_by(id, dayweek) %>%
mutate(new = any(working ==1),
working = ifelse(new, replace(working,
min(which(working == 1)):max(which(working == 1)), 1),
as.numeric(new))) %>%
select(-new)
#Source: local data frame [20 x 4]
#Groups: id, dayweek
#
# id hours dayweek working
#1 1 1 Friday 0
#2 1 2 Friday 0
#3 1 3 Friday 0
#4 1 4 Friday 0
#5 1 5 Friday 0
#6 1 6 Friday 0
#7 1 7 Friday 0
#8 1 8 Friday 1
#9 1 9 Friday 1
#10 1 10 Friday 1
#11 1 11 Friday 1
#12 1 12 Friday 1
#13 1 13 Friday 1
#14 1 14 Friday 1
#15 1 15 Friday 1
#16 1 16 Friday 1
#17 1 17 Friday 1
#18 1 18 Friday 0
#19 1 19 Friday 0
#20 1 20 Friday 0
Or a compact option suggested by #Khashaa where we multiply the cummax of 'working' with cummax of reverse (rev) of 'working' so that only elements that are 1 in both the vectors remain as 1 while others will be replaced by 0.
df1 %>%
group_by(id, dayweek) %>%
mutate(working = cummax(working)*rev(cummax(rev(working))))

Related

Get new columns based on data from other columns

My data:
data <- structure(list(col1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), col2 = c(0L, 1L, 1L, 0L, 0L,
1L, 0L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 0L)), class = "data.frame", row.names = c(NA,
-18L))
I want to get 2 new columns based on col1 and col2.
column 3 is obtained: We leave units if there is zero in the second column, 2 are simply transferred.
column 4 will turn out: We leave units if there is one in the second column, 2 are simply transferred.
What I want to get:
data <- structure(list(col1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), col2 = c(0L, 1L, 1L, 0L, 0L,
1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), group1 = c(1L,
NA, NA, 1L, 1L, NA, 1L, NA, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L), group2 = c(NA, 1L, 1L, NA, NA, 1L, NA, 1L, NA, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L)), class = "data.frame", row.names = c(NA,
-18L))
A solution that uses tidyr::pivot_wider():
library(dplyr)
data %>%
mutate(id = 1:n(), name = paste0("group", col2 + 1), value = 1) %>%
tidyr::pivot_wider() %>%
mutate(col2 = replace(col2, col1 == 2, 0),
across(starts_with("group"), replace, col1 == 2, 2)) %>%
select(-id)
# A tibble: 18 x 4
col1 col2 group1 group2
<int> <dbl> <dbl> <dbl>
1 1 0 1 NA
2 1 1 NA 1
3 1 1 NA 1
4 1 0 1 NA
5 1 0 1 NA
6 1 1 NA 1
7 1 0 1 NA
8 1 1 NA 1
9 1 0 1 NA
10 2 0 2 2
11 2 0 2 2
12 2 0 2 2
13 2 0 2 2
14 2 0 2 2
15 2 0 2 2
16 2 0 2 2
17 2 0 2 2
18 2 0 2 2
You can use ifelse to get group1 and group2.
transform(data
, group1 = ifelse(col1==2, 2, ifelse(col2==0, 1, NA))
, group2 = ifelse(col1==2, 2, ifelse(col2==1, 1, NA))
)
# col1 col2 group1 group2
#1 1 0 1 NA
#2 1 1 NA 1
#3 1 1 NA 1
#4 1 0 1 NA
#5 1 0 1 NA
#6 1 1 NA 1
#7 1 0 1 NA
#8 1 1 NA 1
#9 1 0 1 NA
#10 2 0 2 2
#11 2 1 2 2
#12 2 1 2 2
#13 2 0 2 2
#14 2 0 2 2
#15 2 1 2 2
#16 2 0 2 2
#17 2 1 2 2
#18 2 0 2 2

Randomly pick a value from a set of rows and add value to new row below

my R skills are not sufficient to tackle this issue so I'm hoping someone can help.
My data look like this:
head(human.players,25)
Season
Episode
Round
Player
Player_type
Crowd_size
q1_a
q2_a
q3_a
q4_a
q5_a
2020
1
1
1
1
3
0
1
0
0
NA
2020
1
1
2
1
3
0
1
1
1
NA
2020
1
1
3
1
3
0
0
0
1
NA
2020
1
2
1
1
3
1
1
0
1
NA
2020
1
2
2
1
3
1
0
1
0
NA
2020
1
2
3
1
3
1
1
1
0
NA
2020
1
3
1
1
3
0
1
0
0
NA
2020
1
3
2
1
3
0
1
1
1
NA
2020
1
3
3
1
3
0
0
1
1
NA
2020
1
4
1
1
3
0
0
1
1
NA
2020
1
4
2
1
3
0
0
1
1
NA
2020
1
4
3
1
3
0
0
1
1
NA
2020
1
5
1
1
2
1
1
0
0
NA
2020
1
5
2
1
2
1
1
1
0
NA
2020
1
5
3
1
2
NA
NA
NA
NA
NA
2020
1
6
1
1
2
0
0
0
0
NA
2020
1
6
2
1
2
0
0
0
0
NA
2020
1
6
3
1
2
NA
NA
NA
NA
NA
2020
1
7
1
1
2
0
1
1
1
NA
2020
1
7
2
1
2
1
0
0
1
NA
2020
1
7
3
1
2
NA
NA
NA
NA
NA
2020
2
1
1
1
3
1
1
0
0
NA
2020
2
1
2
1
3
0
0
0
1
NA
2020
2
1
3
1
3
0
1
1
0
NA
Vars from q1_a:q5_a represent whether a player got the question wrong (0) or correct (1). Each player plays in a specific round (there are 7 rounds per episode). In the first 4 rounds, there are 3 players. However, in rounds 5-7, there are only 2 players (the one player that is eliminated has NA's - e.g., in episode 1, this is player 3 - see table above).
I need to create a random player. This means that for the first 4 rounds I need to randomly select an answer (for each of the 5 questions) from the three players within that round and add the "random player" row value. For rounds 5 to 7 I need to select an answer from the two players (ignoring the NA) and add the "random player" row value.
An algorithm of sorts would have to look at round 1 (only those rows), sample one value from the three rows, paste it into round 1 (i.e., create row 4 in this example ) and do that for each of the 5 questions. Then for round 2...
This is how it is supposed to look like where I've added player 4 - the random player:
Season
Episode
Round
Player
Player_type
Crowd_size
q1_a
q2_a
q3_a
q4_a
q5_a
2020
1
1
1
1
3
0
1
0
0
NA
2020
1
1
2
1
3
0
1
1
1
NA
2020
1
1
3
1
3
0
0
0
1
NA
2020
1
1
4
1
3
0
0
1
1
NA
2020
1
2
1
1
3
1
1
0
1
NA
2020
1
2
2
1
3
1
0
1
0
NA
2020
1
2
3
1
3
1
1
1
0
NA
2020
1
2
4
1
3
1
1
1
0
NA
2020
1
3
1
1
3
0
1
0
0
NA
2020
1
3
2
1
3
0
1
1
1
NA
2020
1
3
3
1
3
0
0
1
1
NA
2020
1
3
4
1
3
0
0
0
1
NA
2020
1
4
1
1
3
0
0
1
1
NA
2020
1
4
2
1
3
0
0
1
1
NA
2020
1
4
3
1
3
0
0
1
1
NA
2020
1
4
4
1
3
0
0
1
1
NA
Writing this, I'm thinking it may be impossible or at least very difficult to do this so this question is more like a "hail mary". I presume some combination of sample(), apply(), and creating a custom function is necessary, but I'm stumped.
Here's one pipe that samples the new players and their scores into a separate frame, which you can then bind_rows back into the original data.
set.seed(2021)
newplayers <- dat %>%
filter(!is.na(q1_a)) %>%
group_by(Season, Episode, Round) %>%
summarize(across(everything(), ~ sample(., size=1)), .groups = "drop") %>%
mutate(Player = NA_integer_, Player_type = NA_integer_)
newplayers
# # A tibble: 8 x 11
# Season Episode Round Player Player_type Crowd_size q1_a q2_a q3_a q4_a q5_a
# <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <lgl>
# 1 2020 1 1 NA NA 3 0 0 1 1 NA
# 2 2020 1 2 NA NA 3 1 1 0 0 NA
# 3 2020 1 3 NA NA 3 0 1 1 0 NA
# 4 2020 1 4 NA NA 3 0 0 1 1 NA
# 5 2020 1 5 NA NA 2 1 1 1 0 NA
# 6 2020 1 6 NA NA 2 0 0 0 0 NA
# 7 2020 1 7 NA NA 2 0 0 1 1 NA
# 8 2020 2 1 NA NA 3 0 1 0 0 NA
bind_rows(dat, newplayers) %>%
arrange(Season, Episode, Round, is.na(Player), Player) %>%
head(.)
# Season Episode Round Player Player_type Crowd_size q1_a q2_a q3_a q4_a q5_a
# 1 2020 1 1 1 1 3 0 1 0 0 NA
# 2 2020 1 1 2 1 3 0 1 1 1 NA
# 3 2020 1 1 3 1 3 0 0 0 1 NA
# 4 2020 1 1 NA NA 3 0 0 1 1 NA
# 5 2020 1 2 1 1 3 1 1 0 1 NA
# 6 2020 1 2 2 1 3 1 0 1 0 NA
I didn't know what values to put into Player*, so I chose NA.
Data
# dput(dat)
dat <- structure(list(Season = c(2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L), Episode = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L), Round = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L, 1L, 1L, 1L), Player = c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), Player_type = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), Crowd_size = c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L), q1_a = c(0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, NA, 0L, 0L, NA, 0L, 1L, NA, 1L, 0L, 0L), q2_a = c(1L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, NA, 0L, 0L, NA, 1L, 0L, NA, 1L, 0L, 1L), q3_a = c(0L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, NA, 0L, 0L, NA, 1L, 0L, NA, 0L, 0L, 1L), q4_a = c(0L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, NA, 0L, 0L, NA, 1L, 1L, NA, 0L, 1L, 0L), q5_a = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), class = "data.frame", row.names = c(NA, -24L))

put a 0 after not buying in a quarter in r

I have a base with the variables ID, month (or period) and the incomes of that month. What I need is to put a 1 if the client buys in the next 3 months or a 0 if not, and do it for all ID.
For example, if I am in month 1 and there's a purchase in the next 3 months, then put a 1 in that row for that client.
In the last periods as there will not be 3 months, an NA appears.
df<-tibble::tribble(
~ID, ~Month, ~Incomes,
1L, 1L, 5000L,
1L, 2L, 0L,
1L, 3L, 0L,
1L, 4L, 0L,
1L, 5L, 0L,
1L, 6L, 0L,
1L, 7L, 400L,
1L, 8L, 300L,
1L, 9L, 0L,
1L, 10L, 0L,
1L, 11L, 0L,
1L, 12L, 0L,
1L, 13L, 400L,
2L, 1L, 0L,
2L, 2L, 100L,
2L, 3L, 0L,
2L, 4L, 0L,
2L, 5L, 0L,
2L, 6L, 0L,
2L, 7L, 0L,
2L, 8L, 1500L,
2L, 9L, 0L,
2L, 10L, 0L,
2L, 11L, 0L,
2L, 12L, 100L,
2L, 13L, 750L,
3L, 1L, 0L,
3L, 2L, 0L,
3L, 3L, 0L,
3L, 4L, 0L,
3L, 5L, 700L,
3L, 6L, 240L,
3L, 7L, 100L,
3L, 8L, 0L,
3L, 9L, 0L,
3L, 10L, 0L,
3L, 11L, 0L,
3L, 12L, 500L,
3L, 13L, 760L
)
df<-as.data.frame(df)
# ID Month Incomes
# 1 1 5000
# 1 2 0
# 1 3 0
# 1 4 0
# 1 5 0
# 1 6 0
# 1 7 400
# 1 8 300
# 1 9 0
# 1 10 0
# 1 11 0
# 1 12 0
# 1 13 400
# 2 1 0
# 2 2 100
# 2 3 0
# 2 4 0
# 2 5 0
# 2 6 0
# 2 7 0
# 2 8 1500
# 2 9 0
# 2 10 0
# 2 11 0
# 2 12 100
# 2 13 750
# 3 1 0
# 3 2 0
# 3 3 0
# 3 4 0
# 3 5 700
# 3 6 240
# 3 7 100
# 3 8 0
# 3 9 0
# 3 10 0
# 3 11 0
# 3 12 500
# 3 13 760
What I hope should look like this:
dffinal<- tibble::tribble(
~ID_RUT, ~Month, ~Incomes, ~Quarter,
1L, 1L, 5000L, 0L,
1L, 2L, 0L, 0L,
1L, 3L, 0L, 0L,
1L, 4L, 0L, 1L,
1L, 5L, 0L, 1L,
1L, 6L, 0L, 1L,
1L, 7L, 400L, 1L,
1L, 8L, 300L, 0L,
1L, 9L, 0L, 0L,
1L, 10L, 0L, 0L,
1L, 11L, 0L, NA,
1L, 12L, 0L, NA,
1L, 13L, 400L, NA,
2L, 1L, 0L, 1L,
2L, 2L, 100L, 0L,
2L, 3L, 0L, 0L,
2L, 4L, 0L, 0L,
2L, 5L, 0L, 1L,
2L, 6L, 0L, 1L,
2L, 7L, 0L, 1L,
2L, 8L, 1500L, 0L,
2L, 9L, 0L, 1L,
2L, 10L, 0L, 1L,
2L, 11L, 0L, NA,
2L, 12L, 100L, NA,
2L, 13L, 750L, NA,
3L, 1L, 0L, 0L,
3L, 2L, 0L, 1L,
3L, 3L, 0L, 1L,
3L, 4L, 0L, 1L,
3L, 5L, 700L, 1L,
3L, 6L, 240L, 1L,
3L, 7L, 100L, 0L,
3L, 8L, 0L, 0L,
3L, 9L, 0L, 1L,
3L, 10L, 0L, 1L,
3L, 11L, 0L, NA,
3L, 12L, 500L, NA,
3L, 13L, 760L, NA
)
# ID Month Incomes Quarterly
# 1 1 5000 0
# 1 2 0 0
# 1 3 0 0
# 1 4 0 1
# 1 5 0 1
# 1 6 0 1
# 1 7 400 1
# 1 8 300 0
# 1 9 0 0
# 1 10 0 0
# 1 11 0 NA
# 1 12 0 NA
# 1 13 400 NA
# 2 1 0 1
# 2 2 100 0
# 2 3 0 0
# 2 4 0 0
# 2 5 0 1
# 2 6 0 1
# 2 7 0 1
# 2 8 1500 0
# 2 9 0 1
# 2 10 0 1
# 2 11 0 NA
# 2 12 100 NA
# 2 13 750 NA
# 3 1 0 0
# 3 2 0 1
# 3 3 0 1
# 3 4 0 1
# 3 5 700 1
# 3 6 240 1
# 3 7 100 0
# 3 8 0 0
# 3 9 0 1
# 3 10 0 1
# 3 11 0 NA
# 3 12 500 NA
# 3 13 760 NA
Does anyone how to do it? Thanks for your time
1) rollapply Roll forward along Incomes > 0 returning TRUE if any are TRUE and FALSE otherwise. Convert that to numeric using +. 1:3 means use offsets 1, 2, 3 from the current point, i.e. the next three incomes. Add the partial=TRUE argument to rollapply if you want to consider the next and next two incomes near the end of each group where there are not three left.
library(dplyr)
library(zoo)
df %>%
group_by(ID) %>%
mutate(Quarter = +rollapply(Incomes > 0, list(1:3), any, fill = NA)) %>%
ungroup
2) SQL An SQL solution would be:
library(sqldf)
over <- "partition by ID rows between 1 following and 3 following"
fn$sqldf("select
*,
(max(Incomes > 0) over ($over)) +
(case when (count(*) over ($over)) = 3 then 0 else Null end) as Quarter
from df")
This can be simplified if it is OK to process elements for which there are fewer than 3 rows following. over is from above:
fn$sqldf("select *, (max(Incomes > 0) over ($over)) as Quarter from df")
A dplyr solution: sum the next three months using lag and take the sign of the result.
df %>%
group_by(ID) %>%
mutate(quarter = sign(lead(Incomes, 3) + lead(Incomes, 2) + lead(Incomes))) %>%
as.data.frame()
#> ID Month Incomes quarter
#> 1 1 1 5000 0
#> 2 1 2 0 0
#> 3 1 3 0 0
#> 4 1 4 0 1
#> 5 1 5 0 1
#> 6 1 6 0 1
#> 7 1 7 400 1
#> 8 1 8 300 0
#> 9 1 9 0 0
#> 10 1 10 0 1
#> 11 1 11 0 NA
#> 12 1 12 0 NA
#> 13 1 13 400 NA
#> 14 2 1 0 1
#> 15 2 2 100 0
#> 16 2 3 0 0
#> 17 2 4 0 0
#> 18 2 5 0 1
#> 19 2 6 0 1
#> 20 2 7 0 1
#> 21 2 8 1500 0
#> 22 2 9 0 1
#> 23 2 10 0 1
#> 24 2 11 0 NA
#> 25 2 12 100 NA
#> 26 2 13 750 NA
#> 27 3 1 0 0
#> 28 3 2 0 1
#> 29 3 3 0 1
#> 30 3 4 0 1
#> 31 3 5 700 1
#> 32 3 6 240 1
#> 33 3 7 100 0
#> 34 3 8 0 0
#> 35 3 9 0 1
#> 36 3 10 0 1
#> 37 3 11 0 NA
#> 38 3 12 500 NA
#> 39 3 13 760 NA
Another option:
library(dplyr)
df %>%
group_by(ID) %>%
mutate(
Quarterly = c(
sapply(1:(n() - 3), function(x) +any(Incomes[(x + 1):(x + 3)] > 0)),
rep(NA, 3)
)
) %>% as.data.frame
Output:
ID Month Incomes Quarterly
1 1 1 5000 0
2 1 2 0 0
3 1 3 0 0
4 1 4 0 1
5 1 5 0 1
6 1 6 0 1
7 1 7 400 1
8 1 8 300 0
9 1 9 0 0
10 1 10 0 1
11 1 11 0 NA
12 1 12 0 NA
13 1 13 400 NA
14 2 1 0 1
15 2 2 100 0
16 2 3 0 0
17 2 4 0 0
18 2 5 0 1
19 2 6 0 1
20 2 7 0 1
21 2 8 1500 0
22 2 9 0 1
23 2 10 0 1
24 2 11 0 NA
25 2 12 100 NA
26 2 13 750 NA
27 3 1 0 0
28 3 2 0 1
29 3 3 0 1
30 3 4 0 1
31 3 5 700 1
32 3 6 240 1
33 3 7 100 0
34 3 8 0 0
35 3 9 0 1
36 3 10 0 1
37 3 11 0 NA
38 3 12 500 NA
39 3 13 760 NA
And a base equivalent:
transform(df, Quarterly = ave(Incomes, ID,
FUN = function(x) c(
sapply(1:(length(x) - 3), function(y) +any(x[(y + 1):(y + 3)] > 0)),
rep(NA, 3)
)
)
)

Mutiple covariance matrix with high-frecuency data using R

I'm working with high-frequency data which contains 2 million observations. Now, I need to calculate the daily Realized Covariance matrix, defined as:
Let be an intraday prices vector and the number of intraday prices
My data has the following structure:
Year Month Day FivMin A B C D
2000 1 1 1 1 2 3 4
2000 1 1 2 2 3 0 1
2000 1 1 3 3 4 1 2
2000 1 1 4 0 1 2 3
2000 1 2 1 1 2 3 4
2000 1 2 2 5 3 4 1
2000 1 2 3 3 0 1 2
2000 1 2 4 4 1 9 3
2000 1 3 1 1 2 3 4
2000 1 3 2 0 1 7 1
2000 1 3 3 3 4 1 2
2000 1 3 4 1 -2 2 3
2000 1 4 1 0 2 3 4
2000 1 4 2 2 1 4 1
2000 1 4 3 3 0 1 2
2000 1 4 4 0 2 2 3
2000 1 5 1 1 2 3 4
2000 1 5 2 2 3 4 1
2000 1 5 3 0 -1 1 2
2000 1 5 4 9 1 2 3
Variables A, B, C, and D represent prices recorded each five minutes. So my first idea is to use group_by with the variables Year, Month, and Day in order to create the matrix. After this, I need to calculate the Realized Cov for each day.
For example, for the first day my Realized Cov would be:
this operation has to be repeated every day. I do not if there is a package for this problem or not. Maybe, it is better to use a loop.
Thanks for helping me.
Here is a base R solution using split (for grouping) + tcrossprod (for cov matrix)
res <- lapply(split(df,df[c("Year","Month","Day")]),
function(x) tcrossprod(t(x[c("A","B","C","D")])))
such that
> res
$`2000.1.1`
A B C D
A 14 20 6 12
B 20 30 12 22
C 6 12 14 20
D 12 22 20 30
$`2000.1.2`
A B C D
A 51 21 62 27
B 21 14 27 14
C 62 27 107 45
D 27 14 45 30
$`2000.1.3`
A B C D
A 11 12 8 13
B 12 25 13 11
C 8 13 63 27
D 13 11 27 30
$`2000.1.4`
A B C D
A 13 2 11 8
B 2 9 14 15
C 11 14 30 24
D 8 15 24 30
$`2000.1.5`
A B C D
A 86 17 29 33
B 17 15 19 12
C 29 19 30 24
D 33 12 24 30
DATA
df <- structure(list(Year = c(2000L, 2000L, 2000L, 2000L, 2000L, 2000L,
2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L,
2000L, 2000L, 2000L, 2000L, 2000L), Month = c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), Day = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L,
4L, 4L, 4L, 5L, 5L, 5L, 5L), FivMin = c(1L, 2L, 3L, 4L, 1L, 2L,
3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L), A = c(1L,
2L, 3L, 0L, 1L, 5L, 3L, 4L, 1L, 0L, 3L, 1L, 0L, 2L, 3L, 0L, 1L,
2L, 0L, 9L), B = c(2L, 3L, 4L, 1L, 2L, 3L, 0L, 1L, 2L, 1L, 4L,
-2L, 2L, 1L, 0L, 2L, 2L, 3L, -1L, 1L), C = c(3L, 0L, 1L, 2L,
3L, 4L, 1L, 9L, 3L, 7L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L
), D = c(4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L,
1L, 2L, 3L, 4L, 1L, 2L, 3L)), class = "data.frame", row.names = c(NA,
-20L))

count frequency based on values in 2 or more columns

I have a pretty simple question but I can't think of a way to do this without using if statements
The data I have looks something like:
df <- structure(list(years = c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L,
1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), id = c(1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L), x = structure(c(2L,
1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 2L,
1L), .Label = c("E", "I"), class = "factor")), .Names = c("years",
"id", "x"), class = "data.frame", row.names = c(NA, -18L))
so the table looks like:
years id x
1 1 1 I
2 2 1 E
3 3 1 E
4 1 1 E
5 2 1 I
6 3 1 I
7 1 2 I
8 2 2 E
9 3 2 I
10 1 2 E
11 2 2 E
12 3 2 I
13 1 3 I
14 2 3 E
15 3 3 I
16 1 3 I
17 2 3 I
18 3 3 E
I would like the output to report the fraction of x's that are "I" for each id and each year:
years id xnew
1 1 1 0.5
2 2 1 0.5
3 3 1 0.5
4 1 2 0.5
5 2 2 0.0
6 3 2 1.0
7 1 3 1.0
8 2 3 0.5
9 3 3 0.5
Any help would be greatly appreciated! Thank you!
aggregate(x ~ years + id, data=df, function(y) sum(y=="I")/length(y) )
years id x
1 1 1 0.5
2 2 1 0.5
3 3 1 0.5
4 1 2 0.5
5 2 2 0.0
6 3 2 1.0
7 1 3 1.0
8 2 3 0.5
9 3 3 0.5

Resources