count frequency based on values in 2 or more columns - r

I have a pretty simple question but I can't think of a way to do this without using if statements
The data I have looks something like:
df <- structure(list(years = c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L,
1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), id = c(1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L), x = structure(c(2L,
1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 2L,
1L), .Label = c("E", "I"), class = "factor")), .Names = c("years",
"id", "x"), class = "data.frame", row.names = c(NA, -18L))
so the table looks like:
years id x
1 1 1 I
2 2 1 E
3 3 1 E
4 1 1 E
5 2 1 I
6 3 1 I
7 1 2 I
8 2 2 E
9 3 2 I
10 1 2 E
11 2 2 E
12 3 2 I
13 1 3 I
14 2 3 E
15 3 3 I
16 1 3 I
17 2 3 I
18 3 3 E
I would like the output to report the fraction of x's that are "I" for each id and each year:
years id xnew
1 1 1 0.5
2 2 1 0.5
3 3 1 0.5
4 1 2 0.5
5 2 2 0.0
6 3 2 1.0
7 1 3 1.0
8 2 3 0.5
9 3 3 0.5
Any help would be greatly appreciated! Thank you!

aggregate(x ~ years + id, data=df, function(y) sum(y=="I")/length(y) )
years id x
1 1 1 0.5
2 2 1 0.5
3 3 1 0.5
4 1 2 0.5
5 2 2 0.0
6 3 2 1.0
7 1 3 1.0
8 2 3 0.5
9 3 3 0.5

Related

Get new columns based on data from other columns

My data:
data <- structure(list(col1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), col2 = c(0L, 1L, 1L, 0L, 0L,
1L, 0L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 0L)), class = "data.frame", row.names = c(NA,
-18L))
I want to get 2 new columns based on col1 and col2.
column 3 is obtained: We leave units if there is zero in the second column, 2 are simply transferred.
column 4 will turn out: We leave units if there is one in the second column, 2 are simply transferred.
What I want to get:
data <- structure(list(col1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), col2 = c(0L, 1L, 1L, 0L, 0L,
1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), group1 = c(1L,
NA, NA, 1L, 1L, NA, 1L, NA, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L), group2 = c(NA, 1L, 1L, NA, NA, 1L, NA, 1L, NA, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L)), class = "data.frame", row.names = c(NA,
-18L))
A solution that uses tidyr::pivot_wider():
library(dplyr)
data %>%
mutate(id = 1:n(), name = paste0("group", col2 + 1), value = 1) %>%
tidyr::pivot_wider() %>%
mutate(col2 = replace(col2, col1 == 2, 0),
across(starts_with("group"), replace, col1 == 2, 2)) %>%
select(-id)
# A tibble: 18 x 4
col1 col2 group1 group2
<int> <dbl> <dbl> <dbl>
1 1 0 1 NA
2 1 1 NA 1
3 1 1 NA 1
4 1 0 1 NA
5 1 0 1 NA
6 1 1 NA 1
7 1 0 1 NA
8 1 1 NA 1
9 1 0 1 NA
10 2 0 2 2
11 2 0 2 2
12 2 0 2 2
13 2 0 2 2
14 2 0 2 2
15 2 0 2 2
16 2 0 2 2
17 2 0 2 2
18 2 0 2 2
You can use ifelse to get group1 and group2.
transform(data
, group1 = ifelse(col1==2, 2, ifelse(col2==0, 1, NA))
, group2 = ifelse(col1==2, 2, ifelse(col2==1, 1, NA))
)
# col1 col2 group1 group2
#1 1 0 1 NA
#2 1 1 NA 1
#3 1 1 NA 1
#4 1 0 1 NA
#5 1 0 1 NA
#6 1 1 NA 1
#7 1 0 1 NA
#8 1 1 NA 1
#9 1 0 1 NA
#10 2 0 2 2
#11 2 1 2 2
#12 2 1 2 2
#13 2 0 2 2
#14 2 0 2 2
#15 2 1 2 2
#16 2 0 2 2
#17 2 1 2 2
#18 2 0 2 2

Mutiple covariance matrix with high-frecuency data using R

I'm working with high-frequency data which contains 2 million observations. Now, I need to calculate the daily Realized Covariance matrix, defined as:
Let be an intraday prices vector and the number of intraday prices
My data has the following structure:
Year Month Day FivMin A B C D
2000 1 1 1 1 2 3 4
2000 1 1 2 2 3 0 1
2000 1 1 3 3 4 1 2
2000 1 1 4 0 1 2 3
2000 1 2 1 1 2 3 4
2000 1 2 2 5 3 4 1
2000 1 2 3 3 0 1 2
2000 1 2 4 4 1 9 3
2000 1 3 1 1 2 3 4
2000 1 3 2 0 1 7 1
2000 1 3 3 3 4 1 2
2000 1 3 4 1 -2 2 3
2000 1 4 1 0 2 3 4
2000 1 4 2 2 1 4 1
2000 1 4 3 3 0 1 2
2000 1 4 4 0 2 2 3
2000 1 5 1 1 2 3 4
2000 1 5 2 2 3 4 1
2000 1 5 3 0 -1 1 2
2000 1 5 4 9 1 2 3
Variables A, B, C, and D represent prices recorded each five minutes. So my first idea is to use group_by with the variables Year, Month, and Day in order to create the matrix. After this, I need to calculate the Realized Cov for each day.
For example, for the first day my Realized Cov would be:
this operation has to be repeated every day. I do not if there is a package for this problem or not. Maybe, it is better to use a loop.
Thanks for helping me.
Here is a base R solution using split (for grouping) + tcrossprod (for cov matrix)
res <- lapply(split(df,df[c("Year","Month","Day")]),
function(x) tcrossprod(t(x[c("A","B","C","D")])))
such that
> res
$`2000.1.1`
A B C D
A 14 20 6 12
B 20 30 12 22
C 6 12 14 20
D 12 22 20 30
$`2000.1.2`
A B C D
A 51 21 62 27
B 21 14 27 14
C 62 27 107 45
D 27 14 45 30
$`2000.1.3`
A B C D
A 11 12 8 13
B 12 25 13 11
C 8 13 63 27
D 13 11 27 30
$`2000.1.4`
A B C D
A 13 2 11 8
B 2 9 14 15
C 11 14 30 24
D 8 15 24 30
$`2000.1.5`
A B C D
A 86 17 29 33
B 17 15 19 12
C 29 19 30 24
D 33 12 24 30
DATA
df <- structure(list(Year = c(2000L, 2000L, 2000L, 2000L, 2000L, 2000L,
2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L,
2000L, 2000L, 2000L, 2000L, 2000L), Month = c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), Day = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L,
4L, 4L, 4L, 5L, 5L, 5L, 5L), FivMin = c(1L, 2L, 3L, 4L, 1L, 2L,
3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L), A = c(1L,
2L, 3L, 0L, 1L, 5L, 3L, 4L, 1L, 0L, 3L, 1L, 0L, 2L, 3L, 0L, 1L,
2L, 0L, 9L), B = c(2L, 3L, 4L, 1L, 2L, 3L, 0L, 1L, 2L, 1L, 4L,
-2L, 2L, 1L, 0L, 2L, 2L, 3L, -1L, 1L), C = c(3L, 0L, 1L, 2L,
3L, 4L, 1L, 9L, 3L, 7L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L
), D = c(4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L,
1L, 2L, 3L, 4L, 1L, 2L, 3L)), class = "data.frame", row.names = c(NA,
-20L))

Dataframe column to matrix by two other columns

I have a dataframe
df<-data.frame(i=rep(1:3,3),j=sort(rep(1:3,3)),v=sample(1:9,9))
df
i j v
1 1 1 3
2 2 1 1
3 3 1 9
4 1 2 8
5 2 2 5
6 3 2 4
7 1 3 7
8 2 3 2
9 3 3 6
that I want to transform to matrix M such that
M[i,j]<-df$v[which(df$i==i & df$j==j)]
is there an easy way to do that?
Based on your description, you can just do,
matrix(df$v, ncol = max(df$j))
# [,1] [,2] [,3]
#[1,] 2 4 7
#[2,] 3 1 5
#[3,] 8 6 9
Data Used:
dput(df)
structure(list(i = c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), j = c(1L,
1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L), v = c(2L, 3L, 8L, 4L, 1L, 6L,
7L, 5L, 9L)), class = "data.frame", row.names = c(NA, -9L))

R - stratified sampling for Person Period file

Following up this question, I wondered how I can effectively sample a stratified Person Period file.
I have a database who looks like this
id time var clust
1: 1 1 a clust1
2: 1 2 c clust1
3: 1 3 c clust1
4: 2 1 a clust1
5: 2 2 a clust1
...
With individuals id grouped into clusters clust. What I would like is to sample id by clust, keeping the person period format.
The solution I came up with is to sample id and then to merge back. However, is it not a very elegant solution.
library(data.table)
library(dplyr)
setDT(dt)
dt[,.SD[sample(.N,1)],by = clust] %>%
merge(., dt, by = 'id')
which gives
id clust.x time.x var.x time.y var.y clust.y
1: 2 clust1 1 a 1 a clust1
2: 2 clust1 1 a 2 a clust1
3: 2 clust1 1 a 3 c clust1
4: 3 clust2 3 c 1 a clust2
5: 3 clust2 3 c 2 b clust2
6: 3 clust2 3 c 3 c clust2
7: 5 clust3 1 a 1 a clust3
8: 5 clust3 1 a 2 a clust3
9: 5 clust3 1 a 3 c clust3
Is there a more straightforward solution ?
library(data.table)
dt = setDT(structure(list(id = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L,
3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L), .Label = c("1", "2",
"3", "4", "5", "6"), class = "factor"), time = structure(c(1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L,
3L), .Label = c("1", "2", "3"), class = "factor"), var = structure(c(1L,
3L, 3L, 1L, 1L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 1L, 3L, 2L, 2L,
3L), .Label = c("a", "b", "c"), class = "factor"), clust = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L,
2L), .Label = c("clust1", "clust2", "clust3"), class = "factor")), .Names = c("id",
"time", "var", "clust"), row.names = c(NA, -18L), class = "data.frame"))
Here is a variant following #Frank's comment that might help, essentially you can sample a unique id from each clust group and find out the corresponding index number with .I for subsetting:
dt[dt[, .I[id == sample(unique(id),1)], clust]$V1]
# id time var clust
#1: 2 1 a clust1
#2: 2 2 a clust1
#3: 2 3 c clust1
#4: 3 1 a clust2
#5: 3 2 b clust2
#6: 3 3 c clust2
#7: 4 1 a clust3
#8: 4 2 b clust3
#9: 4 3 c clust3
I think tidy data here would have an ID table where cluster is an attribute:
idDT = unique(dt[, .(id, clust)])
id clust
1: 1 clust1
2: 2 clust1
3: 3 clust2
4: 4 clust3
5: 5 clust3
6: 6 clust2
From there, sample...
my_selection = idDT[, .(id = sample(id, 1)), by=clust]
and merge or subset
dt[ my_selection, on=names(my_selection) ]
# or
dt[ id %in% my_selection$id ]
I would keep the intermediate table my_selection around, expecting it to come in handy later.

Setting incomparables in place with merge

I'm seeing some unexpected behaviour with merge (or at least not entirely intuitive). But perhaps I'm just not understanding how it's supposed to work:
Let's create some dummy data to play with first:
x <- structure(list(A = c(2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L), B = c(2L, 2L, 1L, 2L,
1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L
), C = c(2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 2L,
2L, 1L, 1L, 1L, 1L, 2L, 2L), D = c(2L, 1L, 2L, 2L, 2L, 1L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L), E = c(2L,
1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
1L, 1L, 1L), F = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L,
2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L), G = c(2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L),
H = c(1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 1L, 2L, 1L, 1L, 1L), I = c(1L, 1L, 2L, 2L, 2L, 1L,
1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L),
J = c(2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
2L, 2L, 2L, 2L, 1L, 2L, 1L), K = c(3, 3, 1, 3, 1, 3, 1, 2,
2, 2, 1, 3, 2, 2, 2, 1, NA, 1, 2, 1)), .Names = c("A", "B",
"C", "D", "E", "F", "G", "H", "I", "J", "K"), row.names = c(NA,
20L), class = "data.frame")
# Generate Listing of All Possible Combinations
y <- list(1:2); y = expand.grid(rep(y,10));
colnames(y) <- LETTERS[1:10]
y <- rbind(y,y,y)
y$K <- rep(1:3,each=1024)
y$mergekey <- sample(1:6,3072,replace=TRUE)
My expectation is that when I merge these two data sets that setting sort=FALSE and all.x=TRUE would provide me with a list of all x in place with mergekey.
Let's try that:
merge(x,y,all.x=TRUE,sort=FALSE)
A B C D E F G H I J K mergekey
1 2 2 2 2 2 1 2 1 1 2 3 5
2 2 2 1 1 1 1 2 2 1 1 3 3
3 2 1 2 2 1 1 2 1 2 2 1 3
4 2 2 1 2 2 1 2 2 2 2 3 2
5 1 1 2 2 2 2 2 1 2 2 1 4
6 2 1 1 1 2 2 2 2 1 2 3 6
7 1 1 1 1 2 2 2 2 1 2 1 5
8 2 1 2 2 1 1 2 2 1 1 2 4
9 2 2 2 1 1 1 2 1 2 2 2 4
10 2 1 2 2 1 1 2 1 1 1 2 2
11 2 1 2 1 1 1 2 1 2 2 1 4
12 2 2 1 2 1 2 2 1 2 1 3 5
13 2 1 2 1 1 1 2 1 2 2 2 3
14 2 1 2 1 1 1 2 1 2 2 2 3
15 2 2 2 1 2 1 2 1 2 2 2 1
16 2 1 1 2 1 1 2 2 2 2 2 1
17 2 1 1 1 1 1 2 1 1 2 1 2
18 1 2 1 1 1 2 2 1 1 1 1 5
19 2 1 2 1 1 1 2 1 1 1 1 4
20 2 2 1 2 1 1 1 2 1 2 NA NA
Now it seems that "most of x is unsorted" but incomparables are pushed to the end, rather than maintaining their order.
So, my question is: How do I get the incomparables to stay in place?
PS: Does it not seem a little unintuitive to push incomparables to the end if the merge has been told not to sort? I don't find this congruent with this behaviour either
The join function in the plyr package solves this problem intuitively without additional arguements.
library(plyr)
join(x,y)
Joining by: A, B, C, D, E, F, G, H, I, J, K
A B C D E F G H I J K mergekey
1 2 2 2 2 2 1 2 1 1 2 3 4
2 2 2 1 1 1 1 2 2 1 1 3 3
3 2 1 2 2 1 1 2 1 2 2 1 5
4 2 2 1 2 2 1 2 2 2 2 3 3
5 1 1 2 2 2 2 2 1 2 2 1 6
6 2 1 1 1 2 2 2 2 1 2 3 6
7 1 1 1 1 2 2 2 2 1 2 1 4
8 2 1 2 2 1 1 2 2 1 1 2 2
9 2 2 2 1 1 1 2 1 2 2 2 4
10 2 1 2 2 1 1 2 1 1 1 2 6
11 2 1 2 1 1 1 2 1 2 2 1 1
12 2 2 1 2 1 2 2 1 2 1 3 3
13 2 1 2 1 1 1 2 1 2 2 2 2
14 2 2 2 1 2 1 2 1 2 2 2 6
15 2 1 1 2 1 1 2 2 2 2 2 2
16 2 1 1 1 1 1 2 1 1 2 1 3
17 2 2 1 2 1 1 1 2 1 2 NA NA
18 1 2 1 1 1 2 2 1 1 1 1 1
19 2 1 2 1 1 1 2 1 2 2 2 2
20 2 1 2 1 1 1 2 1 1 1 1 1

Resources