Setting incomparables in place with merge - r

I'm seeing some unexpected behaviour with merge (or at least not entirely intuitive). But perhaps I'm just not understanding how it's supposed to work:
Let's create some dummy data to play with first:
x <- structure(list(A = c(2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L), B = c(2L, 2L, 1L, 2L,
1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L
), C = c(2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 2L,
2L, 1L, 1L, 1L, 1L, 2L, 2L), D = c(2L, 1L, 2L, 2L, 2L, 1L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L), E = c(2L,
1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
1L, 1L, 1L), F = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L,
2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L), G = c(2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L),
H = c(1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 1L, 2L, 1L, 1L, 1L), I = c(1L, 1L, 2L, 2L, 2L, 1L,
1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L),
J = c(2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
2L, 2L, 2L, 2L, 1L, 2L, 1L), K = c(3, 3, 1, 3, 1, 3, 1, 2,
2, 2, 1, 3, 2, 2, 2, 1, NA, 1, 2, 1)), .Names = c("A", "B",
"C", "D", "E", "F", "G", "H", "I", "J", "K"), row.names = c(NA,
20L), class = "data.frame")
# Generate Listing of All Possible Combinations
y <- list(1:2); y = expand.grid(rep(y,10));
colnames(y) <- LETTERS[1:10]
y <- rbind(y,y,y)
y$K <- rep(1:3,each=1024)
y$mergekey <- sample(1:6,3072,replace=TRUE)
My expectation is that when I merge these two data sets that setting sort=FALSE and all.x=TRUE would provide me with a list of all x in place with mergekey.
Let's try that:
merge(x,y,all.x=TRUE,sort=FALSE)
A B C D E F G H I J K mergekey
1 2 2 2 2 2 1 2 1 1 2 3 5
2 2 2 1 1 1 1 2 2 1 1 3 3
3 2 1 2 2 1 1 2 1 2 2 1 3
4 2 2 1 2 2 1 2 2 2 2 3 2
5 1 1 2 2 2 2 2 1 2 2 1 4
6 2 1 1 1 2 2 2 2 1 2 3 6
7 1 1 1 1 2 2 2 2 1 2 1 5
8 2 1 2 2 1 1 2 2 1 1 2 4
9 2 2 2 1 1 1 2 1 2 2 2 4
10 2 1 2 2 1 1 2 1 1 1 2 2
11 2 1 2 1 1 1 2 1 2 2 1 4
12 2 2 1 2 1 2 2 1 2 1 3 5
13 2 1 2 1 1 1 2 1 2 2 2 3
14 2 1 2 1 1 1 2 1 2 2 2 3
15 2 2 2 1 2 1 2 1 2 2 2 1
16 2 1 1 2 1 1 2 2 2 2 2 1
17 2 1 1 1 1 1 2 1 1 2 1 2
18 1 2 1 1 1 2 2 1 1 1 1 5
19 2 1 2 1 1 1 2 1 1 1 1 4
20 2 2 1 2 1 1 1 2 1 2 NA NA
Now it seems that "most of x is unsorted" but incomparables are pushed to the end, rather than maintaining their order.
So, my question is: How do I get the incomparables to stay in place?
PS: Does it not seem a little unintuitive to push incomparables to the end if the merge has been told not to sort? I don't find this congruent with this behaviour either

The join function in the plyr package solves this problem intuitively without additional arguements.
library(plyr)
join(x,y)
Joining by: A, B, C, D, E, F, G, H, I, J, K
A B C D E F G H I J K mergekey
1 2 2 2 2 2 1 2 1 1 2 3 4
2 2 2 1 1 1 1 2 2 1 1 3 3
3 2 1 2 2 1 1 2 1 2 2 1 5
4 2 2 1 2 2 1 2 2 2 2 3 3
5 1 1 2 2 2 2 2 1 2 2 1 6
6 2 1 1 1 2 2 2 2 1 2 3 6
7 1 1 1 1 2 2 2 2 1 2 1 4
8 2 1 2 2 1 1 2 2 1 1 2 2
9 2 2 2 1 1 1 2 1 2 2 2 4
10 2 1 2 2 1 1 2 1 1 1 2 6
11 2 1 2 1 1 1 2 1 2 2 1 1
12 2 2 1 2 1 2 2 1 2 1 3 3
13 2 1 2 1 1 1 2 1 2 2 2 2
14 2 2 2 1 2 1 2 1 2 2 2 6
15 2 1 1 2 1 1 2 2 2 2 2 2
16 2 1 1 1 1 1 2 1 1 2 1 3
17 2 2 1 2 1 1 1 2 1 2 NA NA
18 1 2 1 1 1 2 2 1 1 1 1 1
19 2 1 2 1 1 1 2 1 2 2 2 2
20 2 1 2 1 1 1 2 1 1 1 1 1

Related

Get new columns based on data from other columns

My data:
data <- structure(list(col1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), col2 = c(0L, 1L, 1L, 0L, 0L,
1L, 0L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 0L)), class = "data.frame", row.names = c(NA,
-18L))
I want to get 2 new columns based on col1 and col2.
column 3 is obtained: We leave units if there is zero in the second column, 2 are simply transferred.
column 4 will turn out: We leave units if there is one in the second column, 2 are simply transferred.
What I want to get:
data <- structure(list(col1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), col2 = c(0L, 1L, 1L, 0L, 0L,
1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), group1 = c(1L,
NA, NA, 1L, 1L, NA, 1L, NA, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L), group2 = c(NA, 1L, 1L, NA, NA, 1L, NA, 1L, NA, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L)), class = "data.frame", row.names = c(NA,
-18L))
A solution that uses tidyr::pivot_wider():
library(dplyr)
data %>%
mutate(id = 1:n(), name = paste0("group", col2 + 1), value = 1) %>%
tidyr::pivot_wider() %>%
mutate(col2 = replace(col2, col1 == 2, 0),
across(starts_with("group"), replace, col1 == 2, 2)) %>%
select(-id)
# A tibble: 18 x 4
col1 col2 group1 group2
<int> <dbl> <dbl> <dbl>
1 1 0 1 NA
2 1 1 NA 1
3 1 1 NA 1
4 1 0 1 NA
5 1 0 1 NA
6 1 1 NA 1
7 1 0 1 NA
8 1 1 NA 1
9 1 0 1 NA
10 2 0 2 2
11 2 0 2 2
12 2 0 2 2
13 2 0 2 2
14 2 0 2 2
15 2 0 2 2
16 2 0 2 2
17 2 0 2 2
18 2 0 2 2
You can use ifelse to get group1 and group2.
transform(data
, group1 = ifelse(col1==2, 2, ifelse(col2==0, 1, NA))
, group2 = ifelse(col1==2, 2, ifelse(col2==1, 1, NA))
)
# col1 col2 group1 group2
#1 1 0 1 NA
#2 1 1 NA 1
#3 1 1 NA 1
#4 1 0 1 NA
#5 1 0 1 NA
#6 1 1 NA 1
#7 1 0 1 NA
#8 1 1 NA 1
#9 1 0 1 NA
#10 2 0 2 2
#11 2 1 2 2
#12 2 1 2 2
#13 2 0 2 2
#14 2 0 2 2
#15 2 1 2 2
#16 2 0 2 2
#17 2 1 2 2
#18 2 0 2 2

Randomly pick a value from a set of rows and add value to new row below

my R skills are not sufficient to tackle this issue so I'm hoping someone can help.
My data look like this:
head(human.players,25)
Season
Episode
Round
Player
Player_type
Crowd_size
q1_a
q2_a
q3_a
q4_a
q5_a
2020
1
1
1
1
3
0
1
0
0
NA
2020
1
1
2
1
3
0
1
1
1
NA
2020
1
1
3
1
3
0
0
0
1
NA
2020
1
2
1
1
3
1
1
0
1
NA
2020
1
2
2
1
3
1
0
1
0
NA
2020
1
2
3
1
3
1
1
1
0
NA
2020
1
3
1
1
3
0
1
0
0
NA
2020
1
3
2
1
3
0
1
1
1
NA
2020
1
3
3
1
3
0
0
1
1
NA
2020
1
4
1
1
3
0
0
1
1
NA
2020
1
4
2
1
3
0
0
1
1
NA
2020
1
4
3
1
3
0
0
1
1
NA
2020
1
5
1
1
2
1
1
0
0
NA
2020
1
5
2
1
2
1
1
1
0
NA
2020
1
5
3
1
2
NA
NA
NA
NA
NA
2020
1
6
1
1
2
0
0
0
0
NA
2020
1
6
2
1
2
0
0
0
0
NA
2020
1
6
3
1
2
NA
NA
NA
NA
NA
2020
1
7
1
1
2
0
1
1
1
NA
2020
1
7
2
1
2
1
0
0
1
NA
2020
1
7
3
1
2
NA
NA
NA
NA
NA
2020
2
1
1
1
3
1
1
0
0
NA
2020
2
1
2
1
3
0
0
0
1
NA
2020
2
1
3
1
3
0
1
1
0
NA
Vars from q1_a:q5_a represent whether a player got the question wrong (0) or correct (1). Each player plays in a specific round (there are 7 rounds per episode). In the first 4 rounds, there are 3 players. However, in rounds 5-7, there are only 2 players (the one player that is eliminated has NA's - e.g., in episode 1, this is player 3 - see table above).
I need to create a random player. This means that for the first 4 rounds I need to randomly select an answer (for each of the 5 questions) from the three players within that round and add the "random player" row value. For rounds 5 to 7 I need to select an answer from the two players (ignoring the NA) and add the "random player" row value.
An algorithm of sorts would have to look at round 1 (only those rows), sample one value from the three rows, paste it into round 1 (i.e., create row 4 in this example ) and do that for each of the 5 questions. Then for round 2...
This is how it is supposed to look like where I've added player 4 - the random player:
Season
Episode
Round
Player
Player_type
Crowd_size
q1_a
q2_a
q3_a
q4_a
q5_a
2020
1
1
1
1
3
0
1
0
0
NA
2020
1
1
2
1
3
0
1
1
1
NA
2020
1
1
3
1
3
0
0
0
1
NA
2020
1
1
4
1
3
0
0
1
1
NA
2020
1
2
1
1
3
1
1
0
1
NA
2020
1
2
2
1
3
1
0
1
0
NA
2020
1
2
3
1
3
1
1
1
0
NA
2020
1
2
4
1
3
1
1
1
0
NA
2020
1
3
1
1
3
0
1
0
0
NA
2020
1
3
2
1
3
0
1
1
1
NA
2020
1
3
3
1
3
0
0
1
1
NA
2020
1
3
4
1
3
0
0
0
1
NA
2020
1
4
1
1
3
0
0
1
1
NA
2020
1
4
2
1
3
0
0
1
1
NA
2020
1
4
3
1
3
0
0
1
1
NA
2020
1
4
4
1
3
0
0
1
1
NA
Writing this, I'm thinking it may be impossible or at least very difficult to do this so this question is more like a "hail mary". I presume some combination of sample(), apply(), and creating a custom function is necessary, but I'm stumped.
Here's one pipe that samples the new players and their scores into a separate frame, which you can then bind_rows back into the original data.
set.seed(2021)
newplayers <- dat %>%
filter(!is.na(q1_a)) %>%
group_by(Season, Episode, Round) %>%
summarize(across(everything(), ~ sample(., size=1)), .groups = "drop") %>%
mutate(Player = NA_integer_, Player_type = NA_integer_)
newplayers
# # A tibble: 8 x 11
# Season Episode Round Player Player_type Crowd_size q1_a q2_a q3_a q4_a q5_a
# <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <lgl>
# 1 2020 1 1 NA NA 3 0 0 1 1 NA
# 2 2020 1 2 NA NA 3 1 1 0 0 NA
# 3 2020 1 3 NA NA 3 0 1 1 0 NA
# 4 2020 1 4 NA NA 3 0 0 1 1 NA
# 5 2020 1 5 NA NA 2 1 1 1 0 NA
# 6 2020 1 6 NA NA 2 0 0 0 0 NA
# 7 2020 1 7 NA NA 2 0 0 1 1 NA
# 8 2020 2 1 NA NA 3 0 1 0 0 NA
bind_rows(dat, newplayers) %>%
arrange(Season, Episode, Round, is.na(Player), Player) %>%
head(.)
# Season Episode Round Player Player_type Crowd_size q1_a q2_a q3_a q4_a q5_a
# 1 2020 1 1 1 1 3 0 1 0 0 NA
# 2 2020 1 1 2 1 3 0 1 1 1 NA
# 3 2020 1 1 3 1 3 0 0 0 1 NA
# 4 2020 1 1 NA NA 3 0 0 1 1 NA
# 5 2020 1 2 1 1 3 1 1 0 1 NA
# 6 2020 1 2 2 1 3 1 0 1 0 NA
I didn't know what values to put into Player*, so I chose NA.
Data
# dput(dat)
dat <- structure(list(Season = c(2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L), Episode = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L), Round = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L, 1L, 1L, 1L), Player = c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), Player_type = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), Crowd_size = c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L), q1_a = c(0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, NA, 0L, 0L, NA, 0L, 1L, NA, 1L, 0L, 0L), q2_a = c(1L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, NA, 0L, 0L, NA, 1L, 0L, NA, 1L, 0L, 1L), q3_a = c(0L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, NA, 0L, 0L, NA, 1L, 0L, NA, 0L, 0L, 1L), q4_a = c(0L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, NA, 0L, 0L, NA, 1L, 1L, NA, 0L, 1L, 0L), q5_a = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), class = "data.frame", row.names = c(NA, -24L))

Finding the maximum square sub-matrix with all equal elements

Does anybody know how it could be possible to subset the maximum K such that K x K is a submatrix with all identical elements, i.e., all the elements in this submatrix must be the same from a given a N x N matrix?
I found many examples in other programming languages except R. I also prefer dplyr if you know.
There is a link to the solution with other languages:
https://www.geeksforgeeks.org/maximum-size-sub-matrix-with-all-1s-in-a-binary-matrix/
But this link provides a special case when all identical elements are next to each other. It retrieves a maximum block of the same elements, not a submatrix in general. I do not want to limit subsetting with this condition.
Here is a base R implementation to make it.
If you want to search the maximum square sub-matrix within a non-square matrix, you can try the code below:
r <- list()
for (w in rev(seq(min(dim(M))))) {
for (rs in seq(nrow(M)-w+1)) {
for (cs in seq(ncol(M)-w+1)) {
mat <- M[rs-1+(1:w),cs-1+(1:w)]
u <- unique(c(mat))
if (all(u!=0) &length(u)==1) r[[length(r)+1]] <- mat
}
}
if (length(r)>0) break
}
such that
> r
[[1]]
[,1] [,2]
[1,] 3 3
[2,] 3 3
[[2]]
[,1] [,2]
[1,] 2 2
[2,] 2 2
[[3]]
[,1] [,2]
[1,] 3 3
[2,] 3 3
[[4]]
[,1] [,2]
[1,] 2 2
[2,] 2 2
[[5]]
[,1] [,2]
[1,] 1 1
[2,] 1 1
[[6]]
[,1] [,2]
[1,] 1 1
[2,] 1 1
[[7]]
[,1] [,2]
[1,] 3 3
[2,] 3 3
[[8]]
[,1] [,2]
[1,] 3 3
[2,] 3 3
DATA
M <- structure(c(1L, 3L, 1L, 2L, 1L, 3L, 3L, 2L, 2L, 3L, 3L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 3L, 1L, 3L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L,
2L, 2L, 1L, 3L, 1L, 3L, 2L, 2L, 2L, 2L, 3L, 2L, 1L, 3L, 2L, 1L,
1L, 3L, 2L, 2L, 3L, 3L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L,
3L, 3L, 2L, 3L, 3L, 2L, 3L, 3L, 1L, 1L, 1L, 1L, 3L, 2L, 3L, 1L,
1L, 2L, 1L, 1L, 1L, 1L, 3L, 2L, 1L, 1L, 3L, 3L, 3L, 2L, 2L, 2L,
3L, 2L, 2L, 3L, 3L, 3L, 1L, 2L, 2L, 1L, 3L, 3L, 2L, 3L, 2L, 1L,
2L, 1L, 3L, 3L, 1L, 2L, 1L, 3L, 2L, 3L, 3L, 1L, 1L, 2L, 2L, 2L,
1L, 1L, 1L, 2L, 1L, 3L, 2L, 3L, 3L, 2L, 3L, 3L, 1L, 1L, 2L, 2L,
1L, 2L, 3L, 3L, 3L, 3L, 3L, 1L, 3L), .Dim = c(15L, 10L))
> M
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
[1,] 1 2 2 1 1 3 2 2 1 3
[2,] 3 2 1 3 3 1 2 3 1 3
[3,] 1 2 3 2 3 1 2 2 2 1
[4,] 2 3 1 2 2 2 3 1 2 1
[5,] 1 1 3 3 3 1 2 2 2 2
[6,] 3 3 2 3 3 1 2 1 1 2
[7,] 3 1 2 2 2 1 3 3 1 1
[8,] 2 1 2 2 3 1 3 3 1 2
[9,] 2 1 2 2 3 3 3 1 2 3
[10,] 3 1 3 2 1 2 1 2 1 3
[11,] 3 2 2 1 1 1 2 1 3 3
[12,] 1 1 1 2 1 1 2 3 2 3
[13,] 1 1 3 2 1 3 1 2 3 3
[14,] 1 2 2 2 3 3 3 3 3 1
[15,] 2 2 1 2 2 3 3 3 2 3
EDIT
The approach above is inefficient when with large matrix since all combinations checked. The method below is a R implementation of algorithm stated in https://www.geeksforgeeks.org/maximum-size-sub-matrix-with-all-1s-in-a-binary-matrix/, which is is far more efficient.
M <- unname(as.matrix(read.csv(file = "test2.csv")))
S <- matrix(0,nrow = nrow(M),ncol = ncol(M))
S[,1] <- M[,1]
for (i in 1:nrow(S)) {
for (j in 2:ncol(S)) {
if (M[i,j]==1) {
if (i==1) {
S[i,j] <- M[i,j]
} else {
S[i,j] <- min(c(S[i,j-1],S[i-1,j],S[i-1,j-1]))+1
}
}
}
}
inds <- which(S == max(S),arr.ind = TRUE)
w <- seq(max(S))-1
res <- lapply(seq(nrow(inds)), function(k) M[inds[k,"row"]-w,inds[k,"col"]-w])
I found the following answer to this question using dplyr:
M1 <- M %>% data.frame %>% mutate(sumVar = rowSums(.)) %>%
arrange(desc(sumVar)) %>% dplyr::select(-sumVar)
M2 <- M1 %>% as.matrix %>% t %>% data.frame %>%
mutate(sumVar = rowSums(.)) %>% arrange(desc(sumVar)) %>%
dplyr::select(-sumVar) %>% as.matrix %>% t %>% data.frame %>%
arrange_all(funs(desc(.)))
i <- 1
j <- 1
while(sum(M2[1:i,1:j]) == i*j){
i <- i+1
j <- j+1
M3 <- M2[1:i-1,1:j-1]
}
This is a toy data as #ThomasIsCoding proposed:
M <- structure(c(1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Dim = c(5L,
5L))
and this is the result:
> M
[,1] [,2] [,3] [,4] [,5]
[1,] 1 1 1 0 1
[2,] 1 1 1 1 1
[3,] 1 1 1 1 1
[4,] 1 1 1 1 1
[5,] 0 1 1 1 1
> M1
X1 X2 X3 X4 X5
1 1 1 1 1 1
2 1 1 1 1 1
3 1 1 1 1 1
4 1 1 1 0 1
5 0 1 1 1 1
> M2
X1 X2 X3 X4 X5
1 1 1 1 1 1
2 1 1 1 1 1
3 1 1 1 1 1
4 1 1 1 1 0
5 1 1 1 0 1
> M3
X1 X2 X3 X4
1 1 1 1 1
2 1 1 1 1
3 1 1 1 1
4 1 1 1 1
Note that some more functions should be added to keep the variable names and find them after using arrange!

Dataframe column to matrix by two other columns

I have a dataframe
df<-data.frame(i=rep(1:3,3),j=sort(rep(1:3,3)),v=sample(1:9,9))
df
i j v
1 1 1 3
2 2 1 1
3 3 1 9
4 1 2 8
5 2 2 5
6 3 2 4
7 1 3 7
8 2 3 2
9 3 3 6
that I want to transform to matrix M such that
M[i,j]<-df$v[which(df$i==i & df$j==j)]
is there an easy way to do that?
Based on your description, you can just do,
matrix(df$v, ncol = max(df$j))
# [,1] [,2] [,3]
#[1,] 2 4 7
#[2,] 3 1 5
#[3,] 8 6 9
Data Used:
dput(df)
structure(list(i = c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), j = c(1L,
1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L), v = c(2L, 3L, 8L, 4L, 1L, 6L,
7L, 5L, 9L)), class = "data.frame", row.names = c(NA, -9L))

count frequency based on values in 2 or more columns

I have a pretty simple question but I can't think of a way to do this without using if statements
The data I have looks something like:
df <- structure(list(years = c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L,
1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), id = c(1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L), x = structure(c(2L,
1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 2L,
1L), .Label = c("E", "I"), class = "factor")), .Names = c("years",
"id", "x"), class = "data.frame", row.names = c(NA, -18L))
so the table looks like:
years id x
1 1 1 I
2 2 1 E
3 3 1 E
4 1 1 E
5 2 1 I
6 3 1 I
7 1 2 I
8 2 2 E
9 3 2 I
10 1 2 E
11 2 2 E
12 3 2 I
13 1 3 I
14 2 3 E
15 3 3 I
16 1 3 I
17 2 3 I
18 3 3 E
I would like the output to report the fraction of x's that are "I" for each id and each year:
years id xnew
1 1 1 0.5
2 2 1 0.5
3 3 1 0.5
4 1 2 0.5
5 2 2 0.0
6 3 2 1.0
7 1 3 1.0
8 2 3 0.5
9 3 3 0.5
Any help would be greatly appreciated! Thank you!
aggregate(x ~ years + id, data=df, function(y) sum(y=="I")/length(y) )
years id x
1 1 1 0.5
2 2 1 0.5
3 3 1 0.5
4 1 2 0.5
5 2 2 0.0
6 3 2 1.0
7 1 3 1.0
8 2 3 0.5
9 3 3 0.5

Resources