I'm trying to merge two data.frames in R so that the values in test_1 are overwritten if they exist in test_2
Every time I try and join/merge them I end up with a bunch of NAs.
I can't work out a simple logic for if.na then use test_2. Is that what I'd need to do, or is there an easier way?
test_1 = structure(list(rn = c("Red", "Blue",
"Green", "Yellow", "Pink", "Gold"
), X2022.08.01 = c(0, 0, 0, 0, 0, 0), X2022.08.02 = c(0, 0, 0,
0, 0, 0), X2022.08.03 = c(0, 0, 0, 0, 0, 0), X2022.08.04 = c(0,
0, 0, 0, 0, 0), X2022.08.05 = c(0, 0, 0, 0, 0, 0), X2022.08.08 = c(0,
0, 0, 0, 0, 0), X2022.08.09 = c(0, 0, 0, 0, 0, 0), X2022.08.10 = c(0,
0, 0, 0, 0, 0), X2022.08.11 = c(0, 0, 0, 0, 0, 0), X2022.08.12 = c(0,
0, 0, 0, 0, 0), X2022.08.15 = c(0, 0, 0, 0, 0, 0), X2022.08.16 = c(0,
0, 0, 0, 0, 0), X2022.08.17 = c(0, 0, 0, 0, 0, 0), X2022.08.18 = c(0,
0, 0, 0, 0, 0), X2022.08.19 = c(0, 0, 0, 0, 0, 0), X2022.08.22 = c(0,
0, 0, 0, 0, 0), X2022.08.23 = c(0, 0, 0, 0, 0, 0), X2022.08.24 = c(0,
0, 0, 0, 0, 0), X2022.08.25 = c(0, 0, 0, 0, 0, 0), X2022.08.26 = c(0,
0, 0, 0, 0, 0), X2022.08.29 = c(0, 0, 0, 0, 0, 0), X2022.08.30 = c(0,
0, 0, 0, 0, 0), X2022.08.31 = c(0, 0, 0, 0, 0, 0)), row.names = c(NA,
6L), class = "data.frame")
test_2 = structure(list(rn = c("Blue", "Pink",
"Red", "Yellow", "Green", "Gold"
), X2022.08.01 = c(10, 10, 10, 10, 10, 10), X2022.08.03 = c(10, 10, 10,
10, 10, 10), X2022.08.04 = c(10, 10, 10, 10, 10, 10), X2022.08.05 = c(10,
10, 10, 10, 10, 10), X2022.08.26 = c(10, 10, 10, 10, 10, 10)), row.names = c(NA,
6L), class = "data.frame")
the desired output would look like this:
test_output = structure(list(rn = c("Red", "Blue",
"Green", "Yellow", "Pink", "Gold"
), X2022.08.01 = c(10, 10, 10, 10, 10, 10), X2022.08.02 = c(0, 0, 0,
0, 0, 0), X2022.08.03 = c(10, 10, 10, 10, 10, 10), X2022.08.04 = c(10,
10, 10, 10, 10, 10), X2022.08.05 = c(10, 10, 10, 10, 10, 10), X2022.08.08 = c(0,
0, 0, 0, 0, 0), X2022.08.09 = c(0, 0, 0, 0, 0, 0), X2022.08.10 = c(0,
0, 0, 0, 0, 0), X2022.08.11 = c(0, 0, 0, 0, 0, 0), X2022.08.12 = c(0,
0, 0, 0, 0, 0), X2022.08.15 = c(0, 0, 0, 0, 0, 0), X2022.08.16 = c(0,
0, 0, 0, 0, 0), X2022.08.17 = c(0, 0, 0, 0, 0, 0), X2022.08.18 = c(0,
0, 0, 0, 0, 0), X2022.08.19 = c(0, 0, 0, 0, 0, 0), X2022.08.22 = c(0,
0, 0, 0, 0, 0), X2022.08.23 = c(0, 0, 0, 0, 0, 0), X2022.08.24 = c(0,
0, 0, 0, 0, 0), X2022.08.25 = c(0, 0, 0, 0, 0, 0), X2022.08.26 = c(10,
10, 10, 10, 10, 10), X2022.08.29 = c(0, 0, 0, 0, 0, 0), X2022.08.30 = c(0,
0, 0, 0, 0, 0), X2022.08.31 = c(0, 0, 0, 0, 0, 0)), row.names = c(NA,
6L), class = "data.frame")
If you use the column names of the source as indices on both sides of the assignment, you can get replacement with
test_1[ ,colnames(test_2)] <- test_2[ , colnames(test_2)]
test_1
rn X2022.08.01 X2022.08.02 X2022.08.03 X2022.08.04 X2022.08.05 X2022.08.08 X2022.08.09
1 Blue 10 0 10 10 10 0 0
2 Pink 10 0 10 10 10 0 0
3 Red 10 0 10 10 10 0 0
4 Yellow 10 0 10 10 10 0 0
5 Green 10 0 10 10 10 0 0
6 Gold 10 0 10 10 10 0 0
X2022.08.10 X2022.08.11 X2022.08.12 X2022.08.15 X2022.08.16 X2022.08.17 X2022.08.18 X2022.08.19
1 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0
5 0 0 0 0 0 0 0 0
6 0 0 0 0 0 0 0 0
X2022.08.22 X2022.08.23 X2022.08.24 X2022.08.25 X2022.08.26 X2022.08.29 X2022.08.30 X2022.08.31
1 0 0 0 0 10 0 0 0
2 0 0 0 0 10 0 0 0
3 0 0 0 0 10 0 0 0
4 0 0 0 0 10 0 0 0
5 0 0 0 0 10 0 0 0
6 0 0 0 0 10 0 0 0
I doubt there could be a more simple method.. It could even work with a more limited number of rows as long as the indices area proper subset of the column and row names of the target matrix or dataframe. Note: I'm not understanding the issue with NA's. There were not NA's in either structure.
Related
i have two dataframes with another sizes but partly same row and column names.
I want to filter tpm_datExpr by finding the common column and row names with datTraits.
So I want my tpm_datExpr dataframe to have 278 columns.
> colnames(tpm_datExpr)[1:10]
[1] "D5247_S53_L006" "D5248_S54_L006" "D5249_S67_L008" "E02874_L1_S1_L001"
[5] "E02875_L1_S2_L001" "E02876_L1_S3_L001" "E02877_L1_S4_L001" "E02878_L1_S5_L001"
[9] "E02879_L1_S6_L001" "E02880_L1_S7_L001"
> rownames(datTraits)[1:10]
[1] "D5247_S53_L006" "D5248_S54_L006" "D5249_S67_L008" "E02874_L1_S1_L001"
[5] "E02875_L1_S2_L001" "E02876_L1_S3_L001" "E02877_L1_S4_L001" "E02878_L1_S5_L001"
[9] "E02879_L1_S6_L001" "E02880_L1_S7_L001"
> ncol(tpm_datExpr)
[1] 623
> nrow(datTraits)
[1] 278
You may use intersect on rownames and names. Example:
df1[intersect(rownames(df1), rownames(df2)),
intersect(names(df1), names(df2))]
# X1 X5 X6
# 2 0 0 0
# 4 0 0 0
# 8 0 0 0
# 9 0 0 0
Data:
df1 <- structure(list(X1 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), X2 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), X3 = c(0, 0, 0, 0, 0, 0, 0, 0, 0,
0), X4 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), X5 = c(0, 0, 0, 0,
0, 0, 0, 0, 0, 0), X6 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0)), class = "data.frame", row.names = c(NA,
-10L))
df2 <- structure(list(X1 = c(0, 0, 0, 0), X5 = c(0, 0, 0, 0), X6 = c(0,
0, 0, 0)), row.names = c(2L, 4L, 8L, 9L), class = "data.frame")
I'm generating random matrices filled with zero and ones. The dimension of them might be different for each simulation.
An example matrix below
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
[1,] 0 0 0 0 0 0 0 1 0 0
[2,] 0 1 1 0 0 0 0 0 0 0
[3,] 0 0 0 0 1 0 0 0 0 1
[4,] 0 1 0 0 0 0 0 0 0 0
[5,] 0 0 0 0 1 0 0 0 0 1
[6,] 1 0 1 0 0 0 1 1 1 0
[7,] 0 0 0 0 0 0 1 1 0 0
[8,] 0 0 0 0 0 0 0 0 0 0
[9,] 0 0 1 0 0 1 0 0 1 1
[10,] 0 0 0 0 0 0 0 1 0 0
And a little visualisation
Dput version.
structure(c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1,
0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0), .Dim = c(10L,
10L))
I would like to compute two things:
the number of clusters formed by ones (by cluster we mean a set of adjacent ones, where the elements on the diagonal are not adjacent),
the number of ones within each cluster.
I think I managed to solve the first point with this function
library(raster)
count_clusters <- function(grid) {
attr(clump(raster(grid), direc=4), 'data')#max
}
This function would return 14 for the matrix above which is correct.
Unfortunately I don't how to solve the second task. The needed function should return the following output: c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 5).
I would appreciate any hints or tips.
To compute the number of ones within each cluster:
grid <-structure(c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1,
0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0), .Dim = c(10L,
10L)) + 10L))
x <- clump(raster(grid), direc=4)
get the values from the RasterLayer #data#values.
vals <- x#data#values
Create a data frame with the values:
dt <- tibble(cluster = vals)
Remove NA values, group by cluster and count
result <- dt %>%
filter(!is.na(cluster)) %>%
group_by(cluster) %>%
tally()
result$n
[1] 1 2 1 1 1 1 1 1 1 5 1 1 2 1
require(gtsummary)
test <- structure(list(`1` = c(0, 0, 0, 0, 0, 0, 0, 0, 1, 0), `2` = c(1,0, 0, 0, 0, 1, 0, 1, 0, 0), `3` = c(0, 0, 0, 0, 0, 0, 0, 0, 0,0), `4` = c(1, 1, 0, 0, 1, 0, 0, 0, 0, 0), `5` = c(1, 0, 1, 1,0, 1, 1, 0, 0, 0), `6` = c(0, 0, 0, 1, 0, 0, 1, 0, 0, 0), `7` = c(0,0, 0, 0, 0, 0, 0, 0, 0, 0), `8` = c(0, 0, 0, 0, 0, 0, 0, 0, 0,0), `9` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `10` = c(0, 0, 0,0, 0, 0, 0, 0, 0, 1)), row.names = c(NA, -10L), class = c("tbl_df","tbl", "data.frame"))
In this example data, I have 10 categorical variables.
`1` `2` `3` `4` `5` `6` `7` `8` `9` `10`
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0 1 0 1 1 0 0 0 0 0
2 0 0 0 1 0 0 0 0 0 0
3 0 0 0 0 1 0 0 0 0 0
4 0 0 0 0 1 1 0 0 0 0
5 0 0 0 1 0 0 0 0 0 0
6 0 1 0 0 1 0 0 0 0 0
7 0 0 0 0 1 1 0 0 0 0
8 0 1 0 0 0 0 0 0 0 0
9 1 0 0 0 0 0 0 0 0 0
10 0 0 0 0 0 0 0 0 0 1
Since they can overlap each other, I have put them in different columns,
using 0 and 1, indicatting "yes" or "no" to having (or not having) the categorical variable.
When test %>% tbl_summary(), it creates:
I would like to sort this by frequency, but
test %>% tbl_summary(sort = list(everything() ~ "frequency"))
does not work.
Is there anyway to do this?
Thank you in advance.
The tbl_summary(sort=) argument sorts levels within a variable, not the order the variables appear in the table. Variables are appear in the table in the same order they appear in the data frame.
We can update the order in the data frame using the code below.
library(gtsummary)
#> #Uighur
packageVersion("gtsummary")
#> [1] '1.5.0'
test <- structure(list(`1` = c(0, 0, 0, 0, 0, 0, 0, 0, 1, 0), `2` = c(1,0, 0, 0, 0, 1, 0, 1, 0, 0), `3` = c(0, 0, 0, 0, 0, 0, 0, 0, 0,0), `4` = c(1, 1, 0, 0, 1, 0, 0, 0, 0, 0), `5` = c(1, 0, 1, 1,0, 1, 1, 0, 0, 0), `6` = c(0, 0, 0, 1, 0, 0, 1, 0, 0, 0), `7` = c(0,0, 0, 0, 0, 0, 0, 0, 0, 0), `8` = c(0, 0, 0, 0, 0, 0, 0, 0, 0,0), `9` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `10` = c(0, 0, 0,0, 0, 0, 0, 0, 0, 1)), row.names = c(NA, -10L), class = c("tbl_df","tbl", "data.frame"))
# order variables by prevelence
prev <- purrr::map_dbl(test, mean) %>% sort(decreasing = TRUE)
test %>%
select(all_of(names(prev))) %>%
tbl_summary() %>%
as_kable() # convert to kable for SO
Characteristic
N = 10
5
5 (50%)
2
3 (30%)
4
3 (30%)
6
2 (20%)
1
1 (10%)
10
1 (10%)
3
0 (0%)
7
0 (0%)
8
0 (0%)
9
0 (0%)
Created on 2021-12-10 by the reprex package (v2.0.1)
I am having a data frame that has various combinations as follows:
structure(list(`Q1` = c(0, 0, 0, 1, 0, 0, 0, 0, 0, 0), `Q2` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), `Q3` = c(0, 1, 0, 0, 0, 1, 1, 0, 0,
0), `Q4` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `Q5` = c(0, 0, 1, 0,
0, 1, 0, 1, 1, 0), `Q6` = c(1, 1, 0, 1, 1, 0, 0, 1, 1, 1), `Q7` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), `Q8` = c(0, 0, 0, 0, 0, 0, 0, 0, 0,
1), `Q9` = c(1, 0, 1, 0, 0, 1, 1, 0, 1, 0), `Q10` = c(0, 0, 0,
0, 0, 0, 0, 0, 0, 0), `Q11` = c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0),
`Q12` = c(1, 1, 1, 1, 1, 0, 1, 1, 0, 1)), row.names = c(NA,
-10L), class = "data.frame")
I am having a base data frame where I have different combinations with the weightage for each combination.
structure(list(Q1 = c(0, 0, 0, 0, 0, 1, 0, 0, 0, 1), Q2 = c(0,
1, 1, 0, 0, 0, 0, 0, 0, 0), Q3 = c(1, 0, 0, 1, 0, 0, 0, 0, 0,
0), Q4 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Q5 = c(1, 0, 1, 0,
0, 0, 1, 0, 0, 1), Q6 = c(1, 1, 1, 0, 1, 0, 0, 1, 0, 1), Q7 = c(0,
0, 1, 1, 1, 0, 0, 0, 0, 0), Q8 = c(1, 0, 1, 0, 0, 1, 0, 0, 0,
0), Q9 = c(1, 0, 0, 0, 0, 0, 0, 1, 1, 0), Q10 = c(0, 0, 1, 0,
0, 1, 0, 0, 0, 0), Q11 = c(0, 0, 1, 0, 0, 1, 0, 0, 0, 0), Q12 = c(1,
0, 0, 0, 1, 0, 1, 0, 0, 0), RatingBinary = c(1, 1, 0, 1, 0, 1,
0, 1, 1, 1)), row.names = c(NA, 10L), class = "data.frame")
The problem statement is for each 1's combination in 1st data frame (i.e.Q6, Q9, Q12 in 1st row, Q3, Q6, Q12 in 2nd row), I need to get the number of rows that get satisfied in the base data frame.
For example: In the combination data frame (1st Df), in the 1st row Q6, Q9 & Q12 have the binary value 1. I need to get the count of this combination(Q6, Q9 & Q12 which have 1's) in the base data and get the number of rows that have the RatingBinary values 0's and 1's.
How can I get this implemented in R? Can anyone suggest a suitable solution for this scenario?
Here's an algorithmic approach.
Let's call a set in the first data frame a combo set; this is a set of three questions in a given row. Let's also call a set in the base data a base set; this is the set in a given row for which we are trying to find whether a given combo set is part of.
The approach is essentially to iterate through each combo set and find matches over all base sets. Sets seem to only be in threes, so I take advantage of that by hard coding a sum == 3 rather than doing an agnostic match. We store matches in a structure I call pair. A match is indicated by a 1. I define pair(x,y) where x is the row number of the combo data set and y is the row number of base dataset.
pair <- matrix(nrow = 10, ncol = 10)
for(i in 1:nrow(df)) {
ind <- which(df[i,] == 1)
for(j in 1:nrow(df2)) {
if(sum(df2[j, ind]) == 3){
pair[i,j] <- 1
} else {
pair[i,j] <- 0
}
}
}
The pair object is:
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
[1,] 1 0 0 0 0 0 0 0 0 0
[2,] 1 0 0 0 0 0 0 0 0 0
[3,] 1 0 0 0 0 0 0 0 0 0
[4,] 0 0 0 0 0 0 0 0 0 0
[5,] 0 0 0 0 0 0 0 0 0 0
[6,] 1 0 0 0 0 0 0 0 0 0
[7,] 1 0 0 0 0 0 0 0 0 0
[8,] 1 0 0 0 0 0 0 0 0 0
[9,] 1 0 0 0 0 0 0 0 0 0
[10,] 1 0 0 0 0 0 0 0 0 0
This means for only the first combo set did we find matches in all the base sets except for base set 4 and base set 5. Because there is only one match, the answer to your second question about the number of rows that have RatingBinary 0 or 1 becomes trivial -- it's just the RatingBinary for that row/base set in the base data set.
I have a binary raster consisting of objects (1) and background (0). How can I find bounding boxes of objects? Each object should have its own bouding box.
Input:
library("raster")
mat = matrix(
c(0, 0, 0, 0, 0, 0,
0, 1, 1, 1, 0, 0,
0, 0, 1, 1, 1, 0,
0, 0, 0, 0, 0, 0,
0, 0, 1, 1, 0, 0,
0, 1, 1, 1, 1, 0,
0, 0, 1, 1, 0, 0,
0, 0, 0, 0, 0, 0),
ncol = 6, nrow = 8, byrow = TRUE
)
ras = raster(mat)
I expect this result:
result = raster(matrix(
c(0, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 0,
0, 1, 1, 1, 1, 0,
0, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 0,
0, 1, 0, 0, 1, 0,
0, 1, 1, 1, 1, 0,
0, 0, 0, 0, 0, 0),
ncol = 6, nrow = 8, byrow = TRUE
))
Here in an approach
Example data
library(raster)
mat = matrix(
c(0, 0, 0, 0, 0, 0,
0, 1, 1, 1, 0, 0,
0, 0, 1, 1, 1, 0,
0, 0, 0, 0, 0, 0,
0, 0, 1, 1, 0, 0,
0, 1, 1, 1, 1, 0,
0, 0, 1, 1, 0, 0,
0, 0, 0, 0, 0, 0),
ncol = 6, nrow = 8, byrow = TRUE )
ras <- raster(mat)
Solution
f <- function(r) {
x <- reclassify(ras, cbind(0,NA))
y <- rasterToPolygons(x, dissolve=TRUE)
z <- disaggregate(y)
e <- sapply(1:length(z), function(i) extent(z[i,]))
p <- spPolygons(e)
r <- rasterize(p, r)
d <- boundaries(r)
reclassify(d, cbind(NA, 0))
}
r <- f(res)
as.matrix(r)
# [,1] [,2] [,3] [,4] [,5] [,6]
#[1,] 0 0 0 0 0 0
#[2,] 0 1 1 1 1 0
#[3,] 0 1 1 1 1 0
#[4,] 0 0 0 0 0 0
#[5,] 0 1 1 1 1 0
#[6,] 0 1 0 0 1 0
#[7,] 0 1 1 1 1 0
#[8,] 0 0 0 0 0 0
It is of course possible that bounding boxes of objects overlap, in which there is no solution, I suppose.