Related
I'm generating random matrices filled with zeros and ones. Their dimensions may differ from one simulation to the next.
An example matrix below
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
[1,] 0 0 0 0 0 0 0 1 0 0
[2,] 0 1 1 0 0 0 0 0 0 0
[3,] 0 0 0 0 1 0 0 0 0 1
[4,] 0 1 0 0 0 0 0 0 0 0
[5,] 0 0 0 0 1 0 0 0 0 1
[6,] 1 0 1 0 0 0 1 1 1 0
[7,] 0 0 0 0 0 0 1 1 0 0
[8,] 0 0 0 0 0 0 0 0 0 0
[9,] 0 0 1 0 0 1 0 0 1 1
[10,] 0 0 0 0 0 0 0 1 0 0
And a little visualisation
Dput version.
structure(c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1,
0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0), .Dim = c(10L,
10L))
I would like to compute two things:
the number of clusters formed by ones (by cluster we mean a set of adjacent ones, where the elements on the diagonal are not adjacent),
the number of ones within each cluster.
I think I managed to solve the first point with this function
library(raster)
# Count the clusters of ones in a 0/1 matrix.
#
# grid: numeric matrix of zeros and ones.
# Cells are joined into a cluster only through their 4 direct neighbours
# (directions = 4), so diagonal contact does not connect them.
# Returns the largest cluster id assigned by clump(), i.e. the number of
# clusters found.
count_clusters <- function(grid) {
  clumps <- clump(raster(grid), directions = 4)
  # Slots are read with "@"; a "#" here would start a comment and the
  # function would return the whole data slot instead of the count.
  clumps@data@max
}
This function would return 14 for the matrix above which is correct.
Unfortunately, I don't know how to solve the second task. The needed function should return the following output: c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 5).
I would appreciate any hints or tips.
To compute the number of ones within each cluster:
# Example 10 x 10 matrix of zeros and ones (values are stored column by column).
grid <- structure(c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1,
0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0), .Dim = c(10L,
10L))
# Label connected groups of cells; directions = 4 joins only direct
# (non-diagonal) neighbours. The argument name is written out in full
# instead of relying on partial matching of "direc".
x <- clump(raster(grid), directions = 4)
Get the values from the RasterLayer's @data@values slot.
# Read the cell values out of the RasterLayer. Slots are accessed with "@";
# with "#" the rest of the line is a comment and vals would just be x.
vals <- x@data@values
Create a data frame with the values:
# One-column tibble (column "cluster") holding the values extracted into vals.
dt <- tibble(cluster = vals)
Remove NA values, group by cluster and count
# Drop cells that carry no cluster id, then count the cells per cluster.
labelled_cells <- filter(dt, !is.na(cluster))
result <- tally(group_by(labelled_cells, cluster))
# One count per cluster.
result$n
[1] 1 2 1 1 1 1 1 1 1 5 1 1 2 1
Using dplyr functions, I want to remove rows in which only column b equals 1 and the rest of columns are all 0.
Although I can do this:
library(dplyr, warn.conflicts = FALSE)
# Toy data: rows 4 and 6 have b == 1 with every other column 0, so they are
# the rows that should be removed.
trb <- tibble(
  a = c(1, 1, 1, 0, 0, 0),
  b = c(1, 1, 0, 1, 0, 1),
  c = c(1, 0, 1, 0, 0, 0)
)

# Explicit version: every other column has to be named by hand.
filter(trb, !(b == 1 & a == 0 & c == 0))
#> # A tibble: 4 x 3
#> a b c
#> <dbl> <dbl> <dbl>
#> 1 1 1 1
#> 2 1 1 0
#> 3 1 0 1
#> 4 0 0 0
I'm looking for a more scalable solution to account for data such as:
# Wider example with 26 columns (a to z). Rows 3 and 5 have b == 1 and 0 in
# every other column, so they are the rows that should be removed.
trb_2 <-
tibble::tribble(
~a, ~b, ~c, ~d, ~e, ~f, ~g, ~h, ~i, ~j, ~k, ~l, ~m, ~n, ~o, ~p, ~q, ~r, ~s, ~t, ~u, ~v, ~w, ~x, ~y, ~z,
0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0,
1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1,
0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1,
0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0,
0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0,
0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1
)
In trb_2 I still want to remove the rows in which b equals 1 and all the rest are 0.
Is there a scalable way to achieve this using dplyr::filter()?
Yes, using the new helper function dplyr::if_all() you can do this for no matter how many columns you have:
# Drop rows where b is 1 and every column other than b is 0; the column
# selection -b adapts to however many columns the data has.
filter(trb, !(b == 1 & if_all(-b, ~ .x == 0)))
Result:
# A tibble: 4 x 3
a b c
<dbl> <dbl> <dbl>
1 1 1 1
2 1 1 0
3 1 0 1
4 0 0 0
Breakdown of !(b == 1 & if_all(-b, ~ .x == 0)):
b == 1 will match rows where b is 1
if_all(-b, ~ .x == 0) will match rows where all columns except b are exactly 0
!(b == 1 & if_all(-b, ~ .x == 0)) combines these two expressions and removes the rows where both are true
# Keep a row unless b is 1 and b is the row's only non-zero entry.
# Counting non-zero cells (rather than cells equal to 1) follows the stated
# rule "all other columns are 0" even when a column holds a value other
# than 0 or 1; for pure 0/1 data the result is unchanged.
trb %>%
  filter(b != 1 | rowSums(. != 0) != 1)
# # A tibble: 4 x 3
# a b c
# <dbl> <dbl> <dbl>
# 1 1 1 1
# 2 1 1 0
# 3 1 0 1
# 4 0 0 0
I have a data frame, one column of which contains colon- and pound-separated strings.
data$col1
col1
1: 3#Tier_III_Uncertain EVS=[1, 0, 0, 1, 0, 0, 0, 0, 0, -1, 1, 1]
2: 3#Tier_III_Uncertain EVS=[0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0]
3: 4#Tier_III_Uncertain EVS=[0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 1, 0]
4: 2#Tier_IV_benign EVS=[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0]
5: 3#Tier_III_Uncertain EVS=[0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0]
6: 5#Tier_III_Uncertain EVS=[0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1]
I want to extract the elements of the string and split it into different columns.
col1 col2 col3 EVS1 ... EVS12
3#Tier_III_Uncertain EVS=[1, 0, 0, 1, 0, 0, 0, 0, 0, -1, 1, 1] 3 Tier_III_Uncertain 1 1
3#Tier_III_Uncertain EVS=[0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0] 3 Tier_III_Uncertain 0 0
4#Tier_III_Uncertain EVS=[0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 1, 0] 4 Tier_III_Uncertain 0 0
2#Tier_IV_benign EVS=[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0] 2 Tier_IV_benign 0 0
3#Tier_III_Uncertain EVS=[0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0] 3 Tier_III_Uncertain 0 0
5#Tier_III_Uncertain EVS=[0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1] 5 Tier_III_Uncertain 0 1
# Turn every character that is not a letter, digit, "_" or "-" into a space
# so each string becomes a whitespace-separated record.
cleaned <- gsub("[^A-Za-z_0-9-]", " ", data$col1)
# Fields per record: leading number, tier label, the literal "EVS" token
# (read as col4), then the 12 EVS values.
field_names <- c(paste0('col', 2:4), paste0('EVS', 1:12))
parsed <- read.table(text = cleaned, col.names = field_names)
# Drop the third column, which only holds the "EVS" token.
parsed[-3]
col2 col3 EVS1 EVS2 EVS3 EVS4 EVS5 EVS6 EVS7 EVS8 EVS9 EVS10 EVS11 EVS12
1 3 Tier_III_Uncertain 1 0 0 1 0 0 0 0 0 -1 1 1
2 3 Tier_III_Uncertain 0 0 0 1 0 0 0 0 0 1 1 0
3 4 Tier_III_Uncertain 0 0 0 1 0 0 0 0 2 0 1 0
4 2 Tier_IV_benign 0 0 0 1 0 0 0 0 0 0 1 0
5 3 Tier_III_Uncertain 0 0 0 1 0 0 0 0 1 0 1 0
6 5 Tier_III_Uncertain 0 0 1 1 0 0 0 0 1 0 1 1
Assuming DT shown reproducibly in the Note at the end replace non-word characters and also EVS= with space. Then read that using fread and set the names. Finally cbind DT to it.
# Blank out "EVS=" and every character that is not alphanumeric, "_" or "-".
# A plain "\\W" would also erase the minus sign and silently turn -1 into 1.
DT2 <- fread(text = gsub("EVS=|[^[:alnum:]_-]", " ", DT$col1))
# First two fields are the leading number and the tier label; the remaining
# fields are the EVS values, however many there are.
names(DT2) <- c("col2", "col3", paste0("EVS", seq_len(ncol(DT2) - 2L)))
cbind(DT, DT2)
Note
library(data.table)
L <- "3#Tier_III_Uncertain EVS=[1, 0, 0, 1, 0, 0, 0, 0, 0, -1, 1, 1]
3#Tier_III_Uncertain EVS=[0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0]
4#Tier_III_Uncertain EVS=[0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 1, 0]
2#Tier_IV_benign EVS=[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0]
3#Tier_III_Uncertain EVS=[0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0]
5#Tier_III_Uncertain EVS=[0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1]"
# Split the literal into lines directly; this avoids opening a
# textConnection that is never closed.
DT <- data.table(col1 = trimws(strsplit(L, "\n", fixed = TRUE)[[1]]))
I have a data frame that contains various combinations, as follows:
structure(list(`Q1` = c(0, 0, 0, 1, 0, 0, 0, 0, 0, 0), `Q2` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), `Q3` = c(0, 1, 0, 0, 0, 1, 1, 0, 0,
0), `Q4` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `Q5` = c(0, 0, 1, 0,
0, 1, 0, 1, 1, 0), `Q6` = c(1, 1, 0, 1, 1, 0, 0, 1, 1, 1), `Q7` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), `Q8` = c(0, 0, 0, 0, 0, 0, 0, 0, 0,
1), `Q9` = c(1, 0, 1, 0, 0, 1, 1, 0, 1, 0), `Q10` = c(0, 0, 0,
0, 0, 0, 0, 0, 0, 0), `Q11` = c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0),
`Q12` = c(1, 1, 1, 1, 1, 0, 1, 1, 0, 1)), row.names = c(NA,
-10L), class = "data.frame")
I also have a base data frame that holds different combinations together with the weightage for each combination.
structure(list(Q1 = c(0, 0, 0, 0, 0, 1, 0, 0, 0, 1), Q2 = c(0,
1, 1, 0, 0, 0, 0, 0, 0, 0), Q3 = c(1, 0, 0, 1, 0, 0, 0, 0, 0,
0), Q4 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Q5 = c(1, 0, 1, 0,
0, 0, 1, 0, 0, 1), Q6 = c(1, 1, 1, 0, 1, 0, 0, 1, 0, 1), Q7 = c(0,
0, 1, 1, 1, 0, 0, 0, 0, 0), Q8 = c(1, 0, 1, 0, 0, 1, 0, 0, 0,
0), Q9 = c(1, 0, 0, 0, 0, 0, 0, 1, 1, 0), Q10 = c(0, 0, 1, 0,
0, 1, 0, 0, 0, 0), Q11 = c(0, 0, 1, 0, 0, 1, 0, 0, 0, 0), Q12 = c(1,
0, 0, 0, 1, 0, 1, 0, 0, 0), RatingBinary = c(1, 1, 0, 1, 0, 1,
0, 1, 1, 1)), row.names = c(NA, 10L), class = "data.frame")
The problem statement is for each 1's combination in 1st data frame (i.e.Q6, Q9, Q12 in 1st row, Q3, Q6, Q12 in 2nd row), I need to get the number of rows that get satisfied in the base data frame.
For example: In the combination data frame (1st Df), in the 1st row Q6, Q9 & Q12 have the binary value 1. I need to get the count of this combination(Q6, Q9 & Q12 which have 1's) in the base data and get the number of rows that have the RatingBinary values 0's and 1's.
How can I get this implemented in R? Can anyone suggest a suitable solution for this scenario?
Here's an algorithmic approach.
Let's call a set in the first data frame a combo set; this is a set of three questions in a given row. Let's also call a set in the base data a base set; this is the set in a given row for which we are trying to find whether a given combo set is part of.
The approach is essentially to iterate through each combo set and find matches over all base sets. Sets seem to only be in threes, so I take advantage of that by hard coding a sum == 3 rather than doing an agnostic match. We store matches in a structure I call pair. A match is indicated by a 1. I define pair(x,y) where x is the row number of the combo data set and y is the row number of base dataset.
# pair[i, j] is 1 when base row j (df2) has a 1 in every question that is
# set to 1 in combo row i (df), and 0 otherwise.
# The matrix is sized from the data instead of being fixed at 10 x 10.
pair <- matrix(0, nrow = nrow(df), ncol = nrow(df2))
for (i in seq_len(nrow(df))) {
  ind <- which(df[i, ] == 1)
  # A combo row without any 1s has nothing to match; leave its row at 0.
  if (length(ind) == 0L) {
    next
  }
  # Every combo set in the example holds exactly three questions, so
  # comparing with length(ind) gives the same result as "sum == 3" there
  # while also handling sets of other sizes.
  hits <- rowSums(df2[, ind, drop = FALSE]) == length(ind)
  pair[i, ] <- as.numeric(hits)
}
The pair object is:
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
[1,] 1 0 0 0 0 0 0 0 0 0
[2,] 1 0 0 0 0 0 0 0 0 0
[3,] 1 0 0 0 0 0 0 0 0 0
[4,] 0 0 0 0 0 0 0 0 0 0
[5,] 0 0 0 0 0 0 0 0 0 0
[6,] 1 0 0 0 0 0 0 0 0 0
[7,] 1 0 0 0 0 0 0 0 0 0
[8,] 1 0 0 0 0 0 0 0 0 0
[9,] 1 0 0 0 0 0 0 0 0 0
[10,] 1 0 0 0 0 0 0 0 0 0
This means that only the first base set contains any of the combo sets, and it contains every combo set except combo set 4 and combo set 5. Because only one base set matches, the answer to your second question about the number of rows that have RatingBinary 0 or 1 becomes trivial -- it's just the RatingBinary for that row/base set in the base data set.
I have a list of 60 variables (30 pairs, essentially), and I need to combine the information across all the pairs to create new variables based on the data stored in each pair.
To give some context, I am working on a systematic review of prediction model studies, and I extracted data on which variables were considered for inclusion in the prediction model of each study (the first 30 variables) and which variables were included in the model (the second 30 variables)
All variables are binary.
The first 30 variables are written in the form “p_[varname]”
The second 30 are written in the form “p_[varname]_inc”.
I want to create a new variable that is called [varname] and takes the values “Not considered”, “Considered”, and “Included”.
In Stata, I could easily do this like so:
* Build one three-level status variable per predictor.
* "Included" requires both the considered flag and the included flag.
foreach v of [varname1]-[varname30] {
gen `v' = "Not considered" if p_`v' == 0
replace `v' = "Considered" if p_`v' == 1 & p_`v'_inc == 0
replace `v' = "Included" if p_`v' == 1 & p_`v'_inc == 1
}
In R, the only way I can figure out to do it is by copy and pasting the same ifelse statement for all variables, for example:
# Schematic only: the same nested ifelse() is repeated for every predictor.
# "...." and [varname] are placeholders, not runnable R.
predictor_vars %>%
  mutate(age = ifelse(p_age==1 & p_age_inc==1, "Included",
                      ifelse(p_age==1 & p_age_inc==0, "Considered", "Not considered")),
         sex = ifelse(p_sex==1 & p_sex_inc==1, "Included",
                      ifelse(p_sex==1 & p_sex_inc==0, "Considered", "Not considered")),
         ....
         [varname] = ifelse(p_[varname]==1 & p_[varname]_inc==1, "Included",
                            ifelse(p_[varname]==1 & p_[varname]_inc==0, "Considered", "Not considered"))
  )
Is there an easier way to do this in R / dplyr?
Edit: Sorry for not providing enough detail before (new here, but really appreciate the fast responses!). Here is a sample of the data
structure(list(p_age = structure(c(1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0), label = "Age", class = c("labelled",
"numeric")), p_age_inc = structure(c(1, 0, 0, 1, 1, 1, 1, 1,
1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0
), label = "Age", class = c("labelled", "numeric")), p_sex = structure(c(1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
1, 1, 1, 0, 1, 1, 0), label = "Sex", class = c("labelled", "numeric"
)), p_sex_inc = structure(c(1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0,
0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0), label = "Sex", class = c("labelled",
"numeric")), p_nation = structure(c(0, 0, 0, 0, 1, 1, 0, 1, 0,
1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0), label = "Nationality / country", class = c("labelled",
"numeric")), p_nation_inc = structure(c(0, 0, 0, 0, 0, 0, 0,
1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
0), label = "Nationality / country", class = c("labelled", "numeric"
)), p_prevtb = structure(c(0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1,
0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1,
0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0), label = "Treatment regimen / treatment status (retreatment)", class = c("labelled",
"numeric")), p_prevtb_inc = structure(c(0, 0, 0, 0, 0, 0, 0,
0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0,
0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0), label = "Previous TB / retreated TB", class = c("labelled",
"numeric"))), row.names = c(NA, 50L), class = "data.frame")
The first 6 rows (with 4 sets of selected predictors) look like this:
p_age p_age_inc p_sex p_sex_inc p_nation p_nation_inc p_prevtb
1 1 1 1 1 0 0 0
2 1 0 1 0 0 0 0
3 1 0 1 1 0 0 0
4 1 1 1 1 0 0 0
5 1 1 1 0 1 0 1
6 1 1 1 0 1 0 1
p_prevtb_inc
1 0
2 0
3 0
4 0
5 0
6 0
And I'd like to create the new variables like this:
p_age p_age_inc p_sex p_sex_inc p_nation p_nation_inc p_prevtb
1 1 1 1 1 0 0 0
2 1 0 1 0 0 0 0
3 1 0 1 1 0 0 0
4 1 1 1 1 0 0 0
5 1 1 1 0 1 0 1
6 1 1 1 0 1 0 1
p_prevtb_inc age sex nation prevtb
1 0 Included Included Not considered Not considered
2 0 Considered Considered Not considered Not considered
3 0 Considered Included Not considered Not considered
4 0 Included Included Not considered Not considered
5 0 Included Considered Considered Considered
6 0 Included Considered Considered Considered
This solution could be improved upon, but it works. The function does what the question asks, creating the variables in a standard for loop over the p_* variables, and then returns the result.
Argument Bind can be used to return just the newly created variables by setting Bind = FALSE.
# Derive one status column per predictor from its p_<name> / p_<name>_inc pair.
#
# Arguments:
#   X     data frame with binary (0/1 or logical) columns p_<name>
#         (predictor was considered) and p_<name>_inc (predictor was included).
#   Bind  if TRUE (default) return X with the new columns appended;
#         if FALSE return only the new columns.
#
# Value:
#   A data frame whose new columns are named <name> and contain "Included",
#   "Considered" or "Not considered". A row is NA when the pair does not
#   determine a status (p_<name> is missing, or the predictor was considered
#   but p_<name>_inc is missing). p_* columns without an _inc partner are
#   skipped.
create_var <- function(X, Bind = TRUE){
  xnames <- names(X)
  # A base variable is any p_* column that has a matching *_inc partner.
  # Pairing on the partner, rather than an unanchored regex, keeps names
  # such as p_bp_inc from being mistaken for a base variable.
  p_only <- xnames[startsWith(xnames, "p_") & paste0(xnames, "_inc") %in% xnames]
  res <- vector("list", length = length(p_only))
  for (i in seq_along(p_only)) {
    considered <- as.logical(X[[p_only[i]]])
    included <- as.logical(X[[paste0(p_only[i], "_inc")]])
    status <- rep(NA_character_, length(considered))
    # which() drops NA conditions, so undetermined rows stay NA instead of
    # being labelled "Not considered".
    status[which(!considered)] <- "Not considered"
    status[which(considered & !included)] <- "Considered"
    status[which(considered & included)] <- "Included"
    res[[i]] <- status
  }
  names(res) <- sub("^p_", "", p_only)
  # Keep the labels as character columns and leave the names untouched.
  res <- data.frame(res, check.names = FALSE, stringsAsFactors = FALSE)
  if (!Bind) {
    return(res)
  }
  # With no pairs there is nothing to append.
  if (length(p_only) == 0L) X else cbind(X, res)
}
# Default call: Bind = TRUE, so the new columns are bound to df1.
create_var(df1)
# The same call written with the pipe.
df1 %>% create_var()
# Return only the newly created columns.
df1 %>% create_var(Bind = FALSE)