Code multiple levels as 2 factor labels - r

I have a data frame with some columns:
that I want to transform into a factor,
in which the different levels are coded as -2, -1, 0, 1, 2, 3, 4
for which I want the levels to be labeled as 0 or 1 following this convention:
-2 = 1
-1 = 1
0 = 0
1 = 1
2 = 1
3 = 1
4 = 0
I have the following code:
#Convert to factor
dat[idx] <- lapply(dat[idx], factor, levels = -2:4, labels = c(1, 1, 0, 1, 1, 1, 0))
#Drop unused factor levels
dat <- droplevels(dat)
This works, but it gives me the following warning:
In `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels) else paste0(labels, :
duplicated levels in factors are deprecated
I tried the following code (per Ananda Mahto's suggestion) but no luck:
levels(dat[idx]) <- list(`0` = c(0, 4), `1` = c(-2, -1, 1, 2, 3))
I figured there has to be a better way to do this, any suggestions?
My data looks like this:
structure(list(Timestamp = structure(c(1380945601, 1380945603,
1380945605, 1380945607, 1380945609, 1380945611, 1380945613, 1380945615,
1380945617, 1380945619), class = c("POSIXct", "POSIXt"), tzone = ""),
FCB2C01 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), RCB2C01 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), FCB2C02 = c(1, 1, 1, 1, 1, 1,
1, 1, 1, 1), RCB2C02 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), FCB2C03 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), RCB2C03 = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0), FCB2C04 = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1), RCB2C04 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), FCB2C05 = c(1, 1, 1, 1, 1, 1,
1, 1, 1, 1), RCB2C05 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), FCB2C06 = c(1,
1, 1, 1, 1, 1, 1, 1, 1, 1), RCB2C06 = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0), FCB2C07 = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1), RCB2C07 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), FCB2C08 = c(1, 1, 1, 1, 1, 1,
1, 1, 1, 1), RCB2C08 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), FCB2C09 = c(1,
1, 1, 1, 1, 1, 1, 1, 1, 1), RCB2C09 = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0), FCB2C10 = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1), RCB2C10 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0)), .Names = c("Timestamp", "FCB2C01",
"RCB2C01", "FCB2C02", "RCB2C02", "FCB2C03", "RCB2C03", "FCB2C04",
"RCB2C04", "FCB2C05", "RCB2C05", "FCB2C06", "RCB2C06", "FCB2C07",
"RCB2C07", "FCB2C08", "RCB2C08", "FCB2C09", "RCB2C09", "FCB2C10",
"RCB2C10"), row.names = c(NA, 10L), class = "data.frame")
And the column index:
idx <- seq(2,21,2)

If I correctly understand what you want to do, the "right" way would be to use the levels function to specify your levels. Compare the following:
set.seed(1)
x <- sample(-2:4, 10, replace = TRUE)
YourApproach <- factor(x, levels = -2:4, labels = c(1, 1, 0, 1, 1, 1, 0))
# Warning message:
# In `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels) else paste0(labels, :
# duplicated levels in factors are deprecated
YourApproach
# [1] 1 0 1 0 1 0 0 1 1 1
# Levels: 1 1 0 1 1 1 0
xFac <- factor(x, levels = -2:4)
levels(xFac) <- list(`0` = c(0, 4), `1` = c(-2, -1, 1, 2, 3))
xFac
# [1] 1 0 1 0 1 0 0 1 1 1
# Levels: 0 1
Note the difference in the "Levels" in each of those. This also means that the underlying numeric representation is going to be different:
> as.numeric(YourApproach)
[1] 2 3 5 7 2 7 7 5 5 1
> as.numeric(xFac)
[1] 2 1 2 1 2 1 1 2 2 2

Related

Count the frequency of concecutive zeros in a every time they appear in a each row

I have this dataframe and would like to compute a count of zero sequences every time they appear in a row so that the output would be A: 2 4, B:1 2 1, C:2 5, D: 2 3, E: 1 1
df <- data.frame(
A=c(1, 0, 0, 1, 1, 0, 0, 0, 0),
B=c(0, 1, 1, 0, 0, 1, 0, 1, 1),
C=c(0, 0, 1, 1, 0, 0, 0, 0, 0),
D=c(0, 0, 1, 1, 1, 1, 0, 0, 0),
E=c(1, 0, 1, 1, 1, 1, 0, 1, 1)
)
We may use rle by looping over the columns of the data.frame and get the lengths of the 0 values in base R
lapply(df1, function(x) with(rle(x), lengths[!values]))
-output
$A
[1] 2 4
$B
[1] 1 2 1
$C
[1] 2 5
$D
[1] 2 3
$E
[1] 1 1
data
df1 <- structure(list(A = c(1, 0, 0, 1, 1, 0, 0, 0, 0), B = c(0, 1,
1, 0, 0, 1, 0, 1, 1), C = c(0, 0, 1, 1, 0, 0, 0, 0, 0), D = c(0,
0, 1, 1, 1, 1, 0, 0, 0), E = c(1, 0, 1, 1, 1, 1, 0, 1, 1)), row.names = c(NA,
-9L), class = "data.frame")

How to use dplyr to filter rows where value in a specific column is 1 and all the rest are 0?

Using dplyr functions, I want to remove rows in which only column b equals 1 and the rest of columns are all 0.
Although I can do this:
library(dplyr, warn.conflicts = FALSE)
trb <-
tribble(~a, ~b, ~c,
1, 1, 1,
1, 1, 0,
1, 0, 1,
0, 1, 0, # <~~~ remove this
0, 0, 0,
0, 1, 0 # <~~~ remove this
)
trb %>%
filter(!(b == 1 & a == 0 & c == 0))
#> # A tibble: 4 x 3
#> a b c
#> <dbl> <dbl> <dbl>
#> 1 1 1 1
#> 2 1 1 0
#> 3 1 0 1
#> 4 0 0 0
I'm looking for a more scalable solution to account for data such as:
trb_2 <-
tibble::tribble(
~a, ~b, ~c, ~d, ~e, ~f, ~g, ~h, ~i, ~j, ~k, ~l, ~m, ~n, ~o, ~p, ~q, ~r, ~s, ~t, ~u, ~v, ~w, ~x, ~y, ~z,
0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0,
1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1,
0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1,
0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0,
0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0,
0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1
)
In trb_2 I still want to remove the rows in which b equals 1 and all the rest are 0.
Is there a scalable way to achieve this using dplyr::filter()?
Yes, using the new helper function dplyr::if_all() you can do this for no matter how many columns you have:
trb %>%
filter(!(b == 1 & if_all(-b, ~ .x == 0)))
Result:
# A tibble: 4 x 3
a b c
<dbl> <dbl> <dbl>
1 1 1 1
2 1 1 0
3 1 0 1
4 0 0 0
Breakdown of !(b == 1 & if_all(-b, ~ .x == 0)):
b == 1 will match rows where b is 1
if_all(-b, ~ .x == 0) will match rows where all columns except b are exactly 0
!(b == 1 & if_all(-b, ~ .x == 0)) combines these two expressions and removes the rows where both are true
trb %>%
filter(b != 1 | rowSums(. == 1) != 1)
# # A tibble: 4 x 3
# a b c
# <dbl> <dbl> <dbl>
# 1 1 1 1
# 2 1 1 0
# 3 1 0 1
# 4 0 0 0

Is there an R function similar to foreach loops in Stata for creating new variables based on the name (or root) of existing variables?

I have a list of 60 variables (30 pairs, essentially), and I need to combine the information across all the pairs to create new variables based on the data stored in each pair.
To give some context, I am working on a systematic review of prediction model studies, and I extracted data on which variables were considered for inclusion in the prediction model of each study (the first 30 variables) and which variables were included in the model (the second 30 variables)
All variables are binary.
The first 30 variables are written in the form “p_[varname]”
The second 30 are written in the form “p_[varname]_inc”.
I want to create a new variable that is called [varname] and takes the values “Not considered”, “Considered”, and “Included”.
In Stata, I could easily do this like so:


foreach v of [varname1]-[varname30] {
gen `v' = "Not considered" if p_`v' == 0
replace `v' = "Considered" if p_`v' == 1 & p_`v'_inc == 0
replace `v' = "Included" if p_`v'_inc == 1 & p_`v'_inc == 1
}
In R, the only way I can figure out to do it is by copy and pasting the same ifelse statement for all variables, for example:
predictor_vars %>%
mutate(age = ifelse(p_age==1 & p_age_inc==1, "Included",
ifelse(p_age==1 & p_age_inc==0, "Considered", "Not considered")),
sex = ifelse(p_sex==1 & p_sex_inc==1, "Included",
ifelse(p_sex==1 & p_sex_inc==0, "Considered", "Not considered")),
....
[varname] = ifelse([varname]==1 & [varname]_inc==1, "Included",
ifelse([varname]==1 & [varname]==0, "Considered", "Not considered"))
)
Is there an easier way to do this in R / dplyr?
Edit: Sorry for not providing enough detail before (new here, but really appreciate the fast responses!). Here is a sample of the data
structure(list(p_age = structure(c(1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0), label = "Age", class = c("labelled",
"numeric")), p_age_inc = structure(c(1, 0, 0, 1, 1, 1, 1, 1,
1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0
), label = "Age", class = c("labelled", "numeric")), p_sex = structure(c(1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
1, 1, 1, 0, 1, 1, 0), label = "Sex", class = c("labelled", "numeric"
)), p_sex_inc = structure(c(1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0,
0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0), label = "Sex", class = c("labelled",
"numeric")), p_nation = structure(c(0, 0, 0, 0, 1, 1, 0, 1, 0,
1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0), label = "Nationality / country", class = c("labelled",
"numeric")), p_nation_inc = structure(c(0, 0, 0, 0, 0, 0, 0,
1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
0), label = "Nationality / country", class = c("labelled", "numeric"
)), p_prevtb = structure(c(0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1,
0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1,
0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0), label = "Treatment regimen / treatment status (retreatment)", class = c("labelled",
"numeric")), p_prevtb_inc = structure(c(0, 0, 0, 0, 0, 0, 0,
0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0,
0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0), label = "Previous TB / retreated TB", class = c("labelled",
"numeric"))), row.names = c(NA, 50L), class = "data.frame")
The first 5 rows (with 4 sets of selected predictors) looks like this:
p_age p_age_inc p_sex p_sex_inc p_nation p_nation_inc p_prevtb
1 1 1 1 1 0 0 0
2 1 0 1 0 0 0 0
3 1 0 1 1 0 0 0
4 1 1 1 1 0 0 0
5 1 1 1 0 1 0 1
6 1 1 1 0 1 0 1
p_prevtb_inc
1 0
2 0
3 0
4 0
5 0
6 0
And I'd like to create the new variables like this:
p_age p_age_inc p_sex p_sex_inc p_nation p_nation_inc p_prevtb
1 1 1 1 1 0 0 0
2 1 0 1 0 0 0 0
3 1 0 1 1 0 0 0
4 1 1 1 1 0 0 0
5 1 1 1 0 1 0 1
6 1 1 1 0 1 0 1
p_prevtb_inc age sex nation prevtb
1 0 Included Included Not considered Not considered
2 0 Considered Considered Not considered Not considered
3 0 Considered Included Not considered Not considered
4 0 Included Included Not considered Not considered
5 0 Included Considered Considered Considered
6 0 Included Considered Considered Considered
This solution could be improved upon but it works. The function does what the question asks for creating the variables in a standard for loop over the p_* variables. And then returns the result.
Argument Bind can be used to return just the newly created variables by setting Bind = FALSE.
create_var <- function(X, Bind = TRUE){
xnames <- names(X)
p_only <- grep('p_([^_]+$)', xnames, value = TRUE)
res <- vector('list', length = length(p_only))
for(i in seq_along(p_only)){
x <- X[[ p_only[i] ]]
y <- X[[paste0(p_only[i], '_inc')]]
res[[i]] <- case_when(
as.logical(x) & as.logical(y) ~ "Included",
as.logical(x) & !as.logical(y) ~ "Considered",
!as.logical(x) ~ "Not considered",
TRUE ~ "Not considered"
)
}
names(res) <- sub('^p_', '', p_only)
res <- do.call(cbind.data.frame, res)
if(Bind) cbind(X, res) else res
}
create_var(df1)
df1 %>% create_var()
df1 %>% create_var(Bind = FALSE)

How to find bounding boxes of objects in raster?

I have a binary raster consisting of objects (1) and background (0). How can I find bounding boxes of objects? Each object should have its own bouding box.
Input:
library("raster")
mat = matrix(
c(0, 0, 0, 0, 0, 0,
0, 1, 1, 1, 0, 0,
0, 0, 1, 1, 1, 0,
0, 0, 0, 0, 0, 0,
0, 0, 1, 1, 0, 0,
0, 1, 1, 1, 1, 0,
0, 0, 1, 1, 0, 0,
0, 0, 0, 0, 0, 0),
ncol = 6, nrow = 8, byrow = TRUE
)
ras = raster(mat)
I expect this result:
result = raster(matrix(
c(0, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 0,
0, 1, 1, 1, 1, 0,
0, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 0,
0, 1, 0, 0, 1, 0,
0, 1, 1, 1, 1, 0,
0, 0, 0, 0, 0, 0),
ncol = 6, nrow = 8, byrow = TRUE
))
Here in an approach
Example data
library(raster)
mat = matrix(
c(0, 0, 0, 0, 0, 0,
0, 1, 1, 1, 0, 0,
0, 0, 1, 1, 1, 0,
0, 0, 0, 0, 0, 0,
0, 0, 1, 1, 0, 0,
0, 1, 1, 1, 1, 0,
0, 0, 1, 1, 0, 0,
0, 0, 0, 0, 0, 0),
ncol = 6, nrow = 8, byrow = TRUE )
ras <- raster(mat)
Solution
f <- function(r) {
x <- reclassify(ras, cbind(0,NA))
y <- rasterToPolygons(x, dissolve=TRUE)
z <- disaggregate(y)
e <- sapply(1:length(z), function(i) extent(z[i,]))
p <- spPolygons(e)
r <- rasterize(p, r)
d <- boundaries(r)
reclassify(d, cbind(NA, 0))
}
r <- f(res)
as.matrix(r)
# [,1] [,2] [,3] [,4] [,5] [,6]
#[1,] 0 0 0 0 0 0
#[2,] 0 1 1 1 1 0
#[3,] 0 1 1 1 1 0
#[4,] 0 0 0 0 0 0
#[5,] 0 1 1 1 1 0
#[6,] 0 1 0 0 1 0
#[7,] 0 1 1 1 1 0
#[8,] 0 0 0 0 0 0
It is of course possible that bounding boxes of objects overlap, in which there is no solution, I suppose.

How to frequency of consecutive rows with the same number for several columns

I have a dataset as follows:
structure(list(chr = c(1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1,
1, 0, 0, 1, 1, 1, 1), leftPos = c(240000, 1080000, 1200000, 1320000,
1440000, 1800000, 2400000, 2520000, 3120000, 3360000, 3480000,
3600000, 3720000, 4200000, 4560000, 4920000, 5040000, 5160000,
5280000, 6e+06), chr.1 = c(1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1,
1, 0, 0, 1, 1, 1, 1, 1), leftPos.1 = c(240000, 1080000, 1200000,
1320000, 1440000, 1800000, 2400000, 2520000, 3120000, 3360000,
3480000, 3600000, 3720000, 4200000, 4560000, 4920000, 5040000,
5160000, 5280000, 6e+06), ASample = c(0,
0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0), Sample1 = c(0,
1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1), Sample2 = c(0,
1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1), Sample3 = c(0,
1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1), Sample4 = c(0,
0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1), Sample5 = c(0,
1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1), Sample6 = c(0,
1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1), Sample7 = c(0,
0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1), Sample8 = c(0,
1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1), Sample9 = c(0,
0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1), Sample10 = c(0,
1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1), Sample11 = c(0,
1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1), Sample12 = c(0,
1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1), Sample13 = c(0,
0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0), Sample14 = c(0,
1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1), Sample15 = c(0,
1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1)), .Names = c("chr",
"leftPos", "chr.1", "leftPos.1", "Sample1",
"Sample2",
"Sample3", "Sample4",
"Sample5", "Sample6",
"Sample7", "Sample8",
"Sample9", "Sample10",
"Sample11", "Sample12",
"Sample13", "Sample14",
"Sample15"), row.names = c(NA,
20L), class = "data.frame")
I need to count the number of rows per column where there is more than one of the same 1 or -1
I would like to be able to count the number of consecutive rows for each column, grouped by chr, that have three consecutive 1 or -1 within a chromosome (column called chr).
The ideal output would be something like (not taken from the dput data above)
chr numberOfConsecutive1s FreqSample1 FreqSample2 FreqSample3 etc
1 2 3 2 14
1 3 5 2 2
1 4 5 0 6
1 5 4 3 5
1 6 3 0 3
1 7 7 5 7
1 8 5 0 2
1 9 54 2 6
1 10 34 77 7
2 2 6 4 2
2 3 23 34 34
2 4 5 37 2
2 5 55 24 22
2 6 2 0 11
2 7 3 14 5
2 8 2 5 77
2 9 5 23 34
2 10 5 11 34
3 1 32 0 2
So far I have tried the following which simply converts non consecutive 1s into 0s so I am left with the consecutive 1s only. I don't know how to count them up as per the desired output.
dx<-DAT_list2res
f0 <- function( colNr, dx )
{
col <- dx[,colNr]
n1 <- which( col == 1 ) # The `1`-rows.
d0 <- which( diff(col) == 0 ) # Consecutive entries are equal.
dc0 <- which( diff(dx[,1]) == 0 ) # Same chromosome.
m <- intersect( n1-1, intersect( d0, dc0 ) )
return ( setdiff( 1:nrow(dx), union(m,m+1) ) )
}
g <- function( dx )
{
for ( i in 3:ncol(dx) ) { dx[f0(i,dx),i] <- 0 }
return ( dx )
}
dx<-g(dx)
EDIT
I also tried this as suggested by bramtayl:
result =
consecFreq %>%
select(-chr) %>%
gather(variable, chr, 5:190) %>%
group_by(variable) %>%
mutate(ID =
chr %>%
lag %>%
`!=`(chr) %>%
plyr::mapvalues(NA, FALSE) %>%
cumsum) %>%
count(variable, chr, ID) %>%
rename(numberOfConsecutive1s = n) %>%
count(variable, chr, numberOfConsecutive1s) %>%
spread(variable, n, fill = 0)
but it gives me an 'index out of bounds' error. If I ignore the spread line I get an odd output as well so I'm not sure this is the answer
REVISED
Based on clarifications, this approach uses the rle function for each chromosome to find runs of consecutive 1's or -1's and then table to count the number of runs for each value. This gives NA for samples which have no counts for a particular value so the last line of the code converts the NA's to 0's if this is helpful. Finally there seems to be a problem with your structure input in that Cytospongex10_SLX.9395.FastSeqK.fq.gz.res is missing from the .Names section of structure. This causes all the column names to be shifted and the last column name to be NA which can cause problems in the execution.
The code below assigns the correct names to the input data (in data.frame df) and then calculates the frequencies as described above.
colnames(data) <- c("chr",
"leftPos", "chr.1", "leftPos.1", "Cytospongex10_SLX.9395.FastSeqK.fq.gz.res", "Sample1",
"Sample2",
"Sample3", "Sample4",
"Sample5", "Sample6",
"Sample7", "Sample8",
"Sample9", "Sample10",
"Sample11", "Sample12",
"Sample13", "Sample14",
"Sample15")
chr_labels <- sort(unique(data$chr))
sampl_freqs <- data.frame(chr=1, numberOfConsecutive1s=1, count=0)
for( sampl in colnames(data)[-(1:5)]) {
freqs <- data.frame()
for( chr in chr_labels ) {
runs <- rle(data[data$chr == chr,sampl])
freqs_chr <- data.frame(chr=chr, table(runs$length[runs$values %in% c(-1,1)], dnn = "numberOfConsecutive1s") )
freqs <- rbind(freqs, freqs_chr)
}
sampl_freqs <- merge.data.frame(sampl_freqs, freqs, by = c("chr","numberOfConsecutive1s"), all=TRUE)
colnames(sampl_freqs) <- c(head(colnames(sampl_freqs),-1),paste("Freq",sampl,sep=""))
}
# clean up from sampl_freqs definition
sampl_freqs <- sampl_freqs[,-3]
# To convert NA's to 0
sampl_freqs <- data.frame(sampl_freqs[,1:2], sapply(sampl_freqs[,-(1:2)], function(x) ifelse(is.na(x), 0, x)))
Similar to above, but uses dplyr
library(reshape2)
library(dplyr)
df <- melt(data[,-(2:5)], id.vars="chr", variable.name="sample")
sampl_freqs <- df %>% group_by(sample, chr ) %>%
do(data.frame(unclass(rle(.$value))) %>%
filter(values %in% c(-1,1)) ) %>%
group_by(sample, chr, lengths) %>%
summarize(Freq = n() ) %>%
dcast( chr + lengths ~ sample, value.var = "Freq" )
sampl_freqs <- with(sampl_freqs,data.frame( chr, numberOfConsecutive1s = lengths ,
sapply(sampl_freqs[,-(1:2)], function(x) ifelse(is.na(x), 0, x))))
I think you want something like this:
library(dplyr)
library(tidyr)
min_chunk_length = 1
result =
data %>%
rename(chromosome = chr) %>%
select(chromosome, Sample1:Sample15) %>%
gather(sample, value, Sample1:Sample15) %>%
group_by(chromosome, sample) %>%
mutate(non_zero = value %in% c(1, -1),
chunk_ID =
non_zero %>%
lag %>%
`!=`(non_zero) %>%
plyr::mapvalues(NA, FALSE) %>%
cumsum) %>%
filter(non_zero = TRUE) %>%
group_by(chromosome, sample, chunk_ID) %>%
mutate(length_of_chunk = n()) %>%
filter(length_of_chunk > min_chunk_length) %>%
count(chromosome, sample) %>%
spread(sample, n, fill = 0)

Resources