Suppose I have a data frame that looks like this.
# start end motif
# 2 6 a
# 10 15 b
# 30 35 c
How would I create a data frame that fills in the remaining start and end locations like so up to a certain number Max_end:
Max_end <- 33
# start end motif
# 0 2 na # <- 0-2 are filled in because it is not in the original data frame
# 2 6 a # <- 2-6 are in the original
# 6 10 na # <- 6-10 is not
# 10 15 b # <- 10-15 is
# 15 30 na # and so on
# 30 33 c
And further, calculates the distance between the start and end locations and creates a one column data frame.
# Length motif
# 2 na
# 4 a
# 4 na
# 5 b
# 15 na
# 3 c
Currently this is how i am doing it: It is very inefficient
library(data.table)
library(stringi)
f <- fread('ABC.txt',header=F,skip=1)$V1
f <- paste(f, collapse = "")
motifs = c('GATC', 'CTGCAG', 'ACCACC', 'CC(A|T)GG', 'CCAC.{8}TGA(C|T)')
v <- na.omit(data.frame(do.call(rbind, lapply(stri_locate_all_regex(f, motifs), unlist))))
v <- v[order(v[,1]),]
v2difference <- "blah"
for(i in 2:nrow(v)){
if(v[i,1] > v[i-1,2]+2){v2difference[i] <- v[i,1]-v[i-1,2]-2}
}
v2difference[1] <- v[1,1]
v2 <- data.frame(Order=seq(1, 2*nrow(v), 2),Lengths=matrix(v2difference, ncol = 1),Motifs="na")
v1 <- data.frame(Order=seq(2, 2*nrow(v), 2),Lengths=(v$end-v$start+1),Motifs=na.omit(unlist(stri_extract_all_regex(f,motifs))))
V <- data.frame(Track=1,rbind(v1,v2))
V <- V[order(V$Order),]
B <- V[,!(names(V) %in% "Order")]
Max_end <- 33
breaks <- c(0, t(as.matrix(dat[,1:2])), Max_end) # get endpoints
breaks <- breaks[breaks <= Max_end]
merge(dat, data.frame(start=breaks[-length(breaks)], end=breaks[-1]), all=T)
# start end motif
# 1 0 2 <NA>
# 2 2 6 a
# 3 6 10 <NA>
# 4 10 15 b
# 5 15 30 <NA>
# 6 30 33 <NA>
# 7 30 35 c
To specify a start and endpoint, you could do
Max_end <- 33
Max_start <- 10
breaks <- unique(c(Max_start, t(as.matrix(dat[,1:2])), Max_end))
breaks <- breaks[breaks <= Max_end & breaks >= Max_start]
merge(dat, data.frame(start=breaks[-length(breaks)], end=breaks[-1]), all.y=T)
# start end motif
# 1 10 15 b
# 2 15 30 <NA>
# 3 30 33 <NA>
Note: this doesn't include "c" in the shortened final interval, you would need to decide if that values gets included or not when the interval changes.
Related
I generated a data frame (df) in R (see below). If I use the column "x2" instead of "x2a" to make the data frame everything works well. However, as soon as I use "x2a" instead of "x2" I get an error because the input of "x2a" is of various lengths. Do you have an idea how I can change the code that it is going to work with column "x2a"?
Error message with "x2a":
Error in data.frame(Id = rep(df$Id), Noise = unlist(split_it), Start = rep(df$Start), :
arguments imply differing number of rows: 3, 16
Code to reproduce the data frame and error
x1 <- c("A", "B", "C")
x2 <- c("[1,3,5,6,7]","[5,7,8,9,10]","[3,4,5,8,9]")
x2a <- c("[1,3,5]","[5,7,8,9,10, 20, 30, 24]","[3,4,5,8,9]")
x3 <- c(8000, 74555, 623334)
x4 <- c(9000, 76000, 623500)
df <- data.frame(cbind(x1, x2a, x3, x4))
colnames(df) <- c("Id", "Noise", "Start", "End")
df$Start <- as.numeric(as.character(df$Start))
df$End <- as.numeric(as.character(df$End))
# remove square brackets
df$Noise <- gsub("\\[|\\]", "", df$Noise)
# split
split_it <- strsplit(df$Noise, split = ",")
df_2 <- data.frame(Id = rep(df$Id), Noise = unlist(split_it), Start = rep(df$Start), End = rep(df$End))
df_2 <- df_2[order(df_2$Id),]
rownames(df_2) <- NULL
base R
What I'm inferring you want is not something R can "intuit" for you: you want it to repeat the values in Id based on the number of elements found when strsplit did its work. (How should R know to look in one object and arbitrarily repeat another?)
Try using rep(., times=.) to specify how many times each element of Id (etc) should be repeated in order to stay "in step" with Noise.
# split
split_it <- strsplit(df$Noise, split = ",")
n <- lengths(split_it)
print(n)
# [1] 3 8 5
df_2 <- data.frame(Id = rep(df$Id, times=n),
Noise = unlist(split_it),
Start = rep(df$Start, times=n),
End = rep(df$End, times=n))
df_2 <- df_2[order(df_2$Id),]
rownames(df_2) <- NULL
df_2
# Id Noise Start End
# 1 A 1 8000 9000
# 2 A 3 8000 9000
# 3 A 5 8000 9000
# 4 B 5 74555 76000
# 5 B 7 74555 76000
# 6 B 8 74555 76000
# 7 B 9 74555 76000
# 8 B 10 74555 76000
# 9 B 20 74555 76000
# 10 B 30 74555 76000
# 11 B 24 74555 76000
# 12 C 3 623334 623500
# 13 C 4 623334 623500
# 14 C 5 623334 623500
# 15 C 8 623334 623500
# 16 C 9 623334 623500
dplyr
library(dplyr)
df %>%
mutate(Noise = strsplit(Noise, split = ",")) %>%
unnest(Noise) %>%
mutate(Noise = as.integer(Noise)) # I'm inferring this is desired, not required
# # A tibble: 16 x 4
# Id Noise Start End
# <chr> <int> <dbl> <dbl>
# 1 A 1 8000 9000
# 2 A 3 8000 9000
# 3 A 5 8000 9000
# 4 B 5 74555 76000
# 5 B 7 74555 76000
# 6 B 8 74555 76000
# 7 B 9 74555 76000
# 8 B 10 74555 76000
# 9 B 20 74555 76000
# 10 B 30 74555 76000
# 11 B 24 74555 76000
# 12 C 3 623334 623500
# 13 C 4 623334 623500
# 14 C 5 623334 623500
# 15 C 8 623334 623500
# 16 C 9 623334 623500
In a clustered dataset, I want to randomly pick some clusters and then add some simulated observations to the selected clusters. Then I want to create a dataset that combines the simulated and original observations from the selected clusters with all the original observations from the unselected clusters. I would also like to repeat this process many times and thus create many (maybe 1000) new datasets. I managed to do this using for loop but would like to know if there is any more efficient and concise way to accomplish this. Here is an example dataset:
## simulate some data
y <- rnorm(20)
x <- rnorm(20)
z <- rep(1:5, 4)
w <- rep(1:4, each=5)
dd <- data.frame(id=z, cluster=w, x=x, y=y)
# id cluster x y
# 1 1 1 0.30003855 0.65325768
# 2 2 1 -1.00563626 -0.12270866
# 3 3 1 0.01925927 -0.41367651
# 4 4 1 -1.07742065 -2.64314895
# 5 5 1 0.71270333 -0.09294102
# 6 1 2 1.08477509 0.43028470
# 7 2 2 -2.22498770 0.53539884
# 8 3 2 1.23569346 -0.55527835
# 9 4 2 -1.24104450 1.77950291
# 10 5 2 0.45476927 0.28642442
# 11 1 3 0.65990264 0.12631586
# 12 2 3 -0.19988983 1.27226678
# 13 3 3 -0.64511396 -0.71846622
# 14 4 3 0.16532102 -0.45033862
# 15 5 3 0.43881870 2.39745248
# 16 1 4 0.88330282 0.01112919
# 17 2 4 -2.05233698 1.63356842
# 18 3 4 -1.63637927 -1.43850664
# 19 4 4 1.43040234 -0.19051680
# 20 5 4 1.04662885 0.37842390
cl <- split(dd, dd$cluster) ## split the data based on clusters
k <- length(dd$id)
l <- length(cl)
`%notin%` <- Negate(`%in%`) ## define "not in" to exclude unselected clusters so
## as to retain their original observations
A clsamp function in the following code is then created which includes two for loops. The first for loop is to exclude the unselected clusters and the second for loop is to simulate new observations and append them to the selected clusters. Note that I randomly sample 2 clusters (10% of the total number of observations), without replacement
clsamp <- function(cl, k) {
a <- sample(cl, size=0.1*k, replace=FALSE)
jud <- (names(cl) %notin% names(a))
need <- names(cl)[jud]
T3 <- NULL
for (k in need) {
T3 <- rbind(T3, cl[[k]])
}
subt <- NULL
s <- a
for (j in 1:2) {
y <- rnorm(2)
x <- rnorm(2)
d <- cbind(id=nrow(a[[j]]) + c(1:length(x)),
cluster=unique(a[[j]]$cluster), x, y)
s[[j]] <- rbind(a[[j]], d)
subt <- rbind(subt, s[[j]])
}
T <- rbind(T3, subt)
return(T)
}
Finally, this creates a list of 5 datasets each of which combines the simulated and original observations from the selected clusters with all the original observations from the unselected clusters
Q <- vector(mode="list", length=5)
for (i in 1:length(Q)) {
Q[[i]] <- clsamp(cl, 20)
}
Anyone knows a shorter way to do this? Maybe use the replicate function? Thanks.
This generates a sizeX2 matrix of random values and cbinds sampled cluster names and consecutive ids to it. It directly starts with dd and also works when you convert dd to a matrix mm, which might be slightly faster. Output is a data frame, though. Instead of your k I use f to directly calculate the number of rows that should be added to the two selected clusters. In case the size gets zero, the original data frame is returned.
clsamp2 <- function(m, f=.1) {
size <- round(nrow(m)*f)
if (size == 0) as.data.frame(m)
else {
ids <- unique(m[,1])
cls <- unique(m[,2])
rd <- matrix(rnorm(size * 4), ncol=2, dimnames=list(NULL, c("x", "y")))
out <- rbind.data.frame(m, cbind(id=rep(max(ids) + 1:size, each=2),
cluster=sample(cls, 2), rd))
`rownames<-`(out[order(out$cluster, out$id), ], NULL)
}
}
Result
set.seed(42) ## same seed also used for creating `dd`
clsamp2(dd, .1)
## or
mm <- as.matrix(dd)
clsamp2(mm, .1)
# id cluster x y
# 1 1 1 -0.30663859 1.37095845
# 2 2 1 -1.78130843 -0.56469817
# 3 3 1 -0.17191736 0.36312841
# 4 4 1 1.21467470 0.63286260
# 5 5 1 1.89519346 0.40426832
# 6 1 2 -0.43046913 -0.10612452
# 7 2 2 -0.25726938 1.51152200
# 8 3 2 -1.76316309 -0.09465904
# 9 4 2 0.46009735 2.01842371
# 10 5 2 -0.63999488 -0.06271410
# 11 6 2 1.37095845 0.40426832
# 12 7 2 0.36312841 1.51152200
# 13 1 3 0.45545012 1.30486965
# 14 2 3 0.70483734 2.28664539
# 15 3 3 1.03510352 -1.38886070
# 16 4 3 -0.60892638 -0.27878877
# 17 5 3 0.50495512 -0.13332134
# 18 1 4 -1.71700868 0.63595040
# 19 2 4 -0.78445901 -0.28425292
# 20 3 4 -0.85090759 -2.65645542
# 21 4 4 -2.41420765 -2.44046693
# 22 5 4 0.03612261 1.32011335
# 23 6 4 -0.56469817 -0.10612452
# 24 7 4 0.63286260 -0.09465904
To create the list of five samples, you may use replicate.
replicate(5, clsamp2(dd, .1), simplify=FALSE)
Running time is negligible.
system.time(replicate(1000, clsamp2(dd, .1), simplify=FALSE))
# user system elapsed
# 0.44 0.03 0.44
I have a dataframe, say
df = data.frame(x = c("a","a","b","b","b","c","d","t","c","b","t","c","t","a","a","b","d","t","t","c"),
y = c(2,4,5,2,6,2,4,5,2,6,2,4,5,2,6,2,4,5,2,6))
I want to remove only those rows in which one or multiple ts are directly in between a d and a c, in all other cases I want to retain the cases. So for this example, I would like to remove the ts on row 8, 18 and 19, but keep the others. I have over thousands of cases so doing this manually would be a true horror. Any help is very much appreciated.
One option would be to use rle to get runs of the same string and then you can use an sapply to check forward/backward and return all the positions you want to drop:
rle_vals <- rle(as.character(df$x))
drop <- unlist(sapply(2:length(rle_vals$values), #loop over values
function(i, vals, lengths) {
if(vals[i] == "t" & vals[i-1] == "d" & vals[i+1] == "c"){#Check if value is "t", previous is "d" and next is "c"
(sum(lengths[1:i-1]) + 1):sum(lengths[1:i]) #Get row #s
}
},vals = rle_vals$values, lengths = rle_vals$lengths))
drop
#[1] 8 18 19
df[-drop,]
# x y
#1 a 2
#2 a 4
#3 b 5
#4 b 2
#5 b 6
#6 c 2
#7 d 4
#9 c 2
#10 b 6
#11 t 2
#12 c 4
#13 t 5
#14 a 2
#15 a 6
#16 b 2
#17 d 4
#20 c 6
This also works, by collapsing to a string, identifying groups of t's between d and c (or c and d - not sure whether you wanted this option as well), then working out where they are and removing the rows as appropriate.
df = data.frame(x=c("a","a","b","b","b","c","d","t","c","b","t","c","t","a","a","b","d","t","t","c"),
y=c(2,4,5,2,6,2,4,5,2,6,2,4,5,2,6,2,4,5,2,6),stringsAsFactors = FALSE)
dfs <- paste0(df$x,collapse="") #collapse to a string
dfs2 <- do.call(rbind,lapply(list(gregexpr("dt+c",dfs),gregexpr("ct+d",dfs)),
function(L) data.frame(x=L[[1]],y=attr(L[[1]],"match.length"))))
dfs2 <- dfs2[dfs2$x>0,] #remove any -1 values (if string not found)
drop <- unlist(mapply(function(a,b) (a+1):(a+b-2),dfs2$x,dfs2$y))
df2 <- df[-drop,]
Here is another solution with base R:
df = data.frame(x = c("a","a","b","b","b","c","d","t","c","b","t","c","t","a","a","b","d","t","t","c"),
y = c(2,4,5,2,6,2,4,5,2,6,2,4,5,2,6,2,4,5,2,6))
#
s <- paste0(df$x, collapse="")
L <- c(NA, NA)
while (TRUE) {
r <- regexec("dt+c", s)[[1]]
if (r[1]==-1) break
L <- rbind(L, c(pos=r[1]+1, length=attr(r, "match.length")-2))
s <- sub("d(t+)c", "x\\1x", s)
}
L <- L[-1,]
drop <- unlist(apply(L,1, function(x) seq(from=x[1], len=x[2])))
df[-drop, ]
# > drop
# 8 18 19
# > df[-drop, ]
# x y
# 1 a 2
# 2 a 4
# 3 b 5
# 4 b 2
# 5 b 6
# 6 c 2
# 7 d 4
# 9 c 2
# 10 b 6
# 11 t 2
# 12 c 4
# 13 t 5
# 14 a 2
# 15 a 6
# 16 b 2
# 17 d 4
# 20 c 6
With gregexpr() it is shorter:
s <- paste0(df$x, collapse="")
g <- gregexpr("dt+c", s)[[1]]
L <- data.frame(pos=g+1, length=attr(g, "match.length")-2)
drop <- unlist(apply(L,1, function(x) seq(from=x[1], len=x[2])))
df[-drop, ]
I have a complete dataframe. I want to 20% of the values in the dataframe to be replaced by NAs to simulate random missing data.
A <- c(1:10)
B <- c(11:20)
C <- c(21:30)
df<- data.frame(A,B,C)
Can anyone suggest a quick way of doing that?
df <- data.frame(A = 1:10, B = 11:20, c = 21:30)
head(df)
## A B c
## 1 1 11 21
## 2 2 12 22
## 3 3 13 23
## 4 4 14 24
## 5 5 15 25
## 6 6 16 26
as.data.frame(lapply(df, function(cc) cc[ sample(c(TRUE, NA), prob = c(0.85, 0.15), size = length(cc), replace = TRUE) ]))
## A B c
## 1 1 11 21
## 2 2 12 22
## 3 3 13 23
## 4 4 14 24
## 5 5 NA 25
## 6 6 16 26
## 7 NA 17 27
## 8 8 18 28
## 9 9 19 29
## 10 10 20 30
It's a random process, so it might not give 15% every time.
You can unlist the data.frame and then take a random sample, then put back in a data.frame.
df <- unlist(df)
n <- length(df) * 0.15
df[sample(df, n)] <- NA
as.data.frame(matrix(df, ncol=3))
It can be done a bunch of different ways using sample().
If you are in the mood to use purrr instead of lapply, you can also do it like this:
> library(purrr)
> df <- data.frame(A = 1:10, B = 11:20, C = 21:30)
> df
A B C
1 1 11 21
2 2 12 22
3 3 13 23
4 4 14 24
5 5 15 25
6 6 16 26
7 7 17 27
8 8 18 28
9 9 19 29
10 10 20 30
> map_df(df, function(x) {x[sample(c(TRUE, NA), prob = c(0.8, 0.2), size = length(x), replace = TRUE)]})
# A tibble: 10 x 3
A B C
<int> <int> <int>
1 1 11 21
2 2 12 22
3 NA 13 NA
4 4 14 NA
5 5 15 25
6 6 16 26
7 7 17 27
8 8 NA 28
9 9 19 29
10 10 20 30
Same result, using binomial distribution:
dd=dim(df)
nna=20/100 #overall
df1<-df
df1[matrix(rbinom(prod(dd), size=1,prob=nna)==1,nrow=dd[1])]<-NA
df1
May i suggest a first function (ggNAadd) designed to do this, and improve it with a second function providing graphical distribution of the NAs created (ggNA)
What is neat is the possibility to input either a proportion of a fixed number of NAs.
ggNAadd = function(data, amount, plot=F){
temp <- data
amount2 <- ifelse(amount<1, round(prod(dim(data))*amount), amount)
if (amount2 >= prod(dim(data))) stop("exceeded data size")
for (i in 1:amount2) temp[sample.int(nrow(temp), 1), sample.int(ncol(temp), 1)] <- NA
if (plot) print(ggNA(temp))
return(temp)
}
And the plotting function:
ggNA = function(data, alpha=0.5){
require(ggplot2)
DF <- data
if (!is.matrix(data)) DF <- as.matrix(DF)
to.plot <- cbind.data.frame('y'=rep(1:nrow(DF), each=ncol(DF)),
'x'=as.logical(t(is.na(DF)))*rep(1:ncol(DF), nrow(DF)))
size <- 20 / log( prod(dim(DF)) ) # size of point depend on size of table
g <- ggplot(data=to.plot) + aes(x,y) +
geom_point(size=size, color="red", alpha=alpha) +
scale_y_reverse() + xlim(1,ncol(DF)) +
ggtitle("location of NAs in the data frame") +
xlab("columns") + ylab("lines")
pc <- round(sum(is.na(DF))/prod(dim(DF))*100, 2) # % NA
print(paste("percentage of NA data: ", pc))
return(g)
}
Which gives (using ggplot2 as graphical output):
ggNAadd(df, amount=0.20, plot=TRUE)
## [1] "percentage of NA data: 20"
## A B c
## 1 1 11 21
## 2 2 12 22
## 3 3 13 23
## 4 4 NA 24
## ..
Of course, as mentioned earlier, if you ask too many NAs the actual percentage will drop because of repetitions.
A mutate_all approach:
df %>%
dplyr::mutate_all(~ifelse(sample(c(TRUE, FALSE), size = length(.), replace = TRUE, prob = c(0.8, 0.2)),
as.character(.), NA))
I have a complete dataframe. I want to 20% of the values in the dataframe to be replaced by NAs to simulate random missing data.
A <- c(1:10)
B <- c(11:20)
C <- c(21:30)
df<- data.frame(A,B,C)
Can anyone suggest a quick way of doing that?
df <- data.frame(A = 1:10, B = 11:20, c = 21:30)
head(df)
## A B c
## 1 1 11 21
## 2 2 12 22
## 3 3 13 23
## 4 4 14 24
## 5 5 15 25
## 6 6 16 26
as.data.frame(lapply(df, function(cc) cc[ sample(c(TRUE, NA), prob = c(0.85, 0.15), size = length(cc), replace = TRUE) ]))
## A B c
## 1 1 11 21
## 2 2 12 22
## 3 3 13 23
## 4 4 14 24
## 5 5 NA 25
## 6 6 16 26
## 7 NA 17 27
## 8 8 18 28
## 9 9 19 29
## 10 10 20 30
It's a random process, so it might not give 15% every time.
You can unlist the data.frame and then take a random sample, then put back in a data.frame.
df <- unlist(df)
n <- length(df) * 0.15
df[sample(df, n)] <- NA
as.data.frame(matrix(df, ncol=3))
It can be done a bunch of different ways using sample().
If you are in the mood to use purrr instead of lapply, you can also do it like this:
> library(purrr)
> df <- data.frame(A = 1:10, B = 11:20, C = 21:30)
> df
A B C
1 1 11 21
2 2 12 22
3 3 13 23
4 4 14 24
5 5 15 25
6 6 16 26
7 7 17 27
8 8 18 28
9 9 19 29
10 10 20 30
> map_df(df, function(x) {x[sample(c(TRUE, NA), prob = c(0.8, 0.2), size = length(x), replace = TRUE)]})
# A tibble: 10 x 3
A B C
<int> <int> <int>
1 1 11 21
2 2 12 22
3 NA 13 NA
4 4 14 NA
5 5 15 25
6 6 16 26
7 7 17 27
8 8 NA 28
9 9 19 29
10 10 20 30
Same result, using binomial distribution:
dd=dim(df)
nna=20/100 #overall
df1<-df
df1[matrix(rbinom(prod(dd), size=1,prob=nna)==1,nrow=dd[1])]<-NA
df1
May i suggest a first function (ggNAadd) designed to do this, and improve it with a second function providing graphical distribution of the NAs created (ggNA)
What is neat is the possibility to input either a proportion of a fixed number of NAs.
ggNAadd = function(data, amount, plot=F){
temp <- data
amount2 <- ifelse(amount<1, round(prod(dim(data))*amount), amount)
if (amount2 >= prod(dim(data))) stop("exceeded data size")
for (i in 1:amount2) temp[sample.int(nrow(temp), 1), sample.int(ncol(temp), 1)] <- NA
if (plot) print(ggNA(temp))
return(temp)
}
And the plotting function:
ggNA = function(data, alpha=0.5){
require(ggplot2)
DF <- data
if (!is.matrix(data)) DF <- as.matrix(DF)
to.plot <- cbind.data.frame('y'=rep(1:nrow(DF), each=ncol(DF)),
'x'=as.logical(t(is.na(DF)))*rep(1:ncol(DF), nrow(DF)))
size <- 20 / log( prod(dim(DF)) ) # size of point depend on size of table
g <- ggplot(data=to.plot) + aes(x,y) +
geom_point(size=size, color="red", alpha=alpha) +
scale_y_reverse() + xlim(1,ncol(DF)) +
ggtitle("location of NAs in the data frame") +
xlab("columns") + ylab("lines")
pc <- round(sum(is.na(DF))/prod(dim(DF))*100, 2) # % NA
print(paste("percentage of NA data: ", pc))
return(g)
}
Which gives (using ggplot2 as graphical output):
ggNAadd(df, amount=0.20, plot=TRUE)
## [1] "percentage of NA data: 20"
## A B c
## 1 1 11 21
## 2 2 12 22
## 3 3 13 23
## 4 4 NA 24
## ..
Of course, as mentioned earlier, if you ask too many NAs the actual percentage will drop because of repetitions.
A mutate_all approach:
df %>%
dplyr::mutate_all(~ifelse(sample(c(TRUE, FALSE), size = length(.), replace = TRUE, prob = c(0.8, 0.2)),
as.character(.), NA))