I have a complete dataframe. I want to 20% of the values in the dataframe to be replaced by NAs to simulate random missing data.
A <- c(1:10)
B <- c(11:20)
C <- c(21:30)
df<- data.frame(A,B,C)
Can anyone suggest a quick way of doing that?
df <- data.frame(A = 1:10, B = 11:20, c = 21:30)
head(df)
## A B c
## 1 1 11 21
## 2 2 12 22
## 3 3 13 23
## 4 4 14 24
## 5 5 15 25
## 6 6 16 26
as.data.frame(lapply(df, function(cc) cc[ sample(c(TRUE, NA), prob = c(0.85, 0.15), size = length(cc), replace = TRUE) ]))
## A B c
## 1 1 11 21
## 2 2 12 22
## 3 3 13 23
## 4 4 14 24
## 5 5 NA 25
## 6 6 16 26
## 7 NA 17 27
## 8 8 18 28
## 9 9 19 29
## 10 10 20 30
It's a random process, so it might not give 15% every time.
You can unlist the data.frame and then take a random sample, then put back in a data.frame.
df <- unlist(df)
n <- length(df) * 0.15
df[sample(df, n)] <- NA
as.data.frame(matrix(df, ncol=3))
It can be done a bunch of different ways using sample().
If you are in the mood to use purrr instead of lapply, you can also do it like this:
> library(purrr)
> df <- data.frame(A = 1:10, B = 11:20, C = 21:30)
> df
A B C
1 1 11 21
2 2 12 22
3 3 13 23
4 4 14 24
5 5 15 25
6 6 16 26
7 7 17 27
8 8 18 28
9 9 19 29
10 10 20 30
> map_df(df, function(x) {x[sample(c(TRUE, NA), prob = c(0.8, 0.2), size = length(x), replace = TRUE)]})
# A tibble: 10 x 3
A B C
<int> <int> <int>
1 1 11 21
2 2 12 22
3 NA 13 NA
4 4 14 NA
5 5 15 25
6 6 16 26
7 7 17 27
8 8 NA 28
9 9 19 29
10 10 20 30
Same result, using binomial distribution:
dd=dim(df)
nna=20/100 #overall
df1<-df
df1[matrix(rbinom(prod(dd), size=1,prob=nna)==1,nrow=dd[1])]<-NA
df1
May i suggest a first function (ggNAadd) designed to do this, and improve it with a second function providing graphical distribution of the NAs created (ggNA)
What is neat is the possibility to input either a proportion of a fixed number of NAs.
ggNAadd = function(data, amount, plot=F){
temp <- data
amount2 <- ifelse(amount<1, round(prod(dim(data))*amount), amount)
if (amount2 >= prod(dim(data))) stop("exceeded data size")
for (i in 1:amount2) temp[sample.int(nrow(temp), 1), sample.int(ncol(temp), 1)] <- NA
if (plot) print(ggNA(temp))
return(temp)
}
And the plotting function:
ggNA = function(data, alpha=0.5){
require(ggplot2)
DF <- data
if (!is.matrix(data)) DF <- as.matrix(DF)
to.plot <- cbind.data.frame('y'=rep(1:nrow(DF), each=ncol(DF)),
'x'=as.logical(t(is.na(DF)))*rep(1:ncol(DF), nrow(DF)))
size <- 20 / log( prod(dim(DF)) ) # size of point depend on size of table
g <- ggplot(data=to.plot) + aes(x,y) +
geom_point(size=size, color="red", alpha=alpha) +
scale_y_reverse() + xlim(1,ncol(DF)) +
ggtitle("location of NAs in the data frame") +
xlab("columns") + ylab("lines")
pc <- round(sum(is.na(DF))/prod(dim(DF))*100, 2) # % NA
print(paste("percentage of NA data: ", pc))
return(g)
}
Which gives (using ggplot2 as graphical output):
ggNAadd(df, amount=0.20, plot=TRUE)
## [1] "percentage of NA data: 20"
## A B c
## 1 1 11 21
## 2 2 12 22
## 3 3 13 23
## 4 4 NA 24
## ..
Of course, as mentioned earlier, if you ask too many NAs the actual percentage will drop because of repetitions.
A mutate_all approach:
df %>%
dplyr::mutate_all(~ifelse(sample(c(TRUE, FALSE), size = length(.), replace = TRUE, prob = c(0.8, 0.2)),
as.character(.), NA))
Related
I generated a data frame (df) in R (see below). If I use the column "x2" instead of "x2a" to make the data frame everything works well. However, as soon as I use "x2a" instead of "x2" I get an error because the input of "x2a" is of various lengths. Do you have an idea how I can change the code that it is going to work with column "x2a"?
Error message with "x2a":
Error in data.frame(Id = rep(df$Id), Noise = unlist(split_it), Start = rep(df$Start), :
arguments imply differing number of rows: 3, 16
Code to reproduce the data frame and error
x1 <- c("A", "B", "C")
x2 <- c("[1,3,5,6,7]","[5,7,8,9,10]","[3,4,5,8,9]")
x2a <- c("[1,3,5]","[5,7,8,9,10, 20, 30, 24]","[3,4,5,8,9]")
x3 <- c(8000, 74555, 623334)
x4 <- c(9000, 76000, 623500)
df <- data.frame(cbind(x1, x2a, x3, x4))
colnames(df) <- c("Id", "Noise", "Start", "End")
df$Start <- as.numeric(as.character(df$Start))
df$End <- as.numeric(as.character(df$End))
# remove square brackets
df$Noise <- gsub("\\[|\\]", "", df$Noise)
# split
split_it <- strsplit(df$Noise, split = ",")
df_2 <- data.frame(Id = rep(df$Id), Noise = unlist(split_it), Start = rep(df$Start), End = rep(df$End))
df_2 <- df_2[order(df_2$Id),]
rownames(df_2) <- NULL
base R
What I'm inferring you want is not something R can "intuit" for you: you want it to repeat the values in Id based on the number of elements found when strsplit did its work. (How should R know to look in one object and arbitrarily repeat another?)
Try using rep(., times=.) to specify how many times each element of Id (etc) should be repeated in order to stay "in step" with Noise.
# split
split_it <- strsplit(df$Noise, split = ",")
n <- lengths(split_it)
print(n)
# [1] 3 8 5
df_2 <- data.frame(Id = rep(df$Id, times=n),
Noise = unlist(split_it),
Start = rep(df$Start, times=n),
End = rep(df$End, times=n))
df_2 <- df_2[order(df_2$Id),]
rownames(df_2) <- NULL
df_2
# Id Noise Start End
# 1 A 1 8000 9000
# 2 A 3 8000 9000
# 3 A 5 8000 9000
# 4 B 5 74555 76000
# 5 B 7 74555 76000
# 6 B 8 74555 76000
# 7 B 9 74555 76000
# 8 B 10 74555 76000
# 9 B 20 74555 76000
# 10 B 30 74555 76000
# 11 B 24 74555 76000
# 12 C 3 623334 623500
# 13 C 4 623334 623500
# 14 C 5 623334 623500
# 15 C 8 623334 623500
# 16 C 9 623334 623500
dplyr
library(dplyr)
df %>%
mutate(Noise = strsplit(Noise, split = ",")) %>%
unnest(Noise) %>%
mutate(Noise = as.integer(Noise)) # I'm inferring this is desired, not required
# # A tibble: 16 x 4
# Id Noise Start End
# <chr> <int> <dbl> <dbl>
# 1 A 1 8000 9000
# 2 A 3 8000 9000
# 3 A 5 8000 9000
# 4 B 5 74555 76000
# 5 B 7 74555 76000
# 6 B 8 74555 76000
# 7 B 9 74555 76000
# 8 B 10 74555 76000
# 9 B 20 74555 76000
# 10 B 30 74555 76000
# 11 B 24 74555 76000
# 12 C 3 623334 623500
# 13 C 4 623334 623500
# 14 C 5 623334 623500
# 15 C 8 623334 623500
# 16 C 9 623334 623500
I have two dataframes A and B, that share have the same column names and the same first column (Location)
A <- data.frame("Location" = 1:3, "X" = c(21,15, 7), "Y" = c(41,5, 5), "Z" = c(12,103, 88))
B <- data.frame("Location" = 1:3, "X" = c(NA,NA, 14), "Y" = c(50,8, NA), "Z" = c(NA,14, 12))
How do i replace the values in dataframe A with the values from B if the value in B is not NA?
Thanks.
We can use coalesce
library(dplyr)
A %>%
mutate(across(-Location, ~ coalesce(B[[cur_column()]], .)))
-output
# Location X Y Z
#1 1 21 50 12
#2 2 15 8 14
#3 3 14 5 12
Here's an answer in base R:
i <- which(!is.na(B),arr.ind = T)
A[i] <- B[i]
A
Location X Y Z
1 1 21 50 12
2 2 15 8 14
3 3 14 5 12
One option with fcoalesce from data.table pakcage
list2DF(Map(data.table::fcoalesce,B,A))
gives
Location X Y Z
1 1 21 50 12
2 2 15 8 14
3 3 14 5 12
I have a complete dataframe. I want to 20% of the values in the dataframe to be replaced by NAs to simulate random missing data.
A <- c(1:10)
B <- c(11:20)
C <- c(21:30)
df<- data.frame(A,B,C)
Can anyone suggest a quick way of doing that?
df <- data.frame(A = 1:10, B = 11:20, c = 21:30)
head(df)
## A B c
## 1 1 11 21
## 2 2 12 22
## 3 3 13 23
## 4 4 14 24
## 5 5 15 25
## 6 6 16 26
as.data.frame(lapply(df, function(cc) cc[ sample(c(TRUE, NA), prob = c(0.85, 0.15), size = length(cc), replace = TRUE) ]))
## A B c
## 1 1 11 21
## 2 2 12 22
## 3 3 13 23
## 4 4 14 24
## 5 5 NA 25
## 6 6 16 26
## 7 NA 17 27
## 8 8 18 28
## 9 9 19 29
## 10 10 20 30
It's a random process, so it might not give 15% every time.
You can unlist the data.frame and then take a random sample, then put back in a data.frame.
df <- unlist(df)
n <- length(df) * 0.15
df[sample(df, n)] <- NA
as.data.frame(matrix(df, ncol=3))
It can be done a bunch of different ways using sample().
If you are in the mood to use purrr instead of lapply, you can also do it like this:
> library(purrr)
> df <- data.frame(A = 1:10, B = 11:20, C = 21:30)
> df
A B C
1 1 11 21
2 2 12 22
3 3 13 23
4 4 14 24
5 5 15 25
6 6 16 26
7 7 17 27
8 8 18 28
9 9 19 29
10 10 20 30
> map_df(df, function(x) {x[sample(c(TRUE, NA), prob = c(0.8, 0.2), size = length(x), replace = TRUE)]})
# A tibble: 10 x 3
A B C
<int> <int> <int>
1 1 11 21
2 2 12 22
3 NA 13 NA
4 4 14 NA
5 5 15 25
6 6 16 26
7 7 17 27
8 8 NA 28
9 9 19 29
10 10 20 30
Same result, using binomial distribution:
dd=dim(df)
nna=20/100 #overall
df1<-df
df1[matrix(rbinom(prod(dd), size=1,prob=nna)==1,nrow=dd[1])]<-NA
df1
May i suggest a first function (ggNAadd) designed to do this, and improve it with a second function providing graphical distribution of the NAs created (ggNA)
What is neat is the possibility to input either a proportion of a fixed number of NAs.
ggNAadd = function(data, amount, plot=F){
temp <- data
amount2 <- ifelse(amount<1, round(prod(dim(data))*amount), amount)
if (amount2 >= prod(dim(data))) stop("exceeded data size")
for (i in 1:amount2) temp[sample.int(nrow(temp), 1), sample.int(ncol(temp), 1)] <- NA
if (plot) print(ggNA(temp))
return(temp)
}
And the plotting function:
ggNA = function(data, alpha=0.5){
require(ggplot2)
DF <- data
if (!is.matrix(data)) DF <- as.matrix(DF)
to.plot <- cbind.data.frame('y'=rep(1:nrow(DF), each=ncol(DF)),
'x'=as.logical(t(is.na(DF)))*rep(1:ncol(DF), nrow(DF)))
size <- 20 / log( prod(dim(DF)) ) # size of point depend on size of table
g <- ggplot(data=to.plot) + aes(x,y) +
geom_point(size=size, color="red", alpha=alpha) +
scale_y_reverse() + xlim(1,ncol(DF)) +
ggtitle("location of NAs in the data frame") +
xlab("columns") + ylab("lines")
pc <- round(sum(is.na(DF))/prod(dim(DF))*100, 2) # % NA
print(paste("percentage of NA data: ", pc))
return(g)
}
Which gives (using ggplot2 as graphical output):
ggNAadd(df, amount=0.20, plot=TRUE)
## [1] "percentage of NA data: 20"
## A B c
## 1 1 11 21
## 2 2 12 22
## 3 3 13 23
## 4 4 NA 24
## ..
Of course, as mentioned earlier, if you ask too many NAs the actual percentage will drop because of repetitions.
A mutate_all approach:
df %>%
dplyr::mutate_all(~ifelse(sample(c(TRUE, FALSE), size = length(.), replace = TRUE, prob = c(0.8, 0.2)),
as.character(.), NA))
Suppose I have a data frame that looks like this.
# start end motif
# 2 6 a
# 10 15 b
# 30 35 c
How would I create a data frame that fills in the remaining start and end locations like so up to a certain number Max_end:
Max_end <- 33
# start end motif
# 0 2 na # <- 0-2 are filled in because it is not in the original data frame
# 2 6 a # <- 2-6 are in the original
# 6 10 na # <- 6-10 is not
# 10 15 b # <- 10-15 is
# 15 30 na # and so on
# 30 33 c
And further, calculates the distance between the start and end locations and creates a one column data frame.
# Length motif
# 2 na
# 4 a
# 4 na
# 5 b
# 15 na
# 3 c
Currently this is how i am doing it: It is very inefficient
library(data.table)
library(stringi)
f <- fread('ABC.txt',header=F,skip=1)$V1
f <- paste(f, collapse = "")
motifs = c('GATC', 'CTGCAG', 'ACCACC', 'CC(A|T)GG', 'CCAC.{8}TGA(C|T)')
v <- na.omit(data.frame(do.call(rbind, lapply(stri_locate_all_regex(f, motifs), unlist))))
v <- v[order(v[,1]),]
v2difference <- "blah"
for(i in 2:nrow(v)){
if(v[i,1] > v[i-1,2]+2){v2difference[i] <- v[i,1]-v[i-1,2]-2}
}
v2difference[1] <- v[1,1]
v2 <- data.frame(Order=seq(1, 2*nrow(v), 2),Lengths=matrix(v2difference, ncol = 1),Motifs="na")
v1 <- data.frame(Order=seq(2, 2*nrow(v), 2),Lengths=(v$end-v$start+1),Motifs=na.omit(unlist(stri_extract_all_regex(f,motifs))))
V <- data.frame(Track=1,rbind(v1,v2))
V <- V[order(V$Order),]
B <- V[,!(names(V) %in% "Order")]
Max_end <- 33
breaks <- c(0, t(as.matrix(dat[,1:2])), Max_end) # get endpoints
breaks <- breaks[breaks <= Max_end]
merge(dat, data.frame(start=breaks[-length(breaks)], end=breaks[-1]), all=T)
# start end motif
# 1 0 2 <NA>
# 2 2 6 a
# 3 6 10 <NA>
# 4 10 15 b
# 5 15 30 <NA>
# 6 30 33 <NA>
# 7 30 35 c
To specify a start and endpoint, you could do
Max_end <- 33
Max_start <- 10
breaks <- unique(c(Max_start, t(as.matrix(dat[,1:2])), Max_end))
breaks <- breaks[breaks <= Max_end & breaks >= Max_start]
merge(dat, data.frame(start=breaks[-length(breaks)], end=breaks[-1]), all.y=T)
# start end motif
# 1 10 15 b
# 2 15 30 <NA>
# 3 30 33 <NA>
Note: this doesn't include "c" in the shortened final interval, you would need to decide if that values gets included or not when the interval changes.
I have a complete dataframe. I want to 20% of the values in the dataframe to be replaced by NAs to simulate random missing data.
A <- c(1:10)
B <- c(11:20)
C <- c(21:30)
df<- data.frame(A,B,C)
Can anyone suggest a quick way of doing that?
df <- data.frame(A = 1:10, B = 11:20, c = 21:30)
head(df)
## A B c
## 1 1 11 21
## 2 2 12 22
## 3 3 13 23
## 4 4 14 24
## 5 5 15 25
## 6 6 16 26
as.data.frame(lapply(df, function(cc) cc[ sample(c(TRUE, NA), prob = c(0.85, 0.15), size = length(cc), replace = TRUE) ]))
## A B c
## 1 1 11 21
## 2 2 12 22
## 3 3 13 23
## 4 4 14 24
## 5 5 NA 25
## 6 6 16 26
## 7 NA 17 27
## 8 8 18 28
## 9 9 19 29
## 10 10 20 30
It's a random process, so it might not give 15% every time.
You can unlist the data.frame and then take a random sample, then put back in a data.frame.
df <- unlist(df)
n <- length(df) * 0.15
df[sample(df, n)] <- NA
as.data.frame(matrix(df, ncol=3))
It can be done a bunch of different ways using sample().
If you are in the mood to use purrr instead of lapply, you can also do it like this:
> library(purrr)
> df <- data.frame(A = 1:10, B = 11:20, C = 21:30)
> df
A B C
1 1 11 21
2 2 12 22
3 3 13 23
4 4 14 24
5 5 15 25
6 6 16 26
7 7 17 27
8 8 18 28
9 9 19 29
10 10 20 30
> map_df(df, function(x) {x[sample(c(TRUE, NA), prob = c(0.8, 0.2), size = length(x), replace = TRUE)]})
# A tibble: 10 x 3
A B C
<int> <int> <int>
1 1 11 21
2 2 12 22
3 NA 13 NA
4 4 14 NA
5 5 15 25
6 6 16 26
7 7 17 27
8 8 NA 28
9 9 19 29
10 10 20 30
Same result, using binomial distribution:
dd=dim(df)
nna=20/100 #overall
df1<-df
df1[matrix(rbinom(prod(dd), size=1,prob=nna)==1,nrow=dd[1])]<-NA
df1
May i suggest a first function (ggNAadd) designed to do this, and improve it with a second function providing graphical distribution of the NAs created (ggNA)
What is neat is the possibility to input either a proportion of a fixed number of NAs.
ggNAadd = function(data, amount, plot=F){
temp <- data
amount2 <- ifelse(amount<1, round(prod(dim(data))*amount), amount)
if (amount2 >= prod(dim(data))) stop("exceeded data size")
for (i in 1:amount2) temp[sample.int(nrow(temp), 1), sample.int(ncol(temp), 1)] <- NA
if (plot) print(ggNA(temp))
return(temp)
}
And the plotting function:
ggNA = function(data, alpha=0.5){
require(ggplot2)
DF <- data
if (!is.matrix(data)) DF <- as.matrix(DF)
to.plot <- cbind.data.frame('y'=rep(1:nrow(DF), each=ncol(DF)),
'x'=as.logical(t(is.na(DF)))*rep(1:ncol(DF), nrow(DF)))
size <- 20 / log( prod(dim(DF)) ) # size of point depend on size of table
g <- ggplot(data=to.plot) + aes(x,y) +
geom_point(size=size, color="red", alpha=alpha) +
scale_y_reverse() + xlim(1,ncol(DF)) +
ggtitle("location of NAs in the data frame") +
xlab("columns") + ylab("lines")
pc <- round(sum(is.na(DF))/prod(dim(DF))*100, 2) # % NA
print(paste("percentage of NA data: ", pc))
return(g)
}
Which gives (using ggplot2 as graphical output):
ggNAadd(df, amount=0.20, plot=TRUE)
## [1] "percentage of NA data: 20"
## A B c
## 1 1 11 21
## 2 2 12 22
## 3 3 13 23
## 4 4 NA 24
## ..
Of course, as mentioned earlier, if you ask too many NAs the actual percentage will drop because of repetitions.
A mutate_all approach:
df %>%
dplyr::mutate_all(~ifelse(sample(c(TRUE, FALSE), size = length(.), replace = TRUE, prob = c(0.8, 0.2)),
as.character(.), NA))