I have a problem with conditional replacement. Let's assume I have the following code for a dataframe
a=c("0","1","0","B","NA","NA","NA","NA","NA")
b=c(0,1,0,0,1,0,1,0,1)
c=c(0,0,0,0,1,0,0,1,1)
d=c("0","1","0","0","1","0","B","NA","NA")
dat=data.frame(rbind(a,b,c,d))
names(dat)=c("P1","P2","P3","P4","C1","C2","C3","C4","C5")
Now I want to replace the row values of P1:P4 with NA if one of these values is B and I also want to replace the row values of C1:C5 with NA if one of these values is B. So I want the Dataframe to look like this:
a=c(**"NA","NA","NA","NA"**,"NA","NA","NA","NA","NA")
b=c(0,1,0,0,1,0,1,0,1)
c=c(0,0,0,0,1,0,0,1,1)
d=c("0","1","0","0",**"NA","NA","NA"**,"NA","NA")
dat=data.frame(rbind(a,b,c,d))
names(dat)=c("P1","P2","P3","P4","C1","C2","C3","C4","C5")
I hope the problem is understandable and I would appreciate any help.
Considering dat to be the original provided dataframe, I'm providing a comparatively lengthy code for better understanding. Hope it helps.
dat2 <- data.frame()
for(i in 1:nrow(dat)){
datSubset <- with(dat, dat[i,])
col.num.of.B <- which(datSubset == "B", arr.ind = T)[2]
if(is.na(col.num.of.B)){
datSubset <- datSubset
} else if(col.num.of.B < 5) {
datSubset[,c(1:4)] <- NA
} else {
datSubset[,c(5:9)] <- NA
}
dat2 <- rbind(dat2, datSubset)
}
dat2
# P1 P2 P3 P4 C1 C2 C3 C4 C5
# a <NA> <NA> <NA> <NA> NA NA NA NA NA
# b 0 1 0 0 1 0 1 0 1
# c 0 0 0 0 1 0 0 1 1
# d 0 1 0 0 <NA> <NA> <NA> <NA> <NA>
As I understood it... If the value B is found in columns P1 to P4, then set all the values within P1 to P4 to NA.
You can try:
nm <- c("P1", "P2", "P3", "P4")
cols <- which(names(dat) %in% nm)
dat[,cols][any(dat[,cols] == "B")] <- NA
dat
# P1 P2 P3 P4 C1 C2 C3 C4 C5
# a NA NA NA NA NA NA NA NA NA
# b NA NA NA NA 1 0 1 0 1
# c NA NA NA NA 1 0 0 1 1
# d NA NA NA NA 1 0 B NA NA
If you want to apply this to only the first row, then use dat[1,cols][any(dat[,cols] == "B")] <- NA.
Related
I have to find all columns with all NA-values. If there are not all NA-values in column, I have to replace NAs with 0.
My solution is:
NA_check <- colSums(is.na(frame)) == nrow(frame) #True or False - all NA or not
frame[is.na(frame) & which(names(frame) %in% names(NA_check)[which(NA_check == FALSE, arr.ind=T)])] <- 0
These conditions work separately, but they don't work together or I get some errors combining them. How can I solve my problem?
P.S. This modification also doesn't work if NA_checkis not all FALSE:
frame[is.na(frame[which(names(frame) %in% names(NA_check)[which(NA_check == FALSE, arr.ind=T)])])] <- 0
You can find out columns which has atleast one non-NA value (not all values are NA) and replace NA in that subset to 0.
not_all_NA <- colSums(!is.na(frame)) > 0
frame[not_all_NA][is.na(frame[not_all_NA])] <- 0
We can check this with an example :
frame <- data.frame(a = c(NA, NA, 3, 4), b = NA, c = c(NA, 1:3), d = NA)
frame
# a b c d
#1 NA NA NA NA
#2 NA NA 1 NA
#3 3 NA 2 NA
#4 4 NA 3 NA
not_all_NA <- colSums(!is.na(frame)) > 0
frame[not_all_NA][is.na(frame[not_all_NA])] <- 0
frame
# a b c d
#1 0 NA 0 NA
#2 0 NA 1 NA
#3 3 NA 2 NA
#4 4 NA 3 NA
We can also do this with dplyr :
library(dplyr)
frame %>% mutate(across(where(~any(!is.na(.))), tidyr::replace_na, 0))
I am trying to write a loop in R to perform some iteration on two datasets called datasetA and datasetB.
datasetA has 600 entries and datasetB has 200’000 entries.
For each entry in datasetA, I want to perform the following:
If the value of V2 in both datasets are equal,
then calculate the ppm:
(datasetA$V3 - datasetB$V3) / datasetA$V3 * 1000000
If the ppm < |10|, then paste the ppm value in V4 column in datasetB, paste the relevant name of datasetA$V1 in column V1 of datasetB.
Say this is datasetA with 600 entries:
datasetA<- read.table(text='Alex 1 50.00042
John 1 60.000423
Janine 3 88.000123
Aline 3 117
Mark 2 79.9999')
DatasetA
and this is an example of datasetB with 200000 entries:
datasetB<- read.table(text='NA 1 50.0001 NA
NA 1 50.00032 NA
NA 2 70 NA
NA 2 80 NA
NA 3 88.0004 NA
NA 3 100 NA
NA 3 101 NA
NA 2 102 NA')
DatasetB
The final table should look like this:
datasetC <- read.table(text='Alex 1 50.0001 6.459945
Alex 1 50.00032 2.059983
NA 2 70 NA
Mark 2 80 -1.25
Janine 3 88.0004 -3.14772
NA 3 100 NA
NA 3 101 NA
NA 2 102 NA')
The final table should look like this
data<-datasetB
for(i in 1:5){
for(j in 1:8){
if (datasetA$V2[i]==datasetB$V2[j] & abs((datasetA$V3[i]-datasetB$V3[j])/datasetA$V3[i]*10**6)<10){
data[j,1]=datasetA[i,1]
data[j,4]=(datasetA$V3[i]-datasetB$V3[j])/datasetA$V3[i]*10**6
}}}
data
Try this: I am a R noob but let me know if this works for you.
library(data.table)
datasetA<- read.table(text='Alex 1 50.00042
John 1 60.000423
Janine 3 88.000123
Aline 3 117
Mark 2 79.9999')
datasetB<- read.table(text='NA 1 50.0001 NA
NA 1 50.00032 NA
NA 2 70 NA
NA 2 80 NA
NA 3 88.0004 NA
NA 3 100 NA
NA 3 101 NA
NA 2 102 NA')
# I renamed columns for my own reference, V1,V2,.. were a bit confusing
names(datasetA) <- c("Name", "ID", "ValueA")
names(datasetB) <- c("V1", "ID", "ValueB", "V4")
# Create a key for each row in datasetB
datasetB$key <- seq(nrow(datasetB))
# Left join A to B on column ID, but first set them as data table
datasetB <- as.data.table(datasetB)
datasetA <- as.data.table(datasetA)
# Using base join but you can also use data table left join see below
datasetC <- merge(x = datasetB, y = datasetA, by = c("ID"), all.x = TRUE)
# Create PPM column
datasetC[, c("ppm") := 1000000*(ValueA - ValueB)/ValueA, ]
# Filter on PPM and keep columns we need
datasetC <- datasetC[abs(ppm) < 10, list(key,Name,ppm)]
# Left join to datasetB on key
setkey(datasetC, key)
setkey(datasetB, key)
datasetB <- datasetC[datasetB]
# Keep columns we need and rename to V1,... as requested
datasetB <- datasetB[, list(V1 = Name, V2 = ID, V3 = ValueB, V4 = ppm)]
The following answer seems to do what the question asks for but I am failing to get 2 of the computed values, final column V4.
AV2 <- sort(unique(datasetA$V2))
res <- lapply(AV2, function(v2){
inx_a <- datasetA[['V2']] == v2
inx_b <- datasetB[['V2']] == v2
mrg <- merge(datasetA[inx_a, ], datasetB[inx_b, ], by = 'V2')
ppm <- ((mrg$V3.x - mrg$V3.y)/mrg$V3.x)*1000000
cbind(mrg[abs(ppm) < 10, c(2, 1, 5)], ppm = ppm[abs(ppm) < 10])
})
res <- do.call(rbind, res)
names(res) <- paste0('V', 1:4)
row.names(res) <- NULL
final <- merge(res, datasetB, by = c('V2', 'V3'), all.y = TRUE)[c(3, 1, 2, 4)]
names(final) <- paste0('V', 1:4)
final
# V1 V2 V3 V4
#1 Alex 1 50.00010 6.399946
#2 Alex 1 50.00032 1.999983
#3 <NA> 2 70.00000 NA
#4 Mark 2 80.00000 -1.250002
#5 <NA> 2 102.00000 NA
#6 Janine 3 88.00040 -3.147723
#7 <NA> 3 100.00000 NA
#8 <NA> 3 101.00000 NA
If I understand correctly, the question is asking for a join with a complex condition. This can be implemented using data.table:
library(data.table)
setDT(datasetA)[setDT(datasetB), on = "V2", {
ppm <-(x.V3- i.V3) / i.V3 * 1E6
list(V1 = ifelse(abs(ppm) < 10, x.V1, NA_character_),
V2,
V3 = i.V3,
V4 = ifelse(abs(ppm) < 10, ppm, NA_real_))
}, mult = "first"]
V1 V2 V3 V4
1: Alex 1 50.00010 6.399987
2: Alex 1 50.00032 1.999987
3: <NA> 2 70.00000 NA
4: Mark 2 80.00000 -1.250000
5: Janine 3 88.00040 -3.147713
6: <NA> 3 100.00000 NA
7: <NA> 3 101.00000 NA
8: <NA> 2 102.00000 NA
Here is an alternative approach which updates datasetB in place by an update join:
library(data.table)
tmp <- setDT(datasetA)[setDT(datasetB), on = "V2"][
, V4 := (V3- i.V3) / i.V3 * 1E6][abs(V4) < 10][, i.V1 := NULL]
datasetB[, `:=`(V1 = as.character(V1), V4 = as.double(V4))]
datasetB[tmp, on = .(V2, V3 = i.V3), `:=`(V1 = i.V1, V4 = i.V4)][]
V1 V2 V3 V4
1: Alex 1 50.00010 6.399987
2: Alex 1 50.00032 1.999987
3: <NA> 2 70.00000 NA
4: Mark 2 80.00000 -1.250000
5: Janine 3 88.00040 -3.147713
6: <NA> 3 100.00000 NA
7: <NA> 3 101.00000 NA
8: <NA> 2 102.00000 NA
I have many samples, each one of which has a corresponding abundance matrix. From these abundance matrices, I would like to create a large matrix that contains abundance information for each sample in rows.
For example, a single abundance matrix would look like:
A B C D
sample1 1 3 4 2
where A, B, C, and D represent colnames, and the abundances are the row values.
I would like to populate my larger matrix, which has as colnames all possible letters (A:Z) and all possible samples (sample1:sampleN) as rows, by matching the colname values.
For ex. :
A B C D E F G .... Z
sample1 1 3 4 2 NA NA NA ....
sample2 NA NA 2 5 7 NA NA ....
sample3 4 NA 6 9 2 NA 2 .....
....
sampleN
Different samples have a varying mix of abundances, in no guaranteed order.
When iteratively adding to this larger matrix, how could I ensure that the correct columns are populated by the right abundance values (ex. column "A" is only filled by values corresponding to abundances of "A" in different samples)? Thanks!
Starting data, changing just a little to highlight differences:
m1 <- as.matrix(read.table(header=TRUE, text="
A B C Z
sample1 1 3 4 2"))
m2 <- as.matrix(read.table(header=TRUE, text="
A B C D E F G
sample2 NA NA 2 5 7 NA NA
sample3 4 NA 6 9 2 NA 2"))
First, we need to make sure both matrices have the same column names:
newcols <- setdiff(colnames(m2), colnames(m1))
m1 <- cbind(m1, matrix(NA, nr=nrow(m1), nc=length(newcols), dimnames=list(NULL, newcols)))
newcols <- setdiff(colnames(m1), colnames(m2))
m2 <- cbind(m2, matrix(NA, nr=nrow(m2), nc=length(newcols), dimnames=list(NULL, newcols)))
m1
# A B C Z D E F G
# sample1 1 3 4 2 NA NA NA NA
m2
# A B C D E F G Z
# sample2 NA NA 2 5 7 NA NA NA
# sample3 4 NA 6 9 2 NA 2 NA
And now we combine them; regular cbind needs the column names to be aligned as well:
rbind(m2, m1[,colnames(m2),drop=FALSE])
# A B C D E F G Z
# sample2 NA NA 2 5 7 NA NA NA
# sample3 4 NA 6 9 2 NA 2 NA
# sample1 1 3 4 NA NA NA NA 2
You should be able to take advantage of matrix indexing, like so:
big[cbind(rownames(abun),colnames(abun))] <- abun
Using this example abundance matrix, and a big matrix to fill:
abun <- matrix(c(1,3,4,2),nrow=1,dimnames=list("sample1",LETTERS[1:4]))
big <- matrix(NA,nrow=5,ncol=26,dimnames=list(paste0("sample",1:5),LETTERS))
Another solution using reduce from purrr package and union_all from dplyr package:
library(purrr)
library(dplyr)
sample_names <- c("sample1","sample2","sample3")
Generating 3 random abundance dataframes:
num1 <- round(runif(runif(1,min = 1, max = 10),min = 1, max = 10))
df1 <- data.frame(t(num1))
colnames(df1) <- sample(LETTERS,length(num1))
num2 <- round(runif(runif(1,min = 1, max = 10),min = 1, max = 10))
df2 <- data.frame(t(num2))
colnames(df2) <- sample(LETTERS,length(num2))
num3 <- round(runif(runif(1,min = 1, max = 10),min = 1, max = 10))
df3 <- data.frame(t(num3))
colnames(df3) <- sample(LETTERS,length(num3))
This is actually the code that does all the magic:
A <- reduce(list(df1,df2,df3),union_all)
col_order <- sort(colnames(A),decreasing = FALSE)
A <- A[,col_order]
rownames(A) <- sample_names
Output:
> A
A C E F O P Q U W Y
sample1 9 NA NA NA 9 NA 5 6 NA NA
sample2 NA NA NA NA 5 4 NA NA 5 NA
sample3 NA 6 5 9 NA NA 3 NA 5 7
dat <- data.frame(A=c("name1", "name2", "name3"),
B=c(0,1,0), C=c(0,0,5), D= c(4,4,0), E=c(1,0,0), F=c(4,0,0) )
desiredresult <- data.frame(A=c("name1", "name2", "name3"),
B=c(NA,1,NA), C=c(NA,0,5), D= c(4,4,0), E=c(1,0,NA), F=c(4,NA,NA))
I want to replace 0 values with NA in every row until a positive value is encountered (no negative values in dataset). In addition to that I want to replace all values if their ending are all zeros leaving first 0 in place after last positive value. etc 5,0,0,0 -> 5,0,NA,NA
Provided example data with desiredresult. I was trying approach something like this, but there would need to be 5+ conditions to cover it all. Is there a better way to do this? Maybe with data.table?
dat$B[dat$B == 0 & (dat$C!=0 | dat$D!=0)] <- NA
dat$C[dat$C == 0 & dat$D!=0 & is.na(dat$B)] <- NA
Using the data.table-package, you could approach this as follows:
cols <- names(dat)[2:6]
library(data.table)
setDT(dat)[, (cols) := {x <- unlist(.SD);
x[cumsum(x)==0] <- NA;
l <- c(tail(cumsum(rev(x)),-1),1) == 0;
x[rev(l)] <- NA;
names(x) <- cols;
as.list(x)},
by = A]
you get:
> dat
A B C D E F
1: name1 NA NA 4 1 4
2: name2 1 0 4 0 NA
3: name3 NA 5 0 NA NA
The same kind of thinking, but then with base R:
dl <- as.data.frame(t(dat[,-1]))
idx1 <- cumsum(dl) == 0
idx2 <- sapply(dl, function(x) {
l <- c(tail(cumsum(rev(x)),-1),1) == 0
l[is.na(l)] <- FALSE
rev(l)
})
dl[idx1 | idx2] <- NA
dat[,-1] <- t(dl)
which will get you the same result:
> dat
A B C D E F
1 name1 NA NA 4 1 4
2 name2 1 0 0 4 0
3 name3 NA 5 0 NA NA
New example data:
dat <- data.frame(A=c("name1", "name2", "name3"),
B=c(0,1,0), C=c(0,0,5), D=c(4,0,0), E=c(1,4,0), F=c(4,0,0) )
This should work:
#Apply the first rule: convert 0 to NA until we find a non negative
res1<-t(apply(dat[,-1], 1, function(x) {
xc <- cumsum(x) #cumulative sum
x[xc==0]<-NA #NA where cumulative sum iz 0
x
}))
# Apply the second rule
res2<-t(apply(res1, 1, function(x) {
xc <- cumsum(rev(x)) #reverse the sum
xc<-c(tail(xc,-1),1) # shift the sum
res<-rev(x) #reverse the vector
res[xc==0]<-NA
rev(res)
}))
#Reconstruct the data frame
cbind(data.frame(name=dat[,1]),res2)
# name B C D E F
#1 name1 NA NA 4 1 4
#2 name2 1 0 4 0 NA
#3 name3 NA 5 0 NA NA
I have a table (mydf) as shown below. I would like to use this for loop (my code) in R which works for only one column (for ALT1 column in this instance) to loop over all the columns containing ALT1 through ALTn and store the output in separate variables from final1 through finaln.
The purpose here is to loop over ALT1 through ALTn to match the nucleotide columns (A,C,G,T,N) and get the corresponding values as shown in the result below.Thank you for your help!
mycode
final1 <- {}
i <- 1
result =merge(coverage.bam, rows.concat.alt, by="start")
for(i in 1:nrow(result)){
final1[i] = paste(paste(result$chr[i], result$start[i], result$end[i],sep=":"),"-",
result$REF[i],"(",result[,(as.character(result$REF[i]))][i],")",",", result$ALT1[i],
"(",result[,(as.character(result$ALT1[i]))][i][!is.na(result[,(as.character(result$ALT1[i]))][i])],")",sep="")
}
final1
I have tried to expand this code for ALT through ALTn, but it does not work, could you help me solve this please?
final <- list()
setValue<-function(element){
print(element)
for(i in 1:nrow(result)){
final[[i]] = paste(paste(result$chr[i], result$start[i], result$end[i],sep=":"),"-",
result$REF[i],"(",result[,(as.character(result$REF[i]))][i],")",",", result[,element][i],
"(",result[,(as.character(result[,element][i])))][i][!is.na(result[,(as.character(result[,element][i])][i])],")",sep="")
}
}
for(i in colnames(result)){
if(grepl('ALT', i)){
setValue(i)
}
}
mydf
chr start end A C G T N = - REF ALT ALT1 ALT2 ALT3 ALTn
1 chr10 102022031 102022031 NA 34 NA NA NA NA NA C G G NA NA NA
2 chr10 102220574 102220574 2 22 2 3 NA NA NA C AGT A G T NA
3 chr10 115322228 115322228 NA 25 NA NA NA NA NA C A A NA NA NA
4 chr10 122222925 122222925 30 NA NA NA NA NA NA A C C NA NA NA
5 chr10 121111042 121111042 NA 48 NA NA NA NA NA C T T NA NA NA
6 chr10 124444484 124444484 NA 60 NA NA NA NA NA C T T NA NA NA
Result
"chr10:102022031:102022031-C(34),G()" "chr10:102220574:102220574-C(22),A(2),G(2),T(3)" "chr10:115322228:115322228-C(25),A()"
[4] "chr10:122222925:122222925-A(30),C()" "chr10:121111042:121111042-C(48),T()" "chr10:124444484:124444484-C(60),T()"
Try
p1 <- do.call(paste,c(mydf[1:3], sep=":"))
p2 <- apply(mydf[c(4:8, 11:16)], 1, function(x) {
Un1 <- unique(match( x[7:11], names(x)[1:4], nomatch=0))
i1 <- match(x[6], names(x))
v1 <- paste0(names(x[i1]),'(', x[i1], ')')
v2 <- as.numeric(x[Un1])
v2[is.na(v2)] <- ''
v3 <-paste(names(x[Un1]), '(', v2, ')', sep='', collapse=",")
paste(v1, v3, sep=",") })
paste(p1, p2, sep="-")
#[1] "chr10:102022031:102022031-C(34),G()"
#[2] "chr10:102220574:102220574-C(22),A(2),G(2),T(3)"
#[3] "chr10:115322228:115322228-C(25),A()"
#[4] "chr10:122222925:122222925-A(30),C()"
#[5] "chr10:121111042:121111042-C(48),T()"
#[6] "chr10:124444484:124444484-C(60),T()"