How to populate a data frame based on condition in R - r

I created an empty data frame, something like this:
id Alyr Crub Lala Brap Bole Spar Esal Aara Thas
1 XLOC_003940_TBH_1 NA NA NA NA NA NA NA NA NA
I want to check whether the id and a column name match and, if they do, replace the "NA" in that cell with a certain value. Here is an example:
# Split an identifier into its three underscore-separated parts; each capture
# group of the shared pattern is returned by one sub() call.
ex1 <- "Alyr_XLOC_003940_TBH_1_Ortholog_Known_Gene_Sense"
id_pattern <- "([A-Za-z]+)_(XLOC_\\d+_TBH_1)_([A-Za-z_]+)"
sp <- sub(id_pattern, "\\1", ex1)    # leading alphabetic prefix
gene <- sub(id_pattern, "\\2", ex1)  # XLOC_..._TBH_1 identifier
fun <- sub(id_pattern, "\\3", ex1)   # trailing annotation text
Based on the above example, I want to get something like this:
id Alyr Crub Lala Brap Bole Spar Esal Aara Thas
1 XLOC_003940_TBH_1 Ortholog_Known_Gene_Sense NF NF NF NF NF NF NF NF
I am stuck here and can't figure out how to do this.

Use matrix subsetting:
# Store the gene identifiers in the id column.
df1$id <- gene
# cbind() builds a two-column (row, column) index matrix: row i is paired with
# the position of the column named sp[i], and that cell receives fun[i].
# seq_len() stays empty for a zero-row data frame, whereas 1:nrow(df1) would
# produce c(1, 0).
df1[cbind(seq_len(nrow(df1)), match(sp, names(df1)))] <- fun
Check this answer for more on subsetting a data frame by a two-column matrix.
##Example
# Build an all-NA data frame with three rows and the ten named columns.
nms <- scan(what="character", text="id Alyr Crub Lala Brap Bole Spar Esal Aara Thas")
df1 <- as.data.frame(matrix(NA, 3, 10))
names(df1) <- nms
df1
# id Alyr Crub Lala Brap Bole Spar Esal Aara Thas
#1 NA NA NA NA NA NA NA NA NA NA
#2 NA NA NA NA NA NA NA NA NA NA
#3 NA NA NA NA NA NA NA NA NA NA
ex1 <- c("Alyr_XLOC_003940_TBH_1_Ortholog_Gene",
         "Lala_XLOC_1234_TBH_1_Lalala_Gene",
         "Thas_XLOC_5678_TBH_1_Thasthas_Gene")
# One shared pattern; each capture group is extracted by its own sub() call.
id_pattern <- "([A-Za-z]+)_(XLOC_\\d+_TBH_1)_([A-Za-z_]+)"
sp <- sub(id_pattern, "\\1", ex1)
gene <- sub(id_pattern, "\\2", ex1)
fun <- sub(id_pattern, "\\3", ex1)
df1$id <- gene
# Two-column (row, column) index matrix: row i gets fun[i] in the column whose
# name equals sp[i]. seq_len() is used instead of 1:nrow(df1) so that a
# zero-row data frame yields an empty index rather than c(1, 0).
df1[cbind(seq_len(nrow(df1)), match(sp, names(df1)))] <- fun
df1
# id Alyr Crub Lala Brap Bole Spar Esal Aara Thas
# 1 XLOC_003940_TBH_1 Ortholog_Gene NA <NA> NA NA NA NA NA <NA>
# 2 XLOC_1234_TBH_1 <NA> NA Lalala_Gene NA NA NA NA NA <NA>
# 3 XLOC_5678_TBH_1 <NA> NA <NA> NA NA NA NA NA Thasthas_Gene

Related

conditional sequence across rows in a new column

If there's a "6" in df$a, I'd like 1:9 from the previous September to next May, to be in a new column, shown here as df$b, with NA as the rest.
library(tidyverse)
library(lubridate)
# Month-end dates as m/d/Y text.
date <- c("2/29/1940","3/31/1940","4/30/1940","5/31/1940","6/30/1940","7/31/1940","8/31/1940","9/30/1940","10/31/1940","11/30/1940","12/31/1940","1/31/1941","2/28/1941",
"3/31/1941","4/30/1941","5/31/1941","6/30/1941","7/31/1941","8/31/1941","9/30/1941","10/31/1941","11/30/1941", "12/31/1941","1/31/1942","2/28/1942","3/31/1942",
"4/30/1942","5/31/1942", "6/30/1942","7/31/1942","8/31/1942","9/30/1942","10/31/1942","11/30/1942","12/31/1942","1/31/1943","2/28/1943","3/31/1943","4/30/1943",
"5/31/1943","6/30/1943","7/31/1943", "8/31/1943","9/30/1943")
# NOTE: "NA" is quoted here, so `a` is a character vector containing the text
# "NA" (and "6"), not real missing values.
a <- c("NA","NA","NA","NA","NA","NA","NA","NA","NA","NA","NA","NA",6,"NA","NA","NA","NA","NA","NA","NA","NA","NA","NA","NA","NA","NA","NA","NA","NA","NA","NA","NA","NA",
"NA","NA","NA","NA","NA","NA","NA","NA","NA","NA","NA")
df <- data.frame(date, a)
# `%<>%` is magrittr's assignment pipe and is not attached by
# library(tidyverse) or library(lubridate), so the piped result is assigned
# back explicitly. The bare `a` argument of the original mutate() call only
# re-created column `a` unchanged and is dropped.
df <- df %>% mutate(date = mdy(date))
df:
date a b
2/29/1940 NA NA
3/31/1940 NA NA
4/30/1940 NA NA
5/31/1940 NA NA
6/30/1940 NA NA
7/31/1940 NA NA
8/31/1940 NA NA
9/30/1940 NA 1
10/31/1940 NA 2
11/30/1940 NA 3
12/31/1940 NA 4
1/31/1941 NA 5
2/28/1941 6 6
3/31/1941 NA 7
4/30/1941 NA 8
5/31/1941 NA 9
6/30/1941 NA NA
7/31/1941 NA NA
8/31/1941 NA NA
9/30/1941 NA NA
10/31/1941 NA NA
11/30/1941 NA NA
12/31/1941 NA NA
1/31/1942 NA NA
2/28/1942 NA NA
3/31/1942 NA NA
4/30/1942 NA NA
5/31/1942 NA NA
6/30/1942 NA NA
7/31/1942 NA NA
8/31/1942 NA NA
9/30/1942 NA NA
10/31/1942 NA NA
11/30/1942 NA NA
12/31/1942 NA NA
1/31/1943 NA NA
2/28/1943 NA NA
3/31/1943 NA NA
4/30/1943 NA NA
5/31/1943 NA NA
6/30/1943 NA NA
7/31/1943 NA NA
8/31/1943 NA NA
9/30/1943 NA NA
For more context, I have a hundred years or so of monthly data in a data frame and I'm looking for an efficient way to produce the third column given the first two columns, to process/visualize other data not shown. Only sometimes there is a 6 for February in df$a. When so, I'd like the previous September through the next May to be populated as shown in a new column (I'm looking to produce df$b). I tried some clumsy ways, mostly by a bunch of lines with variations of mutate() , lag() , and lead() but have a good feeling there's more direct routes.
thank you,
dave
A solution using case_when, lead, and lag from dplyr. It is not the most concise solution, but it will work when 6 is close to the edge.
library(tidyverse)
# Number the window around each 6 in `a`: the five rows before it get 1..5,
# the row itself gets 6, the three rows after it get 7..9, everything else NA.
# case_when() uses the first condition that is TRUE, so the order matters.
df2 <- mutate(
  df,
  b = case_when(
    lead(a, n = 5L) == 6 ~ 1,
    lead(a, n = 4L) == 6 ~ 2,
    lead(a, n = 3L) == 6 ~ 3,
    lead(a, n = 2L) == 6 ~ 4,
    lead(a, n = 1L) == 6 ~ 5,
    a == 6 ~ 6,
    lag(a, n = 1L) == 6 ~ 7,
    lag(a, n = 2L) == 6 ~ 8,
    lag(a, n = 3L) == 6 ~ 9,
    TRUE ~ NA_real_
  )
)
DATA
Notice that I changed the way you specified NA in the column A.
library(lubridate)
# Month-end dates as m/d/Y text.
date <- c("2/29/1940","3/31/1940","4/30/1940","5/31/1940","6/30/1940","7/31/1940","8/31/1940","9/30/1940","10/31/1940","11/30/1940","12/31/1940","1/31/1941","2/28/1941",
"3/31/1941","4/30/1941","5/31/1941","6/30/1941","7/31/1941","8/31/1941","9/30/1941","10/31/1941","11/30/1941", "12/31/1941","1/31/1942","2/28/1942","3/31/1942",
"4/30/1942","5/31/1942", "6/30/1942","7/31/1942","8/31/1942","9/30/1942","10/31/1942","11/30/1942","12/31/1942","1/31/1943","2/28/1943","3/31/1943","4/30/1943",
"5/31/1943","6/30/1943","7/31/1943", "8/31/1943","9/30/1943")
# Real (unquoted) NA values, so `a` is numeric with a single 6.
a <- c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA , 6, NA , NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA , NA , NA , NA , NA , NA , NA,
NA, NA, NA , NA , NA , NA , NA , NA , NA , NA , NA)
df <- data.frame(date, a)
# Parse the text dates with lubridate::mdy(). The original `df %<>% mutate()`
# line needs magrittr's `%<>%`, which library(lubridate) does not attach, so
# the column is converted with a plain assignment instead.
df$date <- mdy(df$date)

Combining datasets

I have 15 datasets. The 1st column is "subject" and is identical in all sets. The number of the rest of the columns is not the same in all datasets. I need to combine all of this data in a single dataframe. I found the command "Reduce" but I am just starting with R and I couldn't understand if this is what I need and if so, what is the syntax? Thanks!
I suggest including a reproducible example in the future so that others can see the format of data you're working with and what you're trying to do.
Here is some randomly generated example data, each with the "Subject" column:
# Four example data frames that all have a "Subject" column but different
# value columns and only partly overlapping Subject ranges (1:4, 5:8, 7:10,
# 2:5). The values are random draws from rnorm().
list_of_dfs <- list(
df1 = data.frame(Subject = 1:4, a = rnorm(4), b = rnorm(4)),
df2 = data.frame(Subject = 5:8, c = rnorm(4), d = rnorm(4), e = rnorm(4)),
df3 = data.frame(Subject = 7:10, f = rnorm(4)),
df4 = data.frame(Subject = 2:5, g = rnorm(4), h = rnorm(4))
)
Reduce with merge is a good choice:
# Fold the list pairwise with a full outer join on "Subject" (all = TRUE), so
# rows present in only one of the two inputs are kept and padded with NA.
combined_df <- Reduce(
  f = function(joined, nxt) merge(joined, nxt, by = "Subject", all = TRUE),
  x = list_of_dfs
)
And the output:
> combined_df
Subject a b c d e f g h
1 1 1.1106594 1.2530046 NA NA NA NA NA NA
2 2 -1.0275630 0.6437101 NA NA NA NA -1.9393347 -0.4361952
3 3 0.1558639 1.2792212 NA NA NA NA -0.8861966 1.0137530
4 4 0.4283585 -0.1045530 NA NA NA NA 1.8924896 -0.3788198
5 5 NA NA 0.08261190 0.77058804 -1.165042 NA 0.7950784 -1.3467386
6 6 NA NA 2.51214598 0.62024328 1.496520 NA NA NA
7 7 NA NA 0.01581309 -0.04777196 -1.327884 1.5111734 NA NA
8 8 NA NA 0.80448136 -0.33347573 -2.290428 -0.3863564 NA NA
9 9 NA NA NA NA NA -1.2371795 NA NA
10 10 NA NA NA NA NA 1.6819063 NA NA

R: Is there a maximum number of columns allowed in a list

I tried to expand a table with new values, but it failed somewhere after the 100th column. So I did a test with the following code. Only the first 100 columns are added. Is this a limit in R, or can I bypass it somehow?
# Build a list of 150 elements named c1, c2, ..., c150, each holding a single
# NA. This is the same object the 150 separate `x$cN <- NA` assignments
# produce, written as one vectorised expression.
x <- setNames(rep(list(NA), 150L), paste0("c", seq_len(150L)))
View(x)
There is no such limit. Try names(x) and you'll see all the columns are there. Another good function for inspecting objects is str(x) but this output is truncated for such a large object. I don't use View() but it either has a limit (not apparent from the help page) or it's just clunky.

Conditional replacement of specific row values

I have a problem with conditional replacement. Let's assume I have the following code for a dataframe
# Four example rows. rbind() stacks them into one matrix whose row names are
# the variable names a, b, c and d; because a and d are character, the numeric
# rows are converted to text as well.
a <- c("0", "1", "0", "B", "NA", "NA", "NA", "NA", "NA")
b <- c(0, 1, 0, 0, 1, 0, 1, 0, 1)
c <- c(0, 0, 0, 0, 1, 0, 0, 1, 1)
d <- c("0", "1", "0", "0", "1", "0", "B", "NA", "NA")
dat <- setNames(
  data.frame(rbind(a, b, c, d)),
  c("P1", "P2", "P3", "P4", "C1", "C2", "C3", "C4", "C5")
)
Now I want to replace the row values of P1:P4 with NA if one of these values is B and I also want to replace the row values of C1:C5 with NA if one of these values is B. So I want the Dataframe to look like this:
# Desired result. Changed cells: P1:P4 of row a and C1:C3 of row d are now
# "NA", because each of those column groups contained a "B" in that row.
a <- c("NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA")
b <- c(0, 1, 0, 0, 1, 0, 1, 0, 1)
c <- c(0, 0, 0, 0, 1, 0, 0, 1, 1)
d <- c("0", "1", "0", "0", "NA", "NA", "NA", "NA", "NA")
dat <- data.frame(rbind(a, b, c, d))
names(dat) <- c("P1", "P2", "P3", "P4", "C1", "C2", "C3", "C4", "C5")
I hope the problem is understandable and I would appreciate any help.
Considering dat to be the original provided dataframe, I'm providing a comparatively lengthy code for better understanding. Hope it helps.
# For every row of `dat`, set a whole column group (P1:P4 or C1:C5) to NA when
# that group contains a "B" in that row. The two groups are tested
# independently, so a row with a "B" in both groups has both cleared, and a
# row with several "B"s is handled the same way as a row with one.
p_cols <- 1:4  # positions of P1:P4
c_cols <- 5:9  # positions of C1:C5

# TRUE for each row of `dat` that has at least one "B" among `cols`.
# na.rm = TRUE makes cells that are already missing count as "not B".
row_has_b <- function(cols) {
  rowSums(dat[, cols, drop = FALSE] == "B", na.rm = TRUE) > 0
}

dat2 <- dat
dat2[row_has_b(p_cols), p_cols] <- NA
dat2[row_has_b(c_cols), c_cols] <- NA
dat2
# P1 P2 P3 P4 C1 C2 C3 C4 C5
# a <NA> <NA> <NA> <NA> NA NA NA NA NA
# b 0 1 0 0 1 0 1 0 1
# c 0 0 0 0 1 0 0 1 1
# d 0 1 0 0 <NA> <NA> <NA> <NA> <NA>
As I understood it... If the value B is found in columns P1 to P4, then set all the values within P1 to P4 to NA.
You can try:
# Positions of the P columns within `dat`.
nm <- c("P1", "P2", "P3", "P4")
cols <- which(names(dat) %in% nm)
# any() reduces the whole comparison to a single logical value, so one "B"
# anywhere in these columns sets all of them to NA in every row (see the
# printed result below).
dat[,cols][any(dat[,cols] == "B")] <- NA
dat
# P1 P2 P3 P4 C1 C2 C3 C4 C5
# a NA NA NA NA NA NA NA NA NA
# b NA NA NA NA 1 0 1 0 1
# c NA NA NA NA 1 0 0 1 1
# d NA NA NA NA 1 0 B NA NA
If you want to apply this to only the first row, then use dat[1,cols][any(dat[,cols] == "B")] <- NA.

Expand loop over multiple columns in R

I have a table (mydf) as shown below. I would like to use this for loop (my code) in R which works for only one column (for ALT1 column in this instance) to loop over all the columns containing ALT1 through ALTn and store the output in separate variables from final1 through finaln.
The purpose here is to loop over ALT1 through ALTn to match the nucleotide columns (A,C,G,T,N) and get the corresponding values as shown in the result below. Thank you for your help!
mycode
# Join the two inputs on "start", then build one summary string per row of
# the form "chr:start:end-REF(count),ALT1(count)".
result <- merge(coverage.bam, rows.concat.alt, by = "start")

# Preallocate the output instead of growing it from an empty value, and use
# seq_len() so that a zero-row `result` skips the loop (1:nrow would run it
# for i = 1 and i = 0).
final1 <- character(nrow(result))
for (i in seq_len(nrow(result))) {
  # REF and ALT1 hold the name of the column to read the count from.
  ref_col <- as.character(result$REF[i])
  alt_col <- as.character(result$ALT1[i])
  ref_count <- result[, ref_col][i]
  alt_count <- result[, alt_col][i]
  # A missing ALT1 count is dropped, which leaves empty parentheses, e.g. "G()".
  alt_count <- alt_count[!is.na(alt_count)]
  final1[i] <- paste0(
    paste(result$chr[i], result$start[i], result$end[i], sep = ":"),
    "-", result$REF[i], "(", ref_count, ")", ",",
    result$ALT1[i], "(", alt_count, ")"
  )
}
final1
I have tried to expand this code for ALT through ALTn, but it does not work, could you help me solve this please?
# Attempt to generalise the single-column loop: setValue() is meant to build
# the summary strings for whichever column name is passed in as `element`.
final <- list()
setValue<-function(element){
print(element)
for(i in 1:nrow(result)){
# NOTE(review): this assignment happens inside the function, so it modifies a
# local copy of `final`; the function also has no explicit return value, so
# the global `final` is presumably left unchanged - confirm.
final[[i]] = paste(paste(result$chr[i], result$start[i], result$end[i],sep=":"),"-",
result$REF[i],"(",result[,(as.character(result$REF[i]))][i],")",",", result[,element][i],
# NOTE(review): the parentheses on the next line do not balance (one extra
# ")" after the first as.character() call and one missing after the second),
# so this block does not parse as written.
"(",result[,(as.character(result[,element][i])))][i][!is.na(result[,(as.character(result[,element][i])][i])],")",sep="")
}
}
# Call setValue() for every column of `result` whose name contains "ALT".
for(i in colnames(result)){
if(grepl('ALT', i)){
setValue(i)
}
}
mydf
chr start end A C G T N = - REF ALT ALT1 ALT2 ALT3 ALTn
1 chr10 102022031 102022031 NA 34 NA NA NA NA NA C G G NA NA NA
2 chr10 102220574 102220574 2 22 2 3 NA NA NA C AGT A G T NA
3 chr10 115322228 115322228 NA 25 NA NA NA NA NA C A A NA NA NA
4 chr10 122222925 122222925 30 NA NA NA NA NA NA A C C NA NA NA
5 chr10 121111042 121111042 NA 48 NA NA NA NA NA C T T NA NA NA
6 chr10 124444484 124444484 NA 60 NA NA NA NA NA C T T NA NA NA
Result
"chr10:102022031:102022031-C(34),G()" "chr10:102220574:102220574-C(22),A(2),G(2),T(3)" "chr10:115322228:115322228-C(25),A()"
[4] "chr10:122222925:122222925-A(30),C()" "chr10:121111042:121111042-C(48),T()" "chr10:124444484:124444484-C(60),T()"
Try
# Location prefix "chr:start:end" pasted together from the first three columns.
p1 <- do.call(paste,c(mydf[1:3], sep=":"))
# Row-wise over columns 4:8 (A, C, G, T, N) and 11:16 (REF, ALT, ALT1, ALT2,
# ALT3, ALTn); inside the function these are positions 1:5 and 6:11 of `x`.
p2 <- apply(mydf[c(4:8, 11:16)], 1, function(x) {
# Positions among A, C, G, T that are named by any of the ALT* values. Values
# matching none of them give 0, which selects nothing when used as an index.
Un1 <- unique(match( x[7:11], names(x)[1:4], nomatch=0))
# Position of the column whose name equals the REF value.
i1 <- match(x[6], names(x))
# "REF(count)" for the reference nucleotide.
v1 <- paste0(names(x[i1]),'(', x[i1], ')')
# Counts for the matched ALT nucleotides; a missing count becomes ''.
v2 <- as.numeric(x[Un1])
v2[is.na(v2)] <- ''
# "ALT(count)" entries joined by commas.
v3 <-paste(names(x[Un1]), '(', v2, ')', sep='', collapse=",")
paste(v1, v3, sep=",") })
# Join the location prefix and the allele summary with "-".
paste(p1, p2, sep="-")
#[1] "chr10:102022031:102022031-C(34),G()"
#[2] "chr10:102220574:102220574-C(22),A(2),G(2),T(3)"
#[3] "chr10:115322228:115322228-C(25),A()"
#[4] "chr10:122222925:122222925-A(30),C()"
#[5] "chr10:121111042:121111042-C(48),T()"
#[6] "chr10:124444484:124444484-C(60),T()"

Resources