Iteration over data.table - r

I have a table A with column names: Var1, Var2, Var3.
Var1 = c("N1", "N2", "0", "0", "N3", "N4", "0", "0")
Var2 = c("0", "A", "0", "0", "0", "B", "0", "0")
Var3 = c("0", "Yes", "No", "All", "0", "x", "y", "z")
I would like to build vectors from table A, where each vector is named after an entry in Var1 and contains the corresponding values from Var3, e.g. N2 = (Yes, No, All) and N4 = (x, y, z).
I have tried a few approaches with a for loop and if conditions, but without success. Could you please give me a hint?

With data.table:
replace the 0s in Var1 with NA
carry forward the last occurrence of a non-NA value (using, for example, zoo::na.locf, because data.table::nafill doesn't yet work for characters)
filter according to Var1:
library(data.table)

data <- data.table(
  Var1 = c("N1", "N2", "0", "0", "N3", "N4", "0", "0"),
  Var2 = c("0", "A", "0", "0", "0", "B", "0", "0"),
  Var3 = c("0", "Yes", "No", "All", "0", "x", "y", "z")
)

# Mark the placeholder "0" rows as missing so they can be filled below.
# Var1 is a character column, so compare against the string "0" and assign
# a character NA instead of relying on implicit type coercion.
data[Var1 == "0", Var1 := NA_character_]

# Carry the last non-missing label forward. na.rm = FALSE keeps any leading
# NA in place, so the result always has one value per row; with the default
# (na.rm = TRUE) leading NAs are dropped and the lengths no longer match.
data[, Var1 := zoo::na.locf(Var1, na.rm = FALSE)]

data[Var1 == "N2", Var3]
# [1] "Yes" "No"  "All"
data[Var1 == "N4", Var3]
# [1] "x" "y" "z"

Related

Extract specific columns based on target list ,

I have a data set comprising 1000 rows and 5000 columns, and I want to select specific columns ("mydata"). The target columns are listed in the data frame "cols".
I've tried some of the examples posted here, but without success. Could anyone please suggest a solution?
cols[1:5,]
c("S100024810", "S100024905", "S100024920", "S100024923", "S100025437"
)
mydata[1:3,1:5]
structure(list(S100024732 = c("1", "0", "1"), S100024733 = c("-",
"0", "0"), S100024803 = c("0", "0", "-"), S100024810 = c("1",
"1", "1"), S100024817 = c("-", "1", "-")), row.names = c(NA,
3L), class = "data.frame")
mydata[,cols[cols %in% names(mydata)]]
Error in .subset(x, j) : invalid subscript type 'list'
cols appears to be a data frame with one column, so you need to reference that column explicitly.
cols[cols[, 1] %in% names(mydata), 1]
# [1] "S100024810"
To prevent the result from being coerced to a vector when only one column is selected (as happens here), we need to set drop=FALSE.
mydata[, cols[cols[, 1] %in% names(mydata), 1], drop=FALSE]
# S100024810
# 1 1
# 2 1
# 3 1
Data:
cols <- structure(list(V1 = c("S100024810", "S100024905", "S100024920",
"S100024923", "S100025437")), class = "data.frame", row.names = c(NA,
-5L))
mydata <- structure(list(S100024732 = c("1", "0", "1"), S100024733 = c("-",
"0", "0"), S100024803 = c("0", "0", "-"), S100024810 = c("1",
"1", "1"), S100024817 = c("-", "1", "-")), row.names = c(NA,
3L), class = "data.frame")

Plotting multiple binary variables on the same plot in ggplot

I am hoping to use ggplot to construct a barplot of frequencies (or just % 1s) of a bunch of binary variables, and am having trouble getting them all together on one plot.
The variables all stem from the same question in a survey, so ideally it'd be nice to have data that is tidy with one column for this variable, but respondents could select more than one option and I'm hoping to retain that instead of having a "more than one selected" option. Here is a slice of the data:
structure(list(gender = structure(c("Male", "Male", "Female",
"Female", "Female", "Female", "Male", "Male", "Male", "Male"), label = "Q4", format.stata = "%24s"),
var1 = structure(c("0", "0", "1", "1", "0", "0", "0", "0",
"0", "0"), format.stata = "%9s"), var2 = structure(c("0",
"98", "1", "0", "0", "0", "0", "0", "0", "0"), format.stata = "%9s"),
var3 = structure(c("0", "0", "0", "0", "0", "0", "0", "0",
"0", "0"), format.stata = "%9s"), var4 = structure(c("1",
"0", "1", "0", "0", "0", "1", "1", "0", "0"), format.stata = "%9s"),
var5 = structure(c("1", "0", "0", "0", "0", "1", "0", "0",
"0", "0"), format.stata = "%9s")), row.names = c(NA, -10L
), class = c("tbl_df", "tbl", "data.frame"))
Get the data in long format so that it is easier to plot.
library(tidyverse)

df %>%
  # One row per respondent and answer option: `name` holds the original
  # column name (var1, var2, ...) and `value` the recorded response.
  pivot_longer(cols = starts_with("var")) %>%
  group_by(name) %>%
  # Number of respondents who selected each option.
  # For the share of respondents instead, use mean() in place of sum():
  # summarise(frequency_of_1 = mean(value == 1)) %>%
  summarise(frequency_of_1 = sum(value == 1)) %>%
  ggplot(aes(x = name, y = frequency_of_1)) +
  geom_col()
In base R you can do this with colSums and barplot.
barplot(colSums(df[-1] == 1))
#For percentage
#barplot(colMeans(df[-1] == 1))

R for loop on objects in global environment to subset on one variable

Say I have 3 objects (A, B and C) in my global environment:
# Three example data frames. Each one holds character-coded 0/1 indicator
# columns plus a Site column naming the site of every row.
A <- data.frame(
  Var1 = c("0", "0", "1", "0"),
  Var2 = c("1", "0", "0", "0"),
  Var3 = c("0", "1", "1", "1"),
  Site = c("alpha", "alpha", "beta", "gamma")
)

B <- data.frame(
  Var4 = c("0", "0", "1", "1"),
  Var5 = c("1", "0", "0", "1"),
  Site = c("alpha", "beta", "beta", "gamma")
)

C <- data.frame(
  Var6 = c("0", "0", "1"),
  Var7 = c("1", "0", "0"),
  Site = c("alpha", "beta", "gamma")
)
I would like to loop through all the objects in my global environment, subset each one on the "Site" variable, and create datasets named as follows:
# Desired result: one data frame per object/site combination, named
# <object>_<site> and holding only the rows of that site.

# Subsets of A
A_alpha <- A[A$Site == "alpha", ]
A_beta  <- A[A$Site == "beta", ]
A_gamma <- A[A$Site == "gamma", ]

# Subsets of B
B_alpha <- B[B$Site == "alpha", ]
B_beta  <- B[B$Site == "beta", ]
B_gamma <- B[B$Site == "gamma", ]

# Subsets of C
C_alpha <- C[C$Site == "alpha", ]
C_beta  <- C[C$Site == "beta", ]
C_gamma <- C[C$Site == "gamma", ]
What would the loop look like?

set correct column names when output list items to csv in r

Given a list (e.g. out), I want to write each item out to a separate csv file for use elsewhere.
The list is large and contains a lot of items so I wanted to shortcut using a for loop. I created the following to
build a name for each output file based on the data group and date. When I run it everything works except it renames the columns
using the list item name and the existing colnames (e.g. instead of 'week4' I get 'pygweek4'). I do not want it to change my column names.
I tried setting col.names = TRUE, hoping to retain the existing names, and using the code below to specify the names,
as well as setting col.names = FALSE. In all cases I get a warning message saying that "attempt to set 'col.names' ignored".
Can anyone suggest a simple method of retaining the column names I already have?
out <- list(pyg = structure(list(week4 = c("0", "1", "1", "0", "1"),
week5 = c("0", "1", "1", "1", "1"), week6 = c("0", "1", "0", "1", "1"),
week7 = c("0", "0", "0", "1", "1"), week8 = c("0", "1", "0", "1", "1")),
row.names = 281:285, class = "data.frame"),
saw = structure(list(week4 = c("0", "0", "0", "0", "0"),
week5 = c("0", "0", "0", "0", "0"), week6 = c("0", "0", "0", "0", "0"),
week7 = c("0", "0", "0", "0", "0"), week8 = c("0", "0", "0", "0", "1")),
row.names = c(NA, 5L), class = "data.frame"))
for(i in 1:length(out)){
n = paste(paste(names(out)[i],Sys.Date(), sep = "_"), ".csv", sep = "") # create set name and version control
write.csv(out[i], file = n, row.names = FALSE, col.names = c("week4", "week5", "week6", "week7", "week8"))
}
Sorry for the lack of decent tags... I don't have the reputation to set tags that I think are useful for this post and couldn't find ones that made sense in the ones available.
We don't need to specify col.names. The issue is that the list elements are not extracted correctly: it should be [[i]] instead of [i]. With [i], the result is still a list containing one data.frame element; [[i]] extracts the data.frame itself from the list.
# Write every data frame in `out` to its own CSV file in the working
# directory, named "<element name>_<current date>.csv".

# The date is computed once so all files of one run carry the same stamp.
run_date <- Sys.Date()

# Fall back to the element position when the list is unnamed or a name is
# empty, so that distinct elements never end up writing to the same file.
file_labels <- names(out)
if (is.null(file_labels)) {
  file_labels <- rep("", length(out))
}
unnamed <- is.na(file_labels) | file_labels == ""
file_labels[unnamed] <- as.character(seq_along(out)[unnamed])

for (i in seq_along(out)) {
  n <- paste0(file_labels[i], "_", run_date, ".csv")
  # out[[i]] extracts the data frame itself. out[i] would be a one-element
  # list, and write.csv would then prefix every column name with the
  # element name. col.names must not be passed: write.csv ignores it with
  # a warning.
  # quote = FALSE writes the values without surrounding quotes; this is
  # only safe while no value contains a comma, a quote or a line break.
  write.csv(out[[i]], file = n, row.names = FALSE, quote = FALSE)
}
The difference can be found from checking the str
str(out[[1]])
str(out[1])

R code challenge: retrieving the values in matching columns and sum them up with matching rows

I have a problem solving this in R. I have this data frame called testa (dput included). I need to match all the letters in column ALT with the colnames (A,C,G,T,N) and get the corresponding values in those column along with the value for REF letters and get the result ad.new (my code does this job).
However, I need to extend this code to handle rows where the TYPE column ends with "flat". For such a flat row, I need to match its start id (chr10:102053031) against the other ids in the start column. For the rows that match, I need to sum the ALT values taken from the A, C, G, T, N columns and use that sum, together with the REF value, as the ad.new entry for the flat row.
If you run the dput and my code you will be able to understand it. Basically, I want to match the letters in the REF and ALT columns to the columns (A, C, G, T, N), take the corresponding values, and join the REF and ALT values with a comma. However (in this example), for the flat row I want to take the ALT values of the other rows whose start id matches the flat row's start id (6 from column A for one match and 7 from column G for the other) and add them together to give 13. So for the flat row my result should be 0,13.
The expected result is also shown below.
my incomplete code:
# Treat missing entries as zero counts.
testa[is.na(testa)] <- 0

# For every row, look up the count stored in the column named by that row's
# REF letter. Indexing the columns with the whole REF vector yields one
# column per row, so the diagonal holds the row-matched values.
# Note: the name is spelled ref.counts throughout; R is case sensitive, so
# mixing it with Ref.counts fails with "object 'Ref.counts' not found".
ref.counts <- testa[, testa[, "REF"]]
ref.counts <- as.matrix(ref.counts)
ref.counts[is.na(ref.counts)] <- 0
ref.counts <- diag(ref.counts)

# Same lookup for the ALT letter of every row.
alt.counts <- testa[, testa[, "ALT"]]
alt.counts <- as.matrix(alt.counts)
alt.counts[is.na(alt.counts)] <- 0
alt.counts <- diag(alt.counts)

#############
## need to extend this code here
#############

# One "REF,ALT" string per row.
ad.new <- paste(ref.counts, alt.counts, sep = ",")
dput for testa:
structure(c("chr10:101544447", "chr10:102053031", "chr10:102778767",
"chr10:102789831", "chr10:102989480", "chr10:102053031", "chr10:102053031",
"0", "6", "0", "0", "0", "0", "0", "0", "34", "24", "0", "0",
"34", "34", "0", "0", "0", "0", "0", "0", "7", "53", "0", "0",
"30", "12", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0",
"0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0",
"chr10", "chr10", "chr10", "chr10", "chr10", "chr10", "chr10",
"101544447", "102053031", "102778767", "102789831", "102989480",
"102053031", "102053031", "A", "C", "C", "C", "C", "C", "C",
"T", "A", "T", "T", "T", "G", "G", "snp", "snp", "snp", "snp",
"snp", "snp:102053031:flat", "snp", "nonsynonymous SNV",
"intronic", "nonsynonymous SNV", "nonsynonymous SNV", "ncRNA_exonic",
"intronic", "intronic", "ABCC2:NM_000392:exon2:c.A116T:p.Y39F,",
"PKD2L1", "PDZD7:NM_024895:exon8:c.G1136A:p.R379Q,PDZD7:NM_001195263:exon8:c.G1136A:p.R379Q,",
"PDZD7:NM_024895:exon2:c.G146A:p.R49Q,PDZD7:NM_001195263:exon2:c.G146A:p.R49Q,",
"LBX1-AS1", "PKD2L1", "PKD2L1"), .Dim = c(7L, 15L), .Dimnames = list(
c("1", "2", "3", "4", "5", "6", "7"), c("start", "A", "C",
"G", "T", "N", "=", "-", "chr", "end", "REF", "ALT", "TYPE",
"refGene::location", "refGene::type")))
Expected result
ad.new
"0,53"
"34,6"
"24,0"
"0,30"
"0,12"
"0,13"
"34,7"
Something like this should work :
# Apply the "normal" rule first (ignoring the flat exceptions): for every
# row take the count from the column named by its ALT / REF letter.
# Indexing the columns with the whole letter vector gives one column per
# row, so the diagonal holds the row-matched values.
alts <- as.numeric(diag(testa[, testa[, "ALT"]]))
refs <- as.numeric(diag(testa[, testa[, "REF"]]))
res <- paste(refs, alts, sep = ",")

# Replace the entries of rows whose TYPE ends with "flat".
flats <- grep(".*flat$", testa[, "TYPE"])
# vapply always returns a character vector (character(0) when there are no
# flat rows), so the assignment below is then simply a no-op.
res[flats] <- vapply(flats, function(x) {
  startId <- testa[x, "start"]
  # All rows sharing this start id, excluding the flat row `x` itself.
  selection <- setdiff(which(testa[, "start"] == startId), x)
  # The REF part of a flat row is fixed to 0; the ALT part is the sum of
  # the ALT counts of the matching rows.
  paste0("0,", sum(alts[selection]))
}, character(1))

ad.new <- as.matrix(res)
> ad.new
[,1]
[1,] "0,53"
[2,] "34,6"
[3,] "24,0"
[4,] "0,30"
[5,] "0,12"
[6,] "0,13"
[7,] "34,7"

Resources