How to set an assignment to fill a subset by row - r

While cleaning up a data frame I found out that assignment into a subset works by column and not by row, an unfortunate result when doing dataset cleanup, as you typically search for problem cases and then apply your correction across multiple rows.
# example table
releves <- structure(list(cult2015 = c("bp", "bp"), prec2015 = c("?", "?"
)), .Names = c("cult2015", "prec2015"), row.names = c(478L, 492L
), class = "data.frame")
# assignement to a subset
iBad2 <- which(releves$cult2015 == "bp" & releves$prec2015 == "?")
releves[iBad2,c("cult2015","prec2015")] <- c("b","p")
I understand that the "filling" of the matrices is done by columns and hence, the repetition of the provided vector is done on each column but is there any option to get: "b", "p" on each line and not:
> releves
cult2015 prec2015
478 b b
492 p p

I wrote the following function that does the job, at least in the cases I faced:
# Assign newVals to dat[rows, cols] so that a short vector of values is laid
# out along each selected row instead of down each column.
#
# dat     : data frame (or matrix) to modify.
# rows    : row index; integer positions, negative positions, a logical mask
#           or row names.
# cols    : column index; same kinds of index as `rows`.
# newVals : replacement values. A dimensionless vector shorter than the
#           selection is repeated so every selected row gets newVals[1] in the
#           first selected column, newVals[2] in the second, and so on.
#           Anything with dimensions, or already long enough to fill the
#           selection, is assigned unchanged.
#
# Returns the modified copy of dat; the caller's object is not altered.
AssignToSubsetByRow <- function(dat,rows,cols,newVals){
  # Number of positions an index really selects. length(idx) is only right
  # for positive integer indices: a logical mask has one entry per row of the
  # table and negative indices list the rows to drop, so resolve the index
  # against the actual positions (named, so character indices work too).
  countSelected <- function(idx, n, nms){
    positions <- seq_len(n)
    if(length(nms) == n){
      names(positions) <- nms
    }
    length(positions[idx])
  }
  nRows <- countSelected(rows, nrow(dat), rownames(dat))
  nCols <- countSelected(cols, ncol(dat), colnames(dat))
  # && because this is a scalar condition.
  if(is.null(dim(newVals)) && nRows*nCols > length(newVals)){
    # Subset assignment fills column by column, so repeating each value once
    # per selected row puts newVals[k] in every row of the k-th column.
    fullRep <- rep(newVals,each=nRows)
  }else{
    fullRep <- newVals
  }
  dat[rows,cols] <- fullRep
  return(dat)
}
And doing the job fine:
releves <- AssignToSubsetByRow(releves,iBad2,c("cult2015","prec2015"),c("b","p"))
> releves
cult2015 prec2015
478 b p
492 b p

Related

Calculating fraction, which is in column as string

I have a data.frame like this
z <- structure(list(ID = c("R-HSA-977606", "R-HSA-977443", "R-HSA-166658",
"R-HSA-166663", "R-HSA-1236394", "R-HSA-390522", "R-HSA-3232118",
"R-HSA-1630316", "R-HSA-112315", "R-HSA-112314"), GeneRatio = c("6/189",
"6/189", "6/189", "4/189", "5/189", "4/189", "3/189", "7/189",
"11/189", "9/189")), row.names = c("R-HSA-977606", "R-HSA-977443",
"R-HSA-166658", "R-HSA-166663", "R-HSA-1236394", "R-HSA-390522",
"R-HSA-3232118", "R-HSA-1630316", "R-HSA-112315", "R-HSA-112314"
), class = "data.frame")
Is it possible to add a 3rd column with the ratio from the 2nd column calculated? i.e. 6/189=0.0317. So in the third column I should have 0.0317.
As it is a string expression, we can use eval/parse
z$newColumn <- sapply(z$GeneRatio, function(x) eval(parse(text = x)))
-output
> z
ID GeneRatio newColumn
R-HSA-977606 R-HSA-977606 6/189 0.03174603
R-HSA-977443 R-HSA-977443 6/189 0.03174603
R-HSA-166658 R-HSA-166658 6/189 0.03174603
R-HSA-166663 R-HSA-166663 4/189 0.02116402
R-HSA-1236394 R-HSA-1236394 5/189 0.02645503
R-HSA-390522 R-HSA-390522 4/189 0.02116402
R-HSA-3232118 R-HSA-3232118 3/189 0.01587302
R-HSA-1630316 R-HSA-1630316 7/189 0.03703704
R-HSA-112315 R-HSA-112315 11/189 0.05820106
R-HSA-112314 R-HSA-112314 9/189 0.04761905
Or a faster option would be to split by / (or use read.table to create two columns and then divide), assuming the expression includes only division:
z$newColumn <- Reduce(`/`, read.table(text = z$GeneRatio,
header = FALSE, sep = "/"))
This code could be refined but it will work with the eval function
# 1- Create the new column: one numeric NA per row (also valid for 0 rows)
z$GeneRatioNum <- rep(NA_real_, nrow(z))
# 2- Fill it by evaluating each "a/b" string as an R expression.
#    seq_len() yields an empty sequence for a 0-row table, whereas 1:nrow(z)
#    would iterate over c(1, 0).
#    NOTE: eval(parse()) runs whatever code the string contains; only use it
#    on trusted data.
for (i in seq_len(nrow(z))) {
  z$GeneRatioNum[i] <- eval(parse(text = z$GeneRatio[i]))
}

Trying to convert Ensembl ID to gene name in R (biomaRt)

I have a large dataset of gene expression data and I'm trying to convert the gene identifiers into gene names using biomaRt in RStudio, but for some reason when I use the merge function on my data frames, my entire data table is merged wrong/erased. I've looked at the previous questions here, but no matter what I try, my code doesn't seem to work properly. Thank you infinitely!
library(biomaRt)
resdata <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)), by="row.names", sort=FALSE)
names(resdata)[1] <- "genes"
head(resdata)
## Write results
resdata <- resdata[complete.cases(resdata), ]
dim(resdata)
The problems start here:
#to convert gene accession number to gene name
charg <- resdata$genes
head(charg)
charg2 = sapply(strsplit(charg, '.', fixed=T), function(x) x[1])
ensembl = useMart("ensembl",dataset="hsapiens_gene_ensembl")
theBM = getBM(attributes='hgnc_symbol',
filters = 'ensembl_gene_id',
values = charg2,
mart = ensembl)
resdata <- merge.data.frame(resdata, theBM, by.x="genes",by.y="hgnc_symbol")
# a <- c(resdata[3])
# counts_resdata <-counts[resdata$ensembl_gene_id,]
# row.names(counts_resdata) <- resdata[,"V1"]
# cal_z_score <- function(x){
# (x - mean(x)) / sd(x)
# }
write.csv(resdata, file="diffexprresultsHEK.csv")
dev.off()
> dput(head(resdata))
structure(list(genes = structure(c("ENSG00000261150.2", "ENSG00000164877.18",
"ENSG00000120334.15", "ENSG00000100906.10", "ENSG00000182759.3",
"ENSG00000124145.6"), class = "AsIs"), baseMean = c(4093.85581350533,
2362.58393155573, 3727.90538524843, 6269.83601940967, 1514.2066991352,
4802.56186913745), log2FoldChange = c(-7.91660950515258, -5.26346217291626,
3.32325541003148, 2.95482654632078, -5.67082078657074, 2.79396304109662
), lfcSE = c(0.192088463317979, 0.149333035266368, 0.105355230912976,
0.097569264524605, 0.194208068005162, 0.0965853229316347), stat = c(-41.2133522670104,
-35.2464688307429, 31.5433356391815, 30.2843990955331, -29.1997178326289,
28.9274079776516), pvalue = c(0, 3.88608699685236e-272, 2.21307385030673e-218,
1.83983881587879e-201, 1.95527687476496e-187, 5.40010609376884e-184
), padj = c(0, 3.9601169541424e-268, 1.50348860477005e-214, 9.3744387266064e-198,
7.97009959691694e-184, 1.83432603828505e-180), `HEK-FUS1-1.counts` = c(8260.9703617894,
5075.51515177084, 665.085490083024, 1513.61286043731, 3440.18729968435,
1262.3583419615), `HEK-FUS1-2.counts` = c(8046.96326903085, 4134.79795973702,
690.697680591815, 1346.52518701783, 2499.92325557892, 1154.73922910593
), `HEK-H149A-1.counts` = c(34.3284200812733, 113.825813953696,
6450.12945737609, 10806.2252897945, 60.5264248801398, 8302.96076228903
), `HEK-H149A-2.counts` = c(33.1612031197744, 126.196800761364,
7105.70891294277, 11412.980740389, 56.1898163973955, 8490.18914319335
)), row.names = c(NA, 6L), class = "data.frame")
Here's some output (where I'm struggling):
> head(charg)
[1] "ENSG00000261150.2" "ENSG00000164877.18" "ENSG00000120334.15"
[4] "ENSG00000100906.10" "ENSG00000182759.3" "ENSG00000124145.6"
> dim(theBM)
[1] 0 1
> head(theBM)
[1] ensembl_gene_id
<0 rows> (or 0-length row.names)
> dim(resdata)
[1] 20381 11
> resdata <- merge.data.frame(resdata, theBM, by.x="genes",by.y="ensembl_gene_id")
> dim(resdata) #after merge
[1] 0 11 #isn't correct -- just row names! where'd my genes go?
Edit: Problems solved! Turns out I was referencing getBM wrong. Thank you all!
If you want to just overwrite the Ensembl IDs with the HGNC symbols you can do it in one step:
library(biomaRt)
names(resdata)[1] <- "genes"
head(resdata)
## Write results
resdata <- resdata[complete.cases(resdata), ]
dim(resdata)
charg <- as.character(resdata$genes)
head(charg)
# Drop the version suffix ("ENSG00000261150.2" -> "ENSG00000261150"): keep the
# text before the first dot.
charg2 <- vapply(strsplit(charg, ".", fixed = TRUE), function(x) x[1], character(1))
ensembl <- useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")
# Ask for the Ensembl ID together with the symbol. getBM() returns only the IDs
# it could match, in its own order, so its result cannot be assigned to the
# column directly; it has to be aligned back to the rows of resdata by ID.
theBM <- getBM(attributes = c("ensembl_gene_id", "hgnc_symbol"),
               filters = "ensembl_gene_id",
               values = unique(charg2),
               mart = ensembl)
symbols <- theBM$hgnc_symbol[match(charg2, theBM$ensembl_gene_id)]
# Keep the original Ensembl ID wherever no (non-empty) symbol was returned.
hasSymbol <- !is.na(symbols) & nzchar(symbols)
resdata[[1]] <- ifelse(hasSymbol, symbols, charg)
resdata
(This keeps Log2FC as column 3, which looks right based on the next steps in your pipeline, but if you want something different let me know and I'll update my answer to suit)

for loops and if conditional applied to data frames

I have two data frames, data1 and data2, read from CSV files, of dimensions n1*n2 and m1*m2. I would like to create a new data frame consisting of differences: if (and only if)
data1[i,1] = data2[j,1] & data1[i,3] = data2[j,3]
then I want to consider
difference[i,z] <- abs(data1[i,x]-data2[i,y])
Is it possible to do this in a simple manner, for instance using for/if?
difference <- matrix(nrow = max{n1,m1}, ncol = 3)
for (i in 1:n1) {
for (j in 1:m1) {
if(data1[i,1] == data2[j,1] & data1[i,3] == data2[j,3]){
difference[i,1] = data1[i,1]
difference[i,2] = data1[i,3]
difference[i,3] = data1[i,6]-data2[j,7]
}
}
This code is obviously far from being complete and I have several issues:
(1) I don't know if it is realizable using for loops/if conditional. If yes, being unfamiliar with R, I'm not sure if I need to put a 'print(something)' at the end of the loops.
(2) data1/2[i,1] is of type character. Hence I'm not sure if
data1[i,1] == data2[j,1] & data1[i,3] == data2[j,3]
is well-defined.
(3) The 'difference' matrix/frame should have as many rows as the number of i's and j's where
data1[i,1] = data2[j,1] & data1[i,3] = data2[j,3]
I do not know what this number is. Therefore I cannot really specify the size of 'difference'.
EDIT:
data1 = read.csv("path/to/data1.csv") ## Prices of 157 products each at
## 122 time points; (column1=Product, column3=date, column7=price)
data2 = read.csv("path/to/data2.csv") ## Prices of 118 products each at
## 122 time points; (column1=Product, column3=date, column6=price)
## the 122 time points are the same for both frames
## But: data1 contains some products data2 doesn't and vice versa
## I want to compare prices of the same products at the same time
So far, I've done it manually for product X1:
priceX1 = as.data.frame(data1[c(1,122),7])
priceX2 = as.data.frame(data2[c(5,126),6]) ## Product X2 starts at row 5
differenceX1 <- abs(priceX1 - priceX2)
The problem is I'd have to repeat this for all products contained in both data1 and data2.
RE-EDIT: dput(data1) returns
...), class = "factor"),
COMMENT = c(NA, ..., NA)), .Names = c("PRODUCT", "QUALIFIER_I",
"DATE", "QUALIFIER_II", "QUOTATION_DATE", "PROD_DATE", "PRICE",
"TYPE", "ID", "COMMENT"), row.names = c(NA, 14400L), class
= "data.frame")
"..." stands for me omitting a long list of products that couldn't fit here.
dput(data2) returns
..., NA, NA, NA)), .Names = c("PRODUCT", "QUALIFIER_II",
"DATE", "QUALIFIER_I", "Data2_source", "PRICE"), row.names = c(NA,
19161L), class = "data.frame")
"..." stand for me omitting a huge list of prices that couldn't fit in here.
You can find all pairs (i,j) which satisfy your condition by merging the two data.frames:
differences = merge(data1, data2, by=c('PRODUCT','DATE'))
This avoids for-loops entirely, and you can easily define the new column:
differences$Diff = abs(differences$PRICE.x - differences$PRICE.y)

Heatmap of gene subset from microarray expression data in R

I have a microarray dataset from the Illumina BeadChip platform which I have been using to examine differential expression between 3 treatment groups. Following background subtraction and normalisation I have an object of class "Elist", represented as below.
$E
A B C D E F
ILMN_1 9.678162 9.635665 9.420577 9.778417 9.521473 9.820778
ILMN_2 11.458221 11.152161 11.158666 11.410278 11.416522 11.377062
ILMN_3 9.385075 9.087426 9.230654 9.704379 9.720282 9.482488
ILMN_4 9.909423 9.115123 9.693177 10.348670 9.896625 9.729896
ILMN_5 11.826927 12.067796 12.165630 12.256113 12.061949 12.213470
$genes
SYMBOL
ILMN_1 Gene 1
ILMN_2 Gene 2
ILMN_3 Gene 3
ILMN_4 Gene 4
ILMN_5 Gene 5
I would now like to create an object of "Elist" class which includes only a subset of genes selected by their gene symbol with a view to generating a heatmap of the subset. ( I should be able to manage the heatmap from there)
eg
$E
A B C D E F
ILMN_2 11.458221 11.152161 11.158666 11.410278 11.416522 11.377062
ILMN_4 9.909423 9.115123 9.693177 10.348670 9.896625 9.729896
$genes
SYMBOL
ILMN_2 Gene 2
ILMN_4 Gene 4
I have tried
subset = Elist[Elist$genes == c("gene 2", "gene4"), ]
but this seems to only generate a subset of the first gene in the vector or occasionally several rows of NAs. If I insert just one gene into the vector it works fine.
subset = Elist[Elist$genes %in% c("gene 2", "gene4"), ]
returns an object of Elist class with no rows.
Any help much appreciated. (any advice on how to post the question better appreciated too!)
Many thanks - Vincent's answer works very well - the solution was
subset = Eset[ Eset$genes$SYMBOL %in% c("Gene2", "Gene4"), ]
I would now like to make a heatmap of the gene subset firstly being able to order the columns myself into treatment groups and secondly replacing the row names with gene names rather than the probe name.
I am able to remove the clustering order using Colv but unable to get any further
heatmap.2(Subset$E, Colv = FALSE, Rowv = FALSE)
Any help much appreciated.
Let's call this object expr, instead of EList (the name of the class itself):
require(limma)
expr <- new("EList"
, .Data = list(structure(list(A = c(9.678162, 11.458221, 9.385075, 9.909423, 11.826927),
B = c(9.635665, 11.152161, 9.087426, 9.115123, 12.067796),
C = c(9.420577, 11.158666, 9.230654, 9.693177, 12.16563),
D = c(9.778417, 11.410278, 9.704379, 10.34867, 12.256113),
E = c(9.521473, 11.416522, 9.720282, 9.896625, 12.061949),
F = c(9.820778, 11.377062, 9.482488, 9.729896, 12.21347)),
.Names = c("A", "B", "C", "D", "E", "F"),
class = "data.frame",
row.names = c("ILMN_1", "ILMN_2", "ILMN_3", "ILMN_4", "ILMN_5")),
structure(list(SYMBOL = c("Gene1","Gene2", "Gene3", "Gene4", "Gene5")),
.Names = "SYMBOL",
row.names = c("ILMN_1","ILMN_2", "ILMN_3", "ILMN_4", "ILMN_5"),
class = "data.frame")))
We would like to select in the object the rows corresponding to genes 2 and 4.
A previous comment pointed to the right direction, the following should normally work:
expr[ expr$genes$SYMBOL %in% c("Gene2", "Gene4"), ]
Am I missing a question about heatmaps, I don't see any?

R - convert a data frame to a data set formatted as featureName:featureValue [duplicate]

This question already has answers here:
read/write data in libsvm format
(7 answers)
Closed 8 years ago.
It turns out the format I wanted is called "SVM-Light" and is described here http://svmlight.joachims.org/.
I have a data frame that I would like to convert to a text file with format as follows:
output featureIndex:featureValue ... featureIndex:featureValue
So for example:
t = structure(list(feature1 = c(3.28, 6.88), feature2 = c(0.61, 1.83
), output = c("1", "-1")), .Names = c("feature1", "feature2",
"output"), row.names = c(NA, -2L), class = "data.frame")
t
# feature1 feature2 output
# 1 3.28 0.61 1
# 2 6.88 1.83 -1
would become:
1 feature1:3.28 feature2:0.61
-1 feature1:6.88 feature2:1.83
My code so far:
# Number of leading columns of t that hold features
nvars <- 2
# One output line per row of t ("row" is only a placeholder value)
l <- array("row", nrow(t))
for (i in seq_len(nrow(t))) {
  # Start line i with the output label. Index with [i]: assigning to l itself
  # would replace every line built so far with a single value.
  l[i] <- t$output[i]
  for (n in seq_len(nvars)) {
    # Append "featureName:featureValue" for column n of row i
    thisFeatureString <- paste(names(t)[n], t[[names(t)[n]]][i], sep = ":")
    l[i] <- paste(l[i], thisFeatureString)
  }
}
but I am not sure how to complete and write the results to a text file.
Also the code is probably not efficient.
Is there a library function that does this? as this kind of output format seems common for Vowpal Wabbit for example.
I couldn't find a ready-made solution, although the SVM-light data format seems to be widely used.
Here is a working solution (at least in my case):
############### CONVERT DATA TO SVM-LIGHT FORMAT ##################################
# data_frame MUST have a column 'target'
# target values are assumed to be -1 or 1
# all other columns are treated as features, numbered 1, 2, ... in the order
# they appear in data_frame (the position of 'target' does not matter)
#
# Returns a one-dimensional character array with one element per row of
# data_frame, each of the form "target 1:value 2:value ...".
# Stops with an error if data_frame has no 'target' column.
###################################################################################
ConvertDataFrameTo_SVM_LIGHT_Format <- function(data_frame)
{
  if (!("target" %in% names(data_frame))) {
    stop("data_frame must have a column named 'target'", call. = FALSE)
  }
  # Derive the feature columns from the data itself rather than from a global
  # 'nvars' variable.
  feature_names <- setdiff(names(data_frame), "target")
  n_rows <- nrow(data_frame)
  # paste() turns a zero-length argument into "", so an empty table must be
  # handled before any pasting.
  if (n_rows == 0) {
    return (array(character(0), 0))
  }
  # each line starts with the target value (as.character keeps factor labels)
  l <- as.character(data_frame$target)
  # then append, for every feature column at once, its feature index (n) and
  # the feature values of all rows
  for (n in seq_along(feature_names)) {
    thisFeatureString <- paste(n, data_frame[[feature_names[n]]], sep = ":")
    l <- paste(l, thisFeatureString)
  }
  # keep the one-dimensional array shape of the result
  return (array(l, n_rows))
}
###################################################################################
If you don't mind not having the column names in the output, I think you could use a simple apply to do that:
apply(t, 1, function(x) paste(x, collapse=" "))
#[1] "3.28 0.61 1" "6.88 1.83 -1"
And to adjust the order of appearance in the output to your function's output you could do:
apply(t[c(3, 1, 2)], 1, function(x) paste(x, collapse=" "))
#[1] "1 3.28 0.61" "-1 6.88 1.83"

Resources