I have a large dataset of gene expression data and I'm trying to convert the gene identifiers into gene names using biomaRt in RStudio, but for some reason, when I use the merge function on my data frames, my entire data table ends up merged incorrectly or emptied. I've looked at the previous questions here, but no matter what I try, my code doesn't seem to work properly. Thank you infinitely!
library(biomaRt)
resdata <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)), by="row.names", sort=FALSE)
names(resdata)[1] <- "genes"
head(resdata)
## Write results
resdata <- resdata[complete.cases(resdata), ]
dim(resdata)
The problems start here:
#to convert gene accession number to gene name
charg <- resdata$genes
head(charg)
charg2 = sapply(strsplit(charg, '.', fixed=T), function(x) x[1])
ensembl = useMart("ensembl",dataset="hsapiens_gene_ensembl")
theBM = getBM(attributes='hgnc_symbol',
filters = 'ensembl_gene_id',
values = charg2,
mart = ensembl)
resdata <- merge.data.frame(resdata, theBM, by.x="genes",by.y="hgnc_symbol")
# a <- c(resdata[3])
# counts_resdata <-counts[resdata$ensembl_gene_id,]
# row.names(counts_resdata) <- resdata[,"V1"]
# cal_z_score <- function(x){
# (x - mean(x)) / sd(x)
# }
write.csv(resdata, file="diffexprresultsHEK.csv")
dev.off()
> dput(head(resdata))
structure(list(genes = structure(c("ENSG00000261150.2", "ENSG00000164877.18",
"ENSG00000120334.15", "ENSG00000100906.10", "ENSG00000182759.3",
"ENSG00000124145.6"), class = "AsIs"), baseMean = c(4093.85581350533,
2362.58393155573, 3727.90538524843, 6269.83601940967, 1514.2066991352,
4802.56186913745), log2FoldChange = c(-7.91660950515258, -5.26346217291626,
3.32325541003148, 2.95482654632078, -5.67082078657074, 2.79396304109662
), lfcSE = c(0.192088463317979, 0.149333035266368, 0.105355230912976,
0.097569264524605, 0.194208068005162, 0.0965853229316347), stat = c(-41.2133522670104,
-35.2464688307429, 31.5433356391815, 30.2843990955331, -29.1997178326289,
28.9274079776516), pvalue = c(0, 3.88608699685236e-272, 2.21307385030673e-218,
1.83983881587879e-201, 1.95527687476496e-187, 5.40010609376884e-184
), padj = c(0, 3.9601169541424e-268, 1.50348860477005e-214, 9.3744387266064e-198,
7.97009959691694e-184, 1.83432603828505e-180), `HEK-FUS1-1.counts` = c(8260.9703617894,
5075.51515177084, 665.085490083024, 1513.61286043731, 3440.18729968435,
1262.3583419615), `HEK-FUS1-2.counts` = c(8046.96326903085, 4134.79795973702,
690.697680591815, 1346.52518701783, 2499.92325557892, 1154.73922910593
), `HEK-H149A-1.counts` = c(34.3284200812733, 113.825813953696,
6450.12945737609, 10806.2252897945, 60.5264248801398, 8302.96076228903
), `HEK-H149A-2.counts` = c(33.1612031197744, 126.196800761364,
7105.70891294277, 11412.980740389, 56.1898163973955, 8490.18914319335
)), row.names = c(NA, 6L), class = "data.frame")
Here's some output (where I'm struggling):
> head(charg)
[1] "ENSG00000261150.2" "ENSG00000164877.18" "ENSG00000120334.15"
[4] "ENSG00000100906.10" "ENSG00000182759.3" "ENSG00000124145.6"
> dim(theBM)
[1] 0 1
> head(theBM)
[1] ensembl_gene_id
<0 rows> (or 0-length row.names)
> dim(resdata)
[1] 20381 11
> resdata <- merge.data.frame(resdata, theBM, by.x="genes",by.y="ensembl_gene_id")
> dim(resdata) #after merge
[1] 0 11 #isn't correct -- just row names! where'd my genes go?
Edit: Problems solved! Turns out I was referencing getBM wrong. Thank you all!
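For anyone hitting the same wall, here is a minimal sketch of the kind of getBM() call that works (this is my guess at the fix, since the edit doesn't show it): request both ensembl_gene_id and hgnc_symbol, then merge on the unversioned Ensembl IDs rather than on the symbol column.
library(biomaRt)
# assumption: resdata$genes holds versioned IDs such as "ENSG00000261150.2"
resdata$ensembl_gene_id <- sub("\\..*$", "", resdata$genes)   # strip ".2", ".18", ...
ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")
theBM <- getBM(attributes = c("ensembl_gene_id", "hgnc_symbol"),  # ask for both columns
               filters    = "ensembl_gene_id",
               values     = resdata$ensembl_gene_id,
               mart       = ensembl)
# both data frames now share an ensembl_gene_id column, so the merge keeps the rows
resdata <- merge(resdata, theBM, by = "ensembl_gene_id", all.x = TRUE)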
If you just want to overwrite the Ensembl IDs with the HGNC symbols you can do it in one step:
library(biomaRt)
names(resdata)[1] <- "genes"
head(resdata)
## Write results
resdata <- resdata[complete.cases(resdata), ]
dim(resdata)
charg <- resdata$genes
head(charg)
charg2 = sapply(strsplit(charg, '.', fixed=T), function(x) x[1])
ensembl = useMart(biomart = "ensembl", dataset="hsapiens_gene_ensembl")
resdata[1] = getBM(attributes='hgnc_symbol',
filters = 'ensembl_gene_id',
values = charg2,
mart = ensembl)
resdata
(This keeps Log2FC as column 3, which looks right based on the next steps in your pipeline, but if you want something different let me know and I'll update my answer to suit)
I have a data column that contains a bunch of ranges as strings (e.g. "2 to 4", "5 to 6", "7 to 8" etc.). I'm trying to create a new column that converts each of these values to a random number within the given range. How can I leverage conditional logic within my function to solve this problem?
I think the function should be something along the lines of:
df<-mutate(df, c2=ifelse(df$c=="2 to 4", sample(2:4, 1, replace=TRUE), "NA"))
This should produce a new column in my dataset that replaces all the values of "2 to 4" with a random integer between 2 and 4; however, it is not working and replaces every value with "NA".
Ideally, I am trying to do something where the dataset:
df<-c("2 to 4","2 to 4","5 to 6")
Would add a new column:
c2 <- c("3","2","5")
Does anyone have any idea how to do this?
We can split the string on "to", create a range between the two numbers after converting them to numeric, and then use sample to select any one of the numbers in the range.
df$c2 <- sapply(strsplit(df$c1, "\\s+to\\s+"), function(x) {
vals <- as.integer(x)
sample(vals[1]:vals[2], 1)
})
df
# c1 c2
#1 2 to 4 2
#2 2 to 4 3
#3 5 to 6 5
data
df<- data.frame(c1 = c("2 to 4","2 to 4","5 to 6"), stringsAsFactors = FALSE)
We can do this easily with sub. Replace the " to " with ":", evaluate to get the sequence, then take a sample of 1 from it.
df$c2 <- sapply(sub(" to ", ":", df$c1), function(x)
sample(eval(parse(text = x)), 1))
df
# c1 c2
#1 2 to 4 4
#2 2 to 4 3
#3 5 to 6 5
Or with gsubfn
library(gsubfn)
as.numeric(gsubfn("(\\d+) to (\\d+)", ~ sample(seq(as.numeric(x),
as.numeric(y), by = 1), 1), df$c1))
Or with read.csv/Map from base R
sapply(do.call(Map, c(f = `:`, read.csv(text = sub(" to ", ",", df$c1),
header = FALSE))), sample, 1)
data
df <- structure(list(c1 = c("2 to 4", "2 to 4", "5 to 6")),
class = "data.frame", row.names = c(NA, -3L))
I have downloaded some GDP data in .xls format from the OECD website. However, to make this data workable in R, I need to reformat it to a .csv file. More specifically, I need the year, month and day in the first column, and after the comma I need the GDP values (for example: 1990-01-01, 234590).
The column with GDP values can be easily copied and transposed, but how does one quickly add dates? Is there a fast way to do this, without having to add in the dates manually?
Thanks for the help!
Best,
Sean
PS. Link to (one of) the specific OECD files: https://ufile.io/8ogav or https://stats.oecd.org/index.aspx?queryid=350#
PPS. I have now changed the file to this (see the dput below), which I would like to transform into the same style as example 1.
Codes that I use for reading in data:
gdp.start <- c(1970,1) # type "double"
gdp.end <- c(2018,1)
gdp.raw <- "rawData/germany_gdp.csv"
gdp.table <- read.table(gdp.raw, skip = 1, header = F, sep = ',', stringsAsFactors = F)
gdp.ger <- ts(gdp.table[,2], start = gdp.start, frequency = 4) # time-series representation
PPPS.
dput(head(gdp.table))
structure(list(V1 = c("Q2-1970;1.438.810 ", "Q3-1970;1.465.684 ",
"Q4-1970;1.478.108 ", "Q1-1971;1.449.712 ", "Q2-1971;1.480.136 ",
"Q3-1971;1.505.743 ")), row.names = c(NA, 6L), class = "data.frame")
Using your data:
z <- structure(list(V1 = c("Q2-1970;1.438.810 ", "Q3-1970;1.465.684 ",
"Q4-1970;1.478.108 ", "Q1-1971;1.449.712 ", "Q2-1971;1.480.136 ",
"Q3-1971;1.505.743 ")), row.names = c(NA, 6L), class = "data.frame")
dat <- read.csv2(text=paste(z$V1, collapse='\n'), stringsAsFactors=FALSE, header=FALSE)
dat
# V1 V2
# 1 Q2-1970 1.438.810
# 2 Q3-1970 1.465.684
# 3 Q4-1970 1.478.108
# 4 Q1-1971 1.449.712
# 5 Q2-1971 1.480.136
# 6 Q3-1971 1.505.743
and a simple function to replace quarters with the first date of each quarter
quarters <- function(s, format) {
qs <- c("Q1","Q2","Q3","Q4")
dts <- c("01-01", "04-01", "07-01", "10-01")
for (i in seq_along(qs))
s <- sub(qs[i], dts[i], s)
if (! missing(format))
s <- as.Date(s, format=format)
s
}
We can change them into strings of dates, preserving the order:
str(quarters(dat$V1))
# chr [1:6] "04-01-1970" "07-01-1970" "10-01-1970" "01-01-1971" ...
or we can convert into Date objects by setting the format:
str( quarters(dat$V1, format='%m-%d-%Y') )
# Date[1:6], format: "1970-04-01" "1970-07-01" "1970-10-01" "1971-01-01" ...
so replacing the column with the actual Date object is simply dat$V1 <- quarters(dat$V1, format='%m-%d-%Y').
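To finish the pipeline from the question, note that the GDP column still holds strings like "1.438.810", with "." as a thousands separator, so the dots have to be stripped before the values can go into ts() or be written out. A sketch, assuming dat$V1 still holds the raw "Q2-1970" strings and reusing the frequency = 4 setup from the question (the start quarter is read off the dput, which begins at Q2-1970; the output file name is just an example):
# "1.438.810" uses "." as a thousands separator, so drop the dots before converting
gdp.values <- as.numeric(gsub(".", "", dat$V2, fixed = TRUE))
# quarterly time series starting in 1970 Q2 (first row of the dput above)
gdp.ger <- ts(gdp.values, start = c(1970, 2), frequency = 4)
# or pair ISO dates with the values, in the "1990-01-01, 234590" style asked for
out <- data.frame(date = quarters(dat$V1, format = "%m-%d-%Y"), gdp = gdp.values)
write.csv(out, "germany_gdp_clean.csv", row.names = FALSE)  # example output name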
I want to read a text file into R, but I have a problem: the first column mixes the row labels with the first column of numbers.
Data text file
revenues 4118000000.0, 4315000000.0, 4512000000.0, 4709000000.0, 4906000000.0, 5103000000.0
cost_of_revenue-1595852945.4985902, -1651829192.2662954, -1705945706.6237037, -1758202488.5708148, -1808599538.1076286, -1857136855.234145
gross_profit 2522147054.5014095, 2663170807.7337046, 2806054293.376296, 2950797511.429185, 3097400461.892371, 3245863144.765855
R Code:
data.predicted_values = read.table("predicted_values.txt", sep=",")
Output:
V1 V2 V3 V4 V5 V6
1 revenues 4118000000.0 4315000000 4512000000 4709000000 4906000000 5103000000
2 cost_of_revenue-1595852945.4985902 -1651829192 -1705945707 -1758202489 -1808599538 -1857136855
3 gross_profit 2522147054.5014095 2663170808 2806054293 2950797511 3097400462 3245863145
How can I split the first column into two parts? I mean, I want the first column V1 to be revenues, cost_of_revenue, gross_profit; V2 to be 4118000000.0, -1595852945.4985902, 2522147054.5014095; and so on and so forth.
This is along the same lines of thinking as #DWin's, but accounts for the negative values in the second row.
# collapse to a single string so the gregexpr/regmatches indexing below sees one text
TEXT <- paste(readLines("predicted_values.txt"), collapse = "\n")
A <- gregexpr("[A-Za-z_]+", TEXT)
B <- read.table(text = regmatches(TEXT, A, invert = TRUE)[[1]], sep = ",")
C <- cbind(FirstCol = regmatches(TEXT, A)[[1]], B)
C
# FirstCol V1 V2 V3 V4 V5 V6
# 1 revenues 4118000000 4315000000 4512000000 4709000000 4906000000 5103000000
# 2 cost_of_revenue -1595852945 -1651829192 -1705945707 -1758202489 -1808599538 -1857136855
# 3 gross_profit 2522147055 2663170808 2806054293 2950797511 3097400462 3245863145
Since you have no commas between the rownames and the values, you need to add them back in:
txt <- "revenues 4118000000.0, 4315000000.0, 4512000000.0, 4709000000.0, 4906000000.0, 5103000000.0
cost_of_revenue-1595852945.4985902, -1651829192.2662954, -1705945706.6237037, -1758202488.5708148, -1808599538.1076286, -1857136855.234145
gross_profit 2522147054.5014095, 2663170807.7337046, 2806054293.376296, 2950797511.429185, 3097400461.892371, 3245863144.765855"
Lines <- readLines( textConnection(txt) )
# replace textConnection(.) with `file = "predicted_values.txt"`
res <- read.csv( text=sub( "(^[[:alpha:][:punct:]]+)(\\s|-)" ,
"\\1,", Lines) ,
header=FALSE, row.names=1 )
res
The decimal fractions may not print but they are there.
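If you want to see them, raise the print precision (my addition, just a quick check):
print(res, digits = 15)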
You want the row.names argument of read.table. Then you can simply transpose your data:
data.predicted_values = read.table("predicted_values.txt", sep=",", row.names=1)
data.predicted_values <- t(data.predicted_values)
Very simple question. I am using an excel sheet that has two rows for the column headings; how can I convert these two row headings into one? Further, these headings don't start at the top of the sheet.
Thus, I have DF1
Temp Press Reagent Yield A Conversion etc
degC bar /g % %
1 2 3 4 5
6 7 8 9 10
and I want,
Temp degC Press bar Reagent /g Yield A % Conversion etc
1 2 3 4 5
6 7 8 9 10
Using colnames(DF1) returns the upper names, but getting the second line to merge with the upper one keeps eluding me.
Using your data, modified to quote text fields that contain the separator (get whatever tool you used to generate the file to quote text fields for you!)
txt <- "Temp Press Reagent 'Yield A' 'Conversion etc'
degC bar /g % %
1 2 3 4 5
6 7 8 9 10
"
this snippet of code below reads the file in two steps
First we read the data; skip = 2 means skip the first 2 lines.
Next we read the file again, but only the first two lines; this output is then further processed by sapply(), where we paste(x, collapse = " ") the strings in the columns of the labs data frame. These are assigned to the names of dat.
Here is the code:
dat <- read.table(text = txt, skip = 2)
labs <- read.table(text = txt, nrows = 2, stringsAsFactors = FALSE)
names(dat) <- sapply(labs, paste, collapse = " ")
dat
names(dat)
The code, when run, produces:
> dat <- read.table(text = txt, skip = 2)
> labs <- read.table(text = txt, nrows = 2, stringsAsFactors = FALSE)
> names(dat) <- sapply(labs, paste, collapse = " ")
>
> dat
Temp degC Press bar Reagent /g Yield A % Conversion etc %
1 1 2 3 4 5
2 6 7 8 9 10
> names(dat)
[1] "Temp degC" "Press bar" "Reagent /g"
[4] "Yield A %" "Conversion etc %"
In your case, you'll want to modify the read.table() calls to point at the file on your file system, so use file = "foo.txt" in place of text = txt in the code chunk, where "foo.txt" is the name of your file.
Also, if these headings don't start at the top of the file, then increase skip to 2+n where n is the number of lines before the two header rows. You'll also need to add skip = n to the second read.table() call which generates labs, where n is again the number of lines before the header lines.
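As a concrete sketch of that skip arithmetic (the file name foo.txt comes from above; n = 3 junk lines before the headers is just a made-up example):
n <- 3  # hypothetical number of lines before the two header rows
dat  <- read.table(file = "foo.txt", skip = 2 + n)
labs <- read.table(file = "foo.txt", skip = n, nrows = 2, stringsAsFactors = FALSE)
names(dat) <- sapply(labs, paste, collapse = " ")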
This should work. You only need to set stringsAsFactors = FALSE when reading the data.
data <- structure(list(Temp = c("degC", "1", "6"), Press = c("bar", "2",
"7"), Reagent = c("/g", "3", "8"), Yield.A = c("%", "4", "9"),
Conversion = c("%", "5", "10")), .Names = c("Temp", "Press",
"Reagent", "Yield.A", "Conversion"), class = "data.frame", row.names = c(NA,
-3L)) # Your data
colnames(data) <- paste(colnames(data), data[1,]) # Set new names
data <- data[-1,] # Remove first line
data <- data.frame(apply(data, 2, as.numeric)) # Correct the classes (works only if all columns are numbers)
Just load your file with read.table(file, header = FALSE, stringsAsFactors = FALSE). Then you can grep to find the position where the header rows appear.
df <- data.frame(V1=c(sample(10), "Temp", "degC"),
V2=c(sample(10), "Press", "bar"),
V3 = c(sample(10), "Reagent", "/g"),
V4 = c(sample(10), "Yield_A", "%"),
V5 = c(sample(10), "Conversion", "%"),
stringsAsFactors=F)
idx <- unique(c(grep("Temp", df$V1), grep("degC", df$V1)))
df2 <- df[-(idx), ]
names(df2) <- sapply(df[idx, ], function(x) paste(x, collapse=" "))
Here, if you want, you can then convert all the columns to numeric as follows:
df2 <- as.data.frame(sapply(df2, as.numeric))