Calculating fraction, which is in column as string

Calculating fraction, which is in column as string - r

I have a data.frame like this
z <- structure(list(ID = c("R-HSA-977606", "R-HSA-977443", "R-HSA-166658",
"R-HSA-166663", "R-HSA-1236394", "R-HSA-390522", "R-HSA-3232118",
"R-HSA-1630316", "R-HSA-112315", "R-HSA-112314"), GeneRatio = c("6/189",
"6/189", "6/189", "4/189", "5/189", "4/189", "3/189", "7/189",
"11/189", "9/189")), row.names = c("R-HSA-977606", "R-HSA-977443",
"R-HSA-166658", "R-HSA-166663", "R-HSA-1236394", "R-HSA-390522",
"R-HSA-3232118", "R-HSA-1630316", "R-HSA-112315", "R-HSA-112314"
), class = "data.frame")
Is it possible to add a 3rd column with the ratio from the 2nd column calculated? i.e. 6/189=0.0317. So in the third column I should have 0.0317.

As it is a string expression, we can use eval/parse
# Evaluate each "a/b" string as an R expression and store the numeric result.
# NOTE: eval(parse()) runs whatever code the strings contain, so use it only
# on trusted input. vapply() enforces exactly one numeric value per element,
# whereas sapply() silently changes its return type on empty input.
z$newColumn <- vapply(
  z$GeneRatio,
  function(x) eval(parse(text = x)),
  numeric(1)
)
-output
> z
ID GeneRatio newColumn
R-HSA-977606 R-HSA-977606 6/189 0.03174603
R-HSA-977443 R-HSA-977443 6/189 0.03174603
R-HSA-166658 R-HSA-166658 6/189 0.03174603
R-HSA-166663 R-HSA-166663 4/189 0.02116402
R-HSA-1236394 R-HSA-1236394 5/189 0.02645503
R-HSA-390522 R-HSA-390522 4/189 0.02116402
R-HSA-3232118 R-HSA-3232118 3/189 0.01587302
R-HSA-1630316 R-HSA-1630316 7/189 0.03703704
R-HSA-112315 R-HSA-112315 11/189 0.05820106
R-HSA-112314 R-HSA-112314 9/189 0.04761905
Or a faster option would be to split by / (or use read.table to create two columns) and then divide, assuming the expression includes only division:
# read.table() splits every "a/b" string on "/" into separate numeric columns;
# Reduce(`/`, ...) then divides the columns left to right, element-wise
# (first column / second column for two-part ratios).
z$newColumn <- Reduce(`/`, read.table(text = z$GeneRatio,
header = FALSE, sep = "/"))

This code could be refined but it will work with the eval function
# 1- Creating empty numeric column, one NA per row (also valid for 0 rows)
z$GeneRatioNum <- rep(NA_real_, nrow(z))
# 2- Filling it with eval function
# seq_len() yields an empty sequence for a 0-row data frame, whereas
# 1:nrow(z) would iterate over c(1, 0).
# NOTE: eval(parse()) executes the string as code; use on trusted input only.
for (i in seq_len(nrow(z))) {
  z$GeneRatioNum[i] <- eval(parse(text = z$GeneRatio[i]))
}

Related

Sort dataframe by row index value without changing values

I have a dataframe that I have to sort in decreasing order of absolute row value without changing the actual values (some of which are negative).
To give you an example, e.g. for the 1st row, I would like to go from
-0.01189179 0.03687456 -0.12202753 to
-0.12202753 0.03687456 -0.01189179.
For the 2nd row from
-0.04220260 0.04129326 -0.07178175 to
-0.07178175 -0.04220260 0.04129326 etc.
How can I do this in R?
Many thanks!

Try this
# For every column, compute the index permutation that orders its values by
# decreasing absolute value.
lst <- lapply(df, function(column) order(-abs(column)))
# Reorder each column with its own permutation and rebuild the data frame.
ans <- data.frame(Map(function(column, idx) column[idx], df, lst))
output
a b
1 -0.01189179 -0.07178175
2 0.03687456 -0.04220260
3 -0.12202753 0.04129326
data
df <- structure(list(a = c(-0.12202753, 0.03687456, -0.01189179), b = c(-0.0422026,
0.04129326, -0.07178175)), row.names = c(NA, -3L), class = "data.frame")

Here is a simple approach (using @Mohamed Desouky's data):
# Reverse the row order. rev(seq_len(n)) is empty when the data frame has no
# rows (nrow(df):1 would give c(0, 1) and create an NA row), and drop = FALSE
# keeps a one-column data frame from collapsing to a vector.
df <- df[rev(seq_len(nrow(df))), , drop = FALSE]
> df
a b
3 -0.01189179 -0.07178175
2 0.03687456 0.04129326
1 -0.12202753 -0.04220260

Trying to convert Ensembl ID to gene name in R (biomaRt)

I have a large dataset of gene expression data and I'm trying to convert the gene identifiers into gene names using biomaRt in RStudio, but for some reason when I use the merge function on my data frames, my entire data table is merged wrong/erased. I've looked at the previous questions here, but no matter what I try, my code doesn't seem to work properly. Thank you infinitely!
library(biomaRt)
# Join `res` with the normalized counts from `dds`, matching on row names and
# keeping the input row order (sort = FALSE).
resdata <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)), by="row.names", sort=FALSE)
# The row-name key comes back as the first column; rename it to "genes".
names(resdata)[1] <- "genes"
head(resdata)
## Keep only rows without any NA (the table is written to CSV further down)
resdata <- resdata[complete.cases(resdata), ]
dim(resdata)
The problems start here:
# Convert gene accession numbers (Ensembl IDs) to gene names
charg <- resdata$genes
head(charg)
# Split each ID on the literal "." and keep the first piece, i.e. drop the
# version suffix ("ENSG00000261150.2" -> "ENSG00000261150").
charg2 = sapply(strsplit(charg, '.', fixed=T), function(x) x[1])
ensembl = useMart("ensembl",dataset="hsapiens_gene_ensembl")
# Query the mart with the stripped IDs as 'ensembl_gene_id' filter values.
# NOTE(review): only 'hgnc_symbol' is requested, so the result has no Ensembl
# ID column to join back on -- confirm whether 'ensembl_gene_id' should also
# be listed in `attributes`.
theBM = getBM(attributes='hgnc_symbol',
filters = 'ensembl_gene_id',
values = charg2,
mart = ensembl)
# NOTE(review): this joins resdata$genes (versioned Ensembl IDs) against HGNC
# symbols, so the keys look unlikely to match -- confirm the intended join
# columns.
resdata <- merge.data.frame(resdata, theBM, by.x="genes",by.y="hgnc_symbol")
# a <- c(resdata[3])
# counts_resdata <-counts[resdata$ensembl_gene_id,]
# row.names(counts_resdata) <- resdata[,"V1"]
# cal_z_score <- function(x){
# (x - mean(x)) / sd(x)
# }
write.csv(resdata, file="diffexprresultsHEK.csv")
dev.off()
> dput(head(resdata))
structure(list(genes = structure(c("ENSG00000261150.2", "ENSG00000164877.18",
"ENSG00000120334.15", "ENSG00000100906.10", "ENSG00000182759.3",
"ENSG00000124145.6"), class = "AsIs"), baseMean = c(4093.85581350533,
2362.58393155573, 3727.90538524843, 6269.83601940967, 1514.2066991352,
4802.56186913745), log2FoldChange = c(-7.91660950515258, -5.26346217291626,
3.32325541003148, 2.95482654632078, -5.67082078657074, 2.79396304109662
), lfcSE = c(0.192088463317979, 0.149333035266368, 0.105355230912976,
0.097569264524605, 0.194208068005162, 0.0965853229316347), stat = c(-41.2133522670104,
-35.2464688307429, 31.5433356391815, 30.2843990955331, -29.1997178326289,
28.9274079776516), pvalue = c(0, 3.88608699685236e-272, 2.21307385030673e-218,
1.83983881587879e-201, 1.95527687476496e-187, 5.40010609376884e-184
), padj = c(0, 3.9601169541424e-268, 1.50348860477005e-214, 9.3744387266064e-198,
7.97009959691694e-184, 1.83432603828505e-180), `HEK-FUS1-1.counts` = c(8260.9703617894,
5075.51515177084, 665.085490083024, 1513.61286043731, 3440.18729968435,
1262.3583419615), `HEK-FUS1-2.counts` = c(8046.96326903085, 4134.79795973702,
690.697680591815, 1346.52518701783, 2499.92325557892, 1154.73922910593
), `HEK-H149A-1.counts` = c(34.3284200812733, 113.825813953696,
6450.12945737609, 10806.2252897945, 60.5264248801398, 8302.96076228903
), `HEK-H149A-2.counts` = c(33.1612031197744, 126.196800761364,
7105.70891294277, 11412.980740389, 56.1898163973955, 8490.18914319335
)), row.names = c(NA, 6L), class = "data.frame")
Here's some output (where I'm struggling):
> head(charg)
[1] "ENSG00000261150.2" "ENSG00000164877.18" "ENSG00000120334.15"
[4] "ENSG00000100906.10" "ENSG00000182759.3" "ENSG00000124145.6"
> dim(theBM)
[1] 0 1
> head(theBM)
[1] ensembl_gene_id
<0 rows> (or 0-length row.names)
> dim(resdata)
[1] 20381 11
> resdata <- merge.data.frame(resdata, theBM, by.x="genes",by.y="ensembl_gene_id")
> dim(resdata) #after merge
[1] 0 11 #isn't correct -- just row names! where'd my genes go?
Edit: Problems solved! Turns out I was referencing getBM wrong. Thank you all!

If you want to just overwrite the Ensembl IDs with the HGNC IDs you can do it in one step:
library(biomaRt)
names(resdata)[1] <- "genes"
head(resdata)
## Keep only complete rows
resdata <- resdata[complete.cases(resdata), ]
dim(resdata)
charg <- resdata$genes
head(charg)
# Drop the version suffix ("ENSG00000261150.2" -> "ENSG00000261150") so the
# IDs can be used with the 'ensembl_gene_id' filter. vapply() guarantees a
# character vector with one entry per gene.
charg2 <- vapply(
  strsplit(as.character(charg), ".", fixed = TRUE),
  function(x) x[1],
  character(1)
)
ensembl <- useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")
# Request the Ensembl ID together with the symbol. getBM() returns rows in its
# own order and omits IDs it cannot find, so its result must be aligned by key
# rather than assigned positionally to resdata.
theBM <- getBM(
  attributes = c("ensembl_gene_id", "hgnc_symbol"),
  filters = "ensembl_gene_id",
  values = charg2,
  mart = ensembl
)
# Overwrite column 1 with the matching symbol for each row; genes without a
# match become NA. match() takes the first hit if an ID maps to several rows.
resdata[[1]] <- theBM$hgnc_symbol[match(charg2, theBM$ensembl_gene_id)]
resdata
(This keeps Log2FC as column 3, which looks right based on the next steps in your pipeline, but if you want something different let me know and I'll update my answer to suit)

Using tidyverse, how can you generate new columns dynamically?

Say I have a dataframe of tens of columns, and my custom function needs each one of these columns plus a number in a vector to give me the desired output. After that, I need to generate new column names based on the original column names in the dataframe. How can I accomplish this using the tidyverse, instead of for loops or other solutions in base R?
MWE
structure(list(col1 = c(36.0520583373645, 37.9423749063706, 33.6806634587719,
34.031649012457, 29.5448679963449, NA, 34.7576769718877, 30.484217745574,
32.9849083643022, 27.4081694831058, 35.8624919654559, 35.0284347997991,
NA, 32.112605893241, 27.819354948082, 35.6499532124921, 35.0265642403216,
32.4006569441297, 30.3698557864842, 31.8229364456928, 34.3715903109276
), col2 = c(32.9691195198199, 35.6643664156284, 33.8748732989736,
34.5436311813644, 33.2228201914256, 38.7621696867191, 34.8399804318992,
32.9063078995457, 35.7391166214367, 32.7217251282669, 36.3039268989853,
35.9607654868559, 33.1385915196435, 34.7987649028199, 33.7100463668523,
34.7773403671057, 35.8592997980752, 33.8537127786535, 31.9106243803505,
39.3099469314882, 35.1849826815196), col3 = c(33.272278716963,
NA, 31.8594920410129, 33.1695042551974, 29.3800694974438, 35.1504378875245,
34.0771487001433, 29.0162879030415, 30.6960024888799, 29.5542117965184,
34.3726321365982, 36.0602274148362, 33.1207772548047, 31.5506876209822,
28.8649303491974, 33.4598790144265, 30.5573454464747, 31.6026723913051,
30.4716061556625, 33.009463000301, 30.846230953425)), row.names = c(NA,
-21L), class = "data.frame")
Save the above in a file, and then use example <- dget(file.choose()) to read it back in as a dataframe.
Code
y <- c (2, 1, 1.5)
#' Scale the natural logarithm of a vector
#'
#' @param x Numeric vector (e.g. one data-frame column); `NA`s propagate.
#' @param y Numeric multiplier applied to `log(x)`.
#' @return `log(x) * y`; for a scalar `y` this has the same length as `x`.
customfun <- function(x, y) {
  # Return the value itself instead of print()-ing it: a transformation helper
  # called from mutate()/map2() should not write to the console.
  log(x) * y
}
# Add one log-scaled column per input column in a single mutate() call,
# pairing column k with multiplier y[k]. Expressions are evaluated in order.
df <- dplyr::mutate(
  example,
  col1.log = customfun(col1, y = y[1]),
  col2.log = customfun(col2, y = y[2]),
  col3.log = customfun(col3, y = y[3])
)
Question
Imagine I have tens of these columns not only 3 as in the MWE, how to generate the new ones dynamically using the tidyverse?

We can use map2 and bind_cols to add new columns
library(dplyr)
library(purrr)
# map2_df() calls customfun on each column of `example` paired with the
# matching element of `y`; the resulting columns get a ".log" suffix and are
# bound onto the original columns.
bind_cols(example, map2_df(example, y, customfun) %>%
rename_all(~paste0(., ".log")))
# col1 col2 col3 col1.log col2.log col3.log
#1 36.05206 32.96912 33.27228 7.169928 3.495571 5.257087
#2 37.94237 35.66437 NA 7.272137 3.574152 NA
#3 33.68066 33.87487 31.85949 7.033848 3.522674 5.192003
#4 34.03165 34.54363 33.16950 7.054582 3.542223 5.252446
#...

The tidyverse is not great for these sweep()-like operations; however, one option could be:
# sweep() with MARGIN = 2 lines up y[k] with column k and passes both to
# customfun; rename_all() then appends ".log" to every column name.
# NOTE(review): do() and rename_all() are superseded in current dplyr --
# confirm against the dplyr version in use.
example %>%
do(., sweep(., 2, FUN = customfun, y)) %>%
rename_all(~ paste(., "log", sep = "."))
col1.log col2.log col3.log
1 7.169928 3.495571 5.257087
2 7.272137 3.574152 NA
3 7.033848 3.522674 5.192003
4 7.054582 3.542223 5.252446
5 6.771820 3.503237 5.070475
6 NA 3.657445 5.339456
7 7.096801 3.550766 5.292941
8 6.834418 3.493664 5.051786
9 6.992100 3.576246 5.136199
10 6.621682 3.488039 5.079339

How to extract part of cell value across columns?

I have a data frame like this:
df1<-structure(list(q006_1 = c("1098686880", "18493806","9892464","96193586",
"37723803","13925456","37713534","1085246853"),
q006_2 = c("1098160170","89009521","9726314","28076230","63451251",
"1090421499","37124019"),
q006_3 = c("52118967","41915062","1088245358","79277706","91478662",
"80048634")),
class=data.frame, row.names = c(NA, -8L)))
I know how to extract the last five digits of each number for one column using substr in data.table but I want to do it across all columns.
n_last <- 5
df1[, `q006_1`:= substr(q006_1, nchar(q006_1) - n_last + 1, nchar(q006_1))]
How can I do this for all columns?

In data.table it can be done like below: (Your sample data was incomplete as the first column had 8, second column had 7 and the third had 6 entries.)
library(data.table)
#or `cols <- names(df1)` if you want to apply it on all columns and this is not just an example
cols <- c("q006_1", "q006_2", "q006_3")
# Convert df1 to a data.table by reference and overwrite the selected columns.
# The regex removes everything before the final five characters (lookahead
# `(?=.{5}$)` needs perl = TRUE); shorter strings are left unchanged.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
setDT(df1)[, (cols) := lapply(.SD, function(x) {
  sub('.*(?=.{5}$)', '', x, perl = TRUE)
}), .SDcols = cols][]
# q006_1 q006_2 q006_3
# 1: 86880 60170 18967
# 2: 93806 09521 15062
# 3: 92464 26314 45358
# 4: 93586 76230 77706
# 5: 23803 51251 78662
# 6: 25456 21499 48634
# 7: 13534 24019 76230
# 8: 46853 76230 76230
Data:
df1<-structure(list(q006_1 = c("1098686880", "18493806","9892464","96193586",
"37723803","13925456","37713534","1085246853"),
q006_2 = c("1098160170","89009521","9726314","28076230",
"63451251","1090421499","37124019","28076230"),
q006_3 = c("52118967","41915062","1088245358","79277706",
"91478662","80048634","28076230","28076230")),
class = c("data.frame"), row.names = c(NA, -8L))

Reformatting downloaded Excel data

I have downloaded some GDP data in .xls-format from the OECD website. However, to make this data workable in R, I need to reformat the data to a .csv file. More specifically, I need the year, day and month in the first column, and after the comma I need the GDP values (for example: 1990-01-01, 234590).
The column with GDP values can be easily copied and transposed, but how does one quickly add dates? Is there a fast way to do this, without having to add in the dates manually?
Thanks for the help!
Best,
Sean
PS. Link to (one of) the specific OECD files: https://ufile.io/8ogav or https://stats.oecd.org/index.aspx?queryid=350#
PPS. I have now changed the file to this,
which I would like to transform into the same style as example 1.
Codes that I use for reading in data:
gdp.start <- c(1970,1) # type "double"
gdp.end <- c(2018,1)
gdp.raw <- "rawData/germany_gdp.csv"
gdp.table <- read.table(gdp.raw, skip = 1, header = F, sep = ',', stringsAsFactors = F)
gdp.ger <- ts(gdp.table[,2], start = gdp.start, frequency = 4) # time-series representation
PPPS.
dput(head(gdp.table))
structure(list(V1 = c("Q2-1970;1.438.810 ", "Q3-1970;1.465.684 ",
"Q4-1970;1.478.108 ", "Q1-1971;1.449.712 ", "Q2-1971;1.480.136 ",
"Q3-1971;1.505.743 ")), row.names = c(NA, 6L), class = "data.frame")

Using your data:
z <- structure(list(V1 = c("Q2-1970;1.438.810 ", "Q3-1970;1.465.684 ",
"Q4-1970;1.478.108 ", "Q1-1971;1.449.712 ", "Q2-1971;1.480.136 ",
"Q3-1971;1.505.743 ")), row.names = c(NA, 6L), class = "data.frame")
dat <- read.csv2(text=paste(z$V1, collapse='\n'), stringsAsFactors=FALSE, header=FALSE)
dat
# V1 V2
# 1 Q2-1970 1.438.810
# 2 Q3-1970 1.465.684
# 3 Q4-1970 1.478.108
# 4 Q1-1971 1.449.712
# 5 Q2-1971 1.480.136
# 6 Q3-1971 1.505.743
and a simple function to replace quarters with the first date of each quarter
#' Replace quarter labels with the first date of each quarter
#'
#' The first occurrence of "Q1".."Q4" in each string is replaced by the
#' month-day of that quarter's first day ("01-01", "04-01", "07-01", "10-01").
#' NOTE: defining this masks base::quarters() in the defining environment.
#'
#' @param s Character vector such as "Q2-1970".
#' @param format Optional `as.Date()` format (e.g. "%m-%d-%Y"). When supplied,
#'   the substituted strings are converted to `Date`.
#' @return The substituted character vector, or a `Date` vector if `format`
#'   is given.
quarters <- function(s, format) {
  first_day <- c(Q1 = "01-01", Q2 = "04-01", Q3 = "07-01", Q4 = "10-01")
  for (label in names(first_day)) {
    s <- sub(label, first_day[[label]], s)
  }
  if (missing(format)) {
    return(s)
  }
  as.Date(s, format = format)
}
We can change them into strings of dates, preserving the order:
str(quarters(dat$V1))
# chr [1:6] "04-01-1970" "07-01-1970" "10-01-1970" "01-01-1971" ...
or we can convert into Date objects by setting the format:
str( quarters(dat$V1, format='%m-%d-%Y') )
# Date[1:6], format: "1970-04-01" "1970-07-01" "1970-10-01" "1971-01-01" ...
so replacing the column with the actual Date object is simply dat$V1 <- quarters(dat$V1, format='%m-%d-%Y').