Merging data frames by selecting for correct value - r

I have a data frame called "ref" that contains information for mapping a gene's entrez ID to the gene's start and end positions. I have another data frame, "ori_data", where each row is a unique mutation from a sample and gives a genomic position. I am trying to map each position in "ori_data" to the information in "ref" in order to assign an entrez ID to each mutation. I have tried a for loop that matches on chromosome and then selects the positions in "ori_data" that fall between the coordinates in "ref", but I have not been successful. The "ori_data" dataset is over 1 million rows, so I'm not sure a for loop is an efficient solution anyway. Note that many positions will map to the same entrez ID in my real dataset. "final" below is what I want to end up with: it simply adds an EntrezID column according to chromosome/position. TYIA!
ref = data.frame("EntrezID" = c(1, 10, 100, 1000), "Chromosome" = c("19", "8", "20", "18"), "txStarts" = c("58345182", "18391281", "44619518", "27950965"), "txEnds" = c("58353492", "18401215", "44651758", "28177130"))
ori_data = data.frame("Chromosome" = c("19", "8", "20", "18"), "Pos" = c("58345186", "18401213", "44619519", "27950966"),
"Sample" = c("HCC1", "HCC2", "HCC1", "HCC3"))
final = data.frame("Chromosome" = c("19", "8", "20", "18"), "Pos" = c("58345186", "18401213", "44619519", "27950966"),
"Sample" = c("HCC1", "HCC2", "HCC1", "HCC3"), "EntrezID" = c(1,10,100,1000))
I have tried this line of code and I'm unsure as to why it does not work.
for (i in 1:dim(ori_data)[1]) {
  for (j in 1:dim(ref)[1]) {
    ID = which(ori_data[i, "Chromosome"] == ref[j, "Chromosome"])
    if (length(ID) > 0) {
      Pos = ori_data[ID, "POS"]
      IDj = which(Pos >= ref[j, "txStarts"] & Pos <= ref[j, "txEnds"])
      print(IDj)
      if (length(IDj) > 0) {
        ori_data = cbind("Entrez" = ref[IDj, "EntrezID"], ori_data)
      }
    }
  }
}

In base R, apply() could be used to find, per row, the matching Chromosome and test whether Pos falls in the range txStarts to txEnds.
ori_data$EntrezID <- apply(ori_data[c("Chromosome", "Pos")], 1, \(x)
ref$EntrezID[ref$Chromosome == x["Chromosome"] &
x["Pos"] >= ref$txStarts & x["Pos"] <= ref$txEnds][1])
ori_data
# Chromosome Pos Sample EntrezID
#1 19 58345186 HCC1 1
#2 8 18401213 HCC2 10
#3 20 44619519 HCC1 100
#4 18 27950966 HCC3 1000
A version which could be faster:
lup <- list2env(split(ref[c("EntrezID", "txStarts", "txEnds")], ref$Chromosome))
ori_data$EntrezID <- Map(\(x, y) {
  . <- get(x, envir = lup)
  .$EntrezID[y >= .$txStarts & y <= .$txEnds][1]
}, ori_data$Chromosome, ori_data$Pos)
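Since Map() returns a list, EntrezID becomes a list column here; if a plain numeric vector is preferred it can be flattened (a small follow-up, not part of the original answer):
ori_data$EntrezID <- unlist(ori_data$EntrezID, use.names = FALSE)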
Or another way, but one that does not keep the original row order. (If the original order is important, have a look at unsplit; see the sketch after the code below.)
# Assuming you have many rows with the same Chromosome
x <- split(ori_data, ori_data$Chromosome)
# Assuming ref also has many rows with the same Chromosome
lup <- split(ref[c("EntrezID", "txStarts", "txEnds")], ref$Chromosome)
# Now sort this by the names of x - try which method is faster
# Method 1:
lup <- lup[names(x)]
# Method 2:
lup <- mget(names(x), list2env(lup))
res <- do.call(rbind, Map(\(a, b) {
  cbind(a, b[1][a$Pos >= b[[2]] & a$Pos <= b[[3]]][1])
}, x, lup))
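If the original row order matters, unsplit() can reassemble the per-chromosome pieces in the original order. A minimal sketch, reusing x and lup from above and assuming the position columns have been converted to numeric:
pieces <- Map(\(a, b) {
  # look up the first matching interval in ref for every position in this chromosome
  a$EntrezID <- vapply(a$Pos, \(p)
    b$EntrezID[p >= b$txStarts & p <= b$txEnds][1], numeric(1))
  a
}, x, lup)
ori_data2 <- unsplit(pieces, ori_data$Chromosome)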

One option would be to use sqldf, which should also be efficient for a large data frame (note that the OP's ori_data is referred to as dna in this and the following answer).
library(tibble)
library(sqldf)
as_tibble(sqldf("select dna.*, ref.EntrezID from dna
join ref on dna.Pos > ref.'txStarts' and
dna.Pos < ref.'txEnds'"))
Another option using fuzzy_join:
library(dplyr)
library(fuzzyjoin)
dna %>%
  fuzzy_join(ref %>% select(-Chromosome), by = c("Pos" = "txStarts", "Pos" = "txEnds"),
             match_fun = list(`>`, `<`)) %>%
  select(names(dna), EntrezID)
Output
Chromosome Pos Sample EntrezID
1 19 58345186 HCC1 1
2 8 18401213 HCC2 10
3 20 44619519 HCC1 100
4 18 27950966 HCC3 1000

If 'Pos', 'txStarts', and 'txEnds' are numeric, then we can use a non-equi join
library(data.table)
setDT(dna)[ref, EntrezID := i.EntrezID,
           on = .(Chromosome, Pos > txStarts, Pos < txEnds)]
-output
> dna
Chromosome Pos Sample EntrezID
<char> <num> <char> <num>
1: 19 58345186 HCC1 1
2: 8 18401213 HCC2 10
3: 20 44619519 HCC1 100
4: 18 27950966 HCC3 1000
data
dna <- type.convert(dna, as.is = TRUE)
ref <- type.convert(ref, as.is = TRUE)

Related

How to find Mann-Whitney p value across all genes using R?

I need to perform the Mann-Whitney test across all genes using R. I need to input a text file where the first row contains samples, the second row contains cohort variables (1 or 2), and all other rows contain the gene expressions. This needs to be done with a function.
The output at the end should be a table with the results: genes in the rows, and columns for the mean expression of cohort 1, the mean expression of cohort 2, FC and the Mann-Whitney p value.
This is what I tried using demo data, but it doesn't seem to be working. I only get G4 as the gene in the rows and "NaN" without any values for the columns mean expression of cohort 1, mean expression of cohort 2, FC and Mann-Whitney p value.
data <- read.table(text = "
Cohort Gene S1 S2 S3 S4 S5
1 G1 1389 1097 1501 4630 2011
2 G2 1023 880 492 4411 1233
1 G3 2847 2717 2814 4145 5433
2 G4 20612 18123 17679 4099 8567
", header = TRUE)
# separate cohort 1 and 2
cohort1 <- data[data$Cohort != "2", 1]
# head(cohort1)
cohort2 <- data[data$Cohort != "1", 1]
geneNames <- data$Gene
row.names(data) <- data$Gene
df <- data.frame()
for (Gene in 1:length(geneNames)) {
  if (sum(cohort1) | sum(cohort2) > 0) {
    mwt <- wilcox.test(x = cohort1, y = cohort2, paired = T, exact = F, conf.int = F)
  } else if (sum(cohort1) | sum(cohort2) == 0) {
    mwt <- data.frame("p.value" = NA, "conf.int" = NA)
  }
  table <- data.frame("Gene" = geneNames[Gene], "Mean_Cohort1" = mean(cohort1),
                      "Mean_Cohort2" = mean(cohort2), "FC" = mean(cohort1) / mean(cohort2),
                      "MW_Pvalue" = mwt$p.value)
  output <- rbind(df, table)
}
Can anybody help me out with this?
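Not a full answer, but a minimal sketch of one way to build such a table, assuming (as the prose description says) that each sample belongs to a cohort. The cohort vector below is a hypothetical assignment purely for illustration, since the demo data above attaches Cohort to genes rather than to samples:
# Hypothetical layout: expression matrix with genes in rows, samples in columns,
# plus one cohort label per sample (assumed values, not from the question).
expr <- as.matrix(data[, paste0("S", 1:5)])
rownames(expr) <- data$Gene
cohort <- c(1, 1, 2, 2, 2)  # assumed cohort of S1..S5

res <- do.call(rbind, lapply(rownames(expr), function(g) {
  x <- expr[g, cohort == 1]
  y <- expr[g, cohort == 2]
  data.frame(Gene = g,
             Mean_Cohort1 = mean(x),
             Mean_Cohort2 = mean(y),
             FC = mean(x) / mean(y),
             MW_Pvalue = wilcox.test(x, y, exact = FALSE)$p.value)
}))
res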

Replacing only NA values in xts object column wise using specific formula

I want to replace the NA values in my xts object using the formula Beta * Exposure * Index return.
Suppose my xts object is Position_SimPnl, created below:
library(xts)
df1 <- data.frame(Google = c(NA, NA, NA, NA, 500, 600, 700, 800),
                  Apple = c(10, 20, 30, 40, 50, 60, 70, 80),
                  Audi = c(1, 2, 3, 4, 5, 6, 7, 8),
                  BMW = c(NA, NA, NA, NA, NA, 6000, 7000, 8000),
                  AENA = c(50, 51, 52, 53, 54, 55, 56, 57))
Position_SimPnl <- xts(df1, order.by = Sys.Date() - 1:8)
For Beta there is a specific dataframe:
Beta_table <- data.frame (AENA = c(0.3,0.5,0.6), Apple = c(0.2,0.5,0.8), Google = c(0.1,0.3,0.5), Audi = c(0.4,0.6,0.7), AXP = c(0.5,0.7, 0.9), BMW = c(0.3,0.4, 0.5))
rownames(Beta_table) <- c(".SPX", ".FTSE", ".STOXX")
For exposure there is another dataframe:
Base <- data.frame (RIC = c("AENA","BMW","Apple","Audi","Google"), Exposure = c(100,200,300,400,500))
For Index return there is a xts object (Index_FX_Returns):
df2 <- data.frame(.SPX = c(0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08),
                  .FTSE = c(0.5, 0.4, 0.3, 0.2, 0.3, 0.4, 0.3, 0.4),
                  .STOXX = c(0.15, 0.25, 0.35, 0.3, 0.45, 0.55, 0.65, 0.5))
Index_FX_Returns <- xts(df2,order.by = Sys.Date() - 1:8)
Also there is a dataframe which links RIC with Index:
RIC_Curr_Ind <- data.frame(RIC = c("AENA", "Apple", "Google", "Audi", "BMW"), Currency = c("EUR.","USD.","USD.","EUR.","EUR."), Index = c(".STOXX",".SPX",".SPX",".FTSE",".FTSE"))
What I want is this: for a particular NA value in Position_SimPnl, look at its column name, get the corresponding index name from the RIC_Curr_Ind data frame, and then look up the beta value in Beta_table by matching the column name (the column containing the NA) with the row name (the index derived from that column name).
Then, by matching the column name from Position_SimPnl with the RIC column of the 'Base' data frame, extract the corresponding exposure value.
Finally, by matching the column name from Position_SimPnl with the RIC column of RIC_Curr_Ind, get the corresponding index name, and use it to pick the matching column of the xts object Index_FX_Returns to get the return for the same date as the NA value.
After getting the Beta, Exposure and Index return values, I want the NA value to be replaced by the formula Beta * Exposure * Index return. Only the NA values in Position_SimPnl should be replaced; the other values should remain as they were. I used the following code to replace the NA values:
do.call(merge, lapply(Position_SimPnl, function(y) {
  if (is.na(y)) {
    y = (Beta_table[match(RIC_Curr_Ind$Index[match(colnames(y), RIC_Curr_Ind$RIC)],
                          rownames(Beta_table)),
                    match(colnames(y), colnames(Beta_table))]) *
      (Base$Exposure[match(colnames(y), Base$RIC)]) *
      (Index_FX_Returns[, RIC_Curr_Ind$Index[match(colnames(y), RIC_Curr_Ind$RIC)]])
  } else {
    y
  }
}))
However, in the output, if a particular column contains an NA, it replaces all the values in that column (including values that were not NA previously). I am also getting multiple warning messages like
"In if (is.na(y)) { ... :
  the condition has length > 1 and only the first element will be used".
I think this is why all values of the column are getting transformed, including the non-NA ones. Can anyone suggest how to effectively replace these NA values using the formula mentioned above while keeping the other values the same? Any help would be appreciated.
Because you need to combine all data sets to achieve your formula Beta * Exposure * Index, consider building a master data frame comprised of all needed components. However, you face two challenges:
different data types (xts objects and data frame)
different data formats (wide and long formats)
For proper merging and calculating, consider converting all data components into data frames and reshaping to long format (i.e., all but Base and RIC_Curr_Ind). Then, merge and calculate with ifelse to fill NA values. Of course, at the end, you will have to reshape back to wide and convert back to XTS.
Reshape
# USER-DEFINED METHOD GIVEN THE MULTIPLE CALLS
proc_transpose <- function(df, col_pick, val_col, time_col) {
  reshape(df,
          varying = names(df)[col_pick],
          times = names(df)[col_pick], ids = NULL,
          v.names = val_col, timevar = time_col,
          new.row.names = 1:1E4, direction = "long")
}
# POSITIONS
Position_SimPnl_wide_df <- data.frame(date = index(Position_SimPnl),
                                      coredata(Position_SimPnl))
Position_SimPnl_long_df <- proc_transpose(Position_SimPnl_wide_df, col_pick = -1,
                                          val_col = "Position", time_col = "RIC")
# BETA
Beta_table_long_df <- proc_transpose(transform(Beta_table, Index = row.names(Beta_table)),
                                     col_pick = 1:ncol(Beta_table),
                                     val_col = "Beta", time_col = "RIC")
# INDEX
Index_FX_Returns_wide_df <- data.frame(date = index(Index_FX_Returns),
                                       coredata(Index_FX_Returns))
Index_FX_Returns_long_df <- proc_transpose(Index_FX_Returns_wide_df, col_pick = -1,
                                           val_col = "Index_value", time_col = "Index")
Merge
# CHAIN MERGE
master_df <- Reduce(function(...) merge(..., by = "RIC"),
                    list(Position_SimPnl_long_df,
                         Beta_table_long_df,
                         Base))
# ADDITIONAL MERGES (NOT INCLUDED IN ABOVE CHAIN DUE TO DIFFERENT by)
master_df <- merge(master_df,
                   Index_FX_Returns_long_df, by = c("Index", "date"))
master_df <- merge(master_df,
                   RIC_Curr_Ind, by = c("Index", "RIC"))
Calculation
# FORMULA: Beta * Exposure * Index
master_df$Position <- with(master_df, ifelse(is.na(Position),
Beta * Exposure * Index_value,
Position))
Final Preparation
# RE-ORDER ROWS AND SUBSET COLS
master_df <- data.frame(with(master_df, master_df[order(RIC, date),
c("date", "RIC", "Position")]),
row.names = NULL)
# RESHAPE WIDE (REVERSE OF ABOVE)
Position_SimPnl_new <- setNames(reshape(master_df, idvar = "date",
v.names = "Position", timevar = "RIC",
direction = "wide"),
c("date", unique(master_df$RIC)))
# CONVERT TO XTS
Position_SimPnl_new <- xts(transform(Position_SimPnl_new, date = NULL),
order.by = Position_SimPnl_new$date)
Position_SimPnl_new
# AENA Apple Audi BMW Google
# 2019-11-27 58 80 8 8000 800.0
# 2019-11-28 57 70 7 7000 700.0
# 2019-11-29 56 60 6 6000 600.0
# 2019-11-30 55 50 5 24 500.0
# 2019-12-01 54 40 4 16 2.0
# 2019-12-02 53 30 3 24 1.5
# 2019-12-03 52 20 2 32 1.0
# 2019-12-04 51 10 1 40 0.5

How to extract part of cell value across columns?

I have a data frame like this:
df1 <- structure(list(q006_1 = c("1098686880", "18493806", "9892464", "96193586",
                                 "37723803", "13925456", "37713534", "1085246853"),
                      q006_2 = c("1098160170", "89009521", "9726314", "28076230", "63451251",
                                 "1090421499", "37124019"),
                      q006_3 = c("52118967", "41915062", "1088245358", "79277706", "91478662",
                                 "80048634")),
                 class = "data.frame", row.names = c(NA, -8L))
I know how to extract the last five digits of each number for one column using substr in data.table but I want to do it across all columns.
n_last <- 5
df1[, `q006_1`:= substr(q006_1, nchar(q006_1) - n_last + 1, nchar(q006_1))]
How can I do this for all columns?
In data.table it can be done like below: (Your sample data was incomplete as the first column had 8, second column had 7 and the third had 6 entries.)
library(data.table)
#or `cols <- names(df1)` if you want to apply it on all columns and this is not just an example
cols <- c("q006_1", "q006_2", "q006_3")
setDT(df1)[, (cols) := lapply(.SD, function(x) {
  sub('.*(?=.{5}$)', '', x, perl = TRUE)
}), .SDcols = cols][]
# q006_1 q006_2 q006_3
# 1: 86880 60170 18967
# 2: 93806 09521 15062
# 3: 92464 26314 45358
# 4: 93586 76230 77706
# 5: 23803 51251 78662
# 6: 25456 21499 48634
# 7: 13534 24019 76230
# 8: 46853 76230 76230
Data:
df1<-structure(list(q006_1 = c("1098686880", "18493806","9892464","96193586",
"37723803","13925456","37713534","1085246853"),
q006_2 = c("1098160170","89009521","9726314","28076230",
"63451251","1090421499","37124019","28076230"),
q006_3 = c("52118967","41915062","1088245358","79277706",
"91478662","80048634","28076230","28076230")),
class = c("data.frame"), row.names = c(NA, -8L))
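For comparison, a base R sketch of the same idea, applied to the character data above before any data.table conversion (it simply keeps the last five characters of every column, as in the question's substr approach):
n_last <- 5
df1[] <- lapply(df1, function(x) substr(x, nchar(x) - n_last + 1, nchar(x)))
df1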

How to search for part of a string contained in a list of strings, and return the matched one in R

The following data frame contains a "Campaign" column. The values of the column contain information about season, name, and position; however, the order of this information differs from row to row. Luckily, the possible values form a fixed list, so we can create vectors to match against the strings inside the "Campaign" column.
Date Campaign
1 Jan-15 Summer|Peter|Up
2 Feb-15 David|Winter|Down
3 Mar-15 Up|Peter|Spring
Here is what I want to do: create 3 columns, Name, Season and Position, that search the string inside the Campaign column and return the matched value from the lists below.
Name <- c("Peter, David")
Season <- c("Summer","Spring","Autumn", "Winter")
Position <- c("Up","Down")
So my desired result would be following
Temp
Date Campaign Name Season Position
1 15-Jan Summer|Peter|Up Peter Summer Up
2 15-Feb David|Winter|Down David Winter Down
3 15-Mar Up|Peter|Spring Peter Spring Up
Another way:
L <- strsplit(df$Campaign,split = '\\|')
df$Name <- sapply(L,intersect,Name)
df$Season <- sapply(L,intersect,Season)
df$Position <- sapply(L,intersect,Position)
Do the following:
Date = c("Jan-15","Feb-15","Mar-15")
Campaign = c("Summer|Peter|Up","David|Winter|Down","Up|Peter|Spring")
df = data.frame(Date,Campaign)
Name <- c("Peter", "David")
Season <- c("Summer","Spring","Autumn", "Winter")
Position <- c("Up","Down")
for(k in Name){
df$Name[grepl(pattern = k, x = df$Campaign)] <- k
}
for(k in Season){
df$Season[grepl(pattern = k, x = df$Campaign)] <- k
}
for(k in Position){
df$Position[grepl(pattern = k, x = df$Campaign)] <- k
}
This gives:
> df
Date Campaign Name Season Position
1 Jan-15 Summer|Peter|Up Peter Summer Up
2 Feb-15 David|Winter|Down David Winter Down
3 Mar-15 Up|Peter|Spring Peter Spring Up
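One caveat with the grepl() approach (my addition, not from the original answer): the patterns are matched as substrings, so a value like "Up" would also match a longer token that merely contains it. Anchoring the pattern to the start/end of the string or to the "|" delimiters avoids that, for example:
for (k in Position) {
  # match k only when it is a whole token between "|" delimiters (or at the ends)
  df$Position[grepl(paste0("(^|\\|)", k, "(\\||$)"), df$Campaign)] <- k
}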
I had the same idea as Marat Talipov; here's a data.table option:
library(data.table)
Name <- c("Peter", "David")
Season <- c("Summer","Spring","Autumn", "Winter")
Position <- c("Up","Down")
dat <- data.table(Date=c("Jan-15", "Feb-15", "Mar-15"),
Campaign=c("Summer|Peter|Up", "David|Winter|Down", "Up|Peter|Spring"))
Gives
> dat
Date Campaign
1: Jan-15 Summer|Peter|Up
2: Feb-15 David|Winter|Down
3: Mar-15 Up|Peter|Spring
Processing is then
dat[, `:=`(Name = sapply(strsplit(Campaign, "|", fixed = TRUE), intersect, Name),
           Season = sapply(strsplit(Campaign, "|", fixed = TRUE), intersect, Season),
           Position = sapply(strsplit(Campaign, "|", fixed = TRUE), intersect, Position))]
Result:
> dat
Date Campaign Name Season Position
1: Jan-15 Summer|Peter|Up Peter Summer Up
2: Feb-15 David|Winter|Down David Winter Down
3: Mar-15 Up|Peter|Spring Peter Spring Up
Maybe there's some benefit if you're doing this to a lot of columns or need to modify in place (by reference).
I'm interested if anyone can show me how to update all three columns at once.
EDIT: Never mind, figured it out;
for (icol in c("Name", "Season", "Position"))
dat[, (icol):=sapply(strsplit(Campaign, "|", fixed=TRUE), intersect, get(icol))]

Extracting Column data from .csv and turning every 10 consecutive rows into corresponding columns

Below is the code I am trying to implement. I want to extract every 10 consecutive row values and turn them into corresponding columns.
This is how data looks like: https://drive.google.com/file/d/0B7huoyuu0wrfeUs4d2p0eGpZSFU/view?usp=sharing
I have been trying, but temp1 and temp2 come out empty. Please help.
library(Hmisc)  # for the increment function
myData <- read.csv("Clothing_&_Accessories.csv", header = FALSE, sep = ",", fill = TRUE)  # reading the csv file
extract <- myData$V2       # extracting the desired column
x <- 1
y <- 1
temp1 <- NULL              # initialisation
temp2 <- NULL              # initialisation
data.sorted <- NULL        # initialisation
limit <- nrow(myData)      # calculating the number of rows
while (x != limit) {
  count <- 1
  for (count in 11) {
    if (count > 10) {
      inc(x) <- 1
      break                # gets out of the for loop
    } else {
      temp1[y] <- data_mat[x]  # extracting by every row element
    }
    inc(x) <- 1            # increment x
    inc(y) <- 1            # increment y
  }
  temp2 <- temp1
  data.sorted <- rbind(data.sorted, temp2)  # turn rows into columns
}
Your code is too complex. You can do this using only one for loop, without external packages, like this:
myData <- as.data.frame(matrix(c(rep("a", 10), "", rep("b", 10)), ncol = 1),
                        stringsAsFactors = FALSE)
newData <- data.frame(row.names = 1:10)
for (i in 1:((nrow(myData) + 1) / 11)) {
  start <- 11 * i - 10
  newData[[paste0("col", i)]] <- myData$V1[start:(start + 9)]
}
You don't actually need all this though. You can simply remove the empty lines, split the vector in chunks of size 10 (as explained here) and then turn the list into a data frame.
vec <- myData$V1[nchar(myData$V1)>0]
as.data.frame(split(vec, ceiling(seq_along(vec)/10)))
# X1 X2
# 1 a b
# 2 a b
# 3 a b
# 4 a b
# 5 a b
# 6 a b
# 7 a b
# 8 a b
# 9 a b
# 10 a b
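Since every chunk here has exactly 10 values, the same reshaping can also be done directly with matrix(), which fills column by column (a small addition to the answer above; the column names come out as V1, V2, ... rather than X1, X2, ...):
as.data.frame(matrix(vec, nrow = 10), stringsAsFactors = FALSE)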
We could create a numeric index based on the '' values in the 'V2' column, split the dataset, use Reduce/merge to get the columns in the wide format.
indx <- cumsum(myData$V2=='')+1
res <- Reduce(function(...) merge(..., by= 'V1'), split(myData, indx))
res1 <- res[order(factor(res$V1, levels=myData[1:10, 1])),]
colnames(res1)[-1] <- paste0('Col', 1:3)
head(res1,3)
# V1 Col1 Col2 Col3
#2 ProductId B000179R3I B0000C3XXN B0000C3XX9
#4 product_title Amazon.com Amazon.com Amazon.com
#3 product_price unknown unknown unknown
From the p1.png, the 'V1' column can also be the column names for the values in 'V2'. If that is the case, we can 'transpose' the 'res1' except the first column and change the column names of the output with the first column of 'res1' (setNames(...))
res2 <- setNames(as.data.frame(t(res1[-1]), stringsAsFactors=FALSE),
res1[,1])
row.names(res2) <- NULL
res2[] <- lapply(res2, type.convert)
head(res2)
# ProductId product_title product_price userid
#1 B000179R3I Amazon.com unknown A3Q0VJTU04EZ56
#2 B0000C3XXN Amazon.com unknown A34JM8F992M9N1
#3 B0000C3XX9 Amazon.com unknown A34JM8F993MN91
# profileName helpfulness reviewscore review_time
#1 Jeanmarie Kabala "JP Kabala" 7/7 4 1182816000
#2 M. Shapiro 6/6 5 1205107200
#3 J. Cruze 8/8 5 120571929
# review_summary
#1 Periwinkle Dartmouth Blazer
#2 great classic jacket
#3 Good jacket
# review_text
#1 I own the Austin Reed dartmouth blazer in every color
#2 This is the second time I bought this jacket
#3 This is the third time I bought this jacket
I guess this is just a reshaping issue. In that case, we can use dcast from data.table to convert from long to wide format
library(data.table)
DT <- dcast(setDT(myData)[V1 != ''][, N := paste0('Col', 1:.N), V1], V1 ~ N,
            value.var = 'V2')
data
myData <- structure(list(V1 = c("ProductId", "product_title",
"product_price",
"userid", "profileName", "helpfulness", "reviewscore", "review_time",
"review_summary", "review_text", "", "ProductId", "product_title",
"product_price", "userid", "profileName", "helpfulness",
"reviewscore",
"review_time", "review_summary", "review_text", "", "ProductId",
"product_title", "product_price", "userid", "profileName",
"helpfulness",
"reviewscore", "review_time", "review_summary", "review_text"
), V2 = c("B000179R3I", "Amazon.com", "unknown", "A3Q0VJTU04EZ56",
"Jeanmarie Kabala \"JP Kabala\"", "7/7", "4", "1182816000",
"Periwinkle Dartmouth Blazer",
"I own the Austin Reed dartmouth blazer in every color", "",
"B0000C3XXN", "Amazon.com", "unknown", "A34JM8F992M9N1",
"M. Shapiro",
"6/6", "5", "1205107200", "great classic jacket",
"This is the second time I bought this jacket",
"", "B0000C3XX9", "Amazon.com", "unknown", "A34JM8F993MN91",
"J. Cruze", "8/8", "5", "120571929", "Good jacket",
"This is the third time I bought this jacket"
)), .Names = c("V1", "V2"), row.names = c(NA, 32L),
class = "data.frame")
