How to prepare data for a Sankey plot in R

I want to create a Sankey plot from my data.
I do this as follows:
#Create the data
proc<-sample(c("EMR","RFA","Biopsies"), 100, replace = TRUE)
#Sample dates
dat<-sample(seq(as.Date('2013/01/01'), as.Date('2017/05/01'), by="day"), 100)
#Generate 20 random ID's in no particular order:
HospNum_Id<-sample(c("P433224","P633443","K522332","G244224","S553322","D0739033","U873352","P223333","Y763634","I927282","P223311","P029834","U22415","U234252","S141141","O349253","T622722","J322909","F630230","T432452"), 100, replace = TRUE)
df<-data.frame(proc,dat,HospNum_Id)
#Organize the flow of procedures:
library(data.table)
library(dplyr)
library(googleVis)
Sankey<-dcast(setDT(df)[, if(any(proc=="EMR"|proc=="RFA")) .SD, HospNum_Id], HospNum_Id~rowid(HospNum_Id), value.var ="proc")
PtFlow<-Sankey
PtFlow<-data.frame(PtFlow)
names(PtFlow)<-c("ord1","ord2","ord3","ord4","ord5","ord6","ord7","ord8","ord9","ord10","ord11","ord12")
orders <- PtFlow %>%
select(ord1, ord2, ord3, ord4, ord5,ord6,ord7,ord8,ord9,ord10,ord11,ord12)
#Create another data frame for the function
orders.plot <- data.frame(from = character(0), to = character(0), n = numeric(0))
This works fine up to this point. However, I have a problem with the following:
for (i in 2:ncol(orders)) {
ord.cache <- orders %>%
group_by(orders[ , i-1], orders[ , i]) %>%
summarise(n=n())
colnames(ord.cache)[1:2] <- c('from', 'to')
# adding tags to carts
ord.cache$from <- paste(ord.cache$from, '(', i-1, ')', sep='')
ord.cache$to <- paste(ord.cache$to, '(', i, ')', sep='')
orders.plot <- rbind(orders.plot, ord.cache)
}
When I get to the function, RStudio always crashes and I can't even get an error to debug. If I run the loop bit by bit, I think the error occurs at the rbind, but I can't be sure.
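As a hedged debugging sketch (not a confirmed fix for the crash), the same from/to counts can be built without indexing columns inside group_by(), by referring to each pair of consecutive columns by name via dplyr::count() and the .data pronoun. This assumes the orders data frame built above:
library(dplyr)
orders.plot <- data.frame(from = character(0), to = character(0), n = numeric(0))
for (i in 2:ncol(orders)) {
  ord.cache <- orders %>%
    # count transitions between consecutive procedure columns, selected by name
    count(from = .data[[names(orders)[i - 1]]],
          to   = .data[[names(orders)[i]]],
          name = "n")
  # tag each state with its position in the pathway, e.g. "EMR(1)" -> "RFA(2)"
  ord.cache$from <- paste0(ord.cache$from, "(", i - 1, ")")
  ord.cache$to   <- paste0(ord.cache$to, "(", i, ")")
  orders.plot <- rbind(orders.plot, as.data.frame(ord.cache))
}
Running the loop this way also makes it easier to step through one i at a time and see exactly where things fail.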

Related

Running a loop is painfully slow in R

I have a script that calculates copy number variation and saves the data into an existing file named "resultCNV.txt", merging on the first column.
Here is my script:
setwd("./Data")
library(GenomicRanges)
library(dplyr)
library("scales")
require(tidyverse)
#Create annotation or reference table
genes <- read.table("./Basefile/genes.txt", sep="\t", stringsAsFactors=FALSE, header=TRUE)
genes$chromosome_name <- gsub('X', '23', genes$chromosome_name)
genes$chromosome_name <- gsub('Y', '24', genes$chromosome_name)
colnames(genes) <- c("GeneSymbol","Chr","Start","End")
genes_GR <- makeGRangesFromDataFrame(genes,keep.extra.columns = TRUE)
#Files to be analyzed (3 steps: preprocessing, comparison with the reference/annotation, and post-processing)
for(i in 1:36){
df<- read.table(paste0("BRCA", i, ".txt"), sep="\t", stringsAsFactors=FALSE, header=TRUE)
df$Chromosome <- gsub('X', '23', df$Chromosome)
df$Chromosome <- gsub('Y', '24', df$Chromosome)
colnames(df) <- c("Barcode", "Chr", "Start", "End", "extra1", "extra2")
cnv <- makeGRangesFromDataFrame(df, keep.extra.columns = TRUE)
hits <- findOverlaps(genes_GR, cnv, type="within")
df_ann <- cbind(df[subjectHits(hits),],genes[queryHits(hits),])
df_ann <- unique(df_ann)
df_ann <- df_ann[ , c("GeneSymbol", "Chr", "extra2")]
colnames(df_ann) <- c("Ensembl_ID","Chr","Seg_value")
df_ann$Seg_value2 <- abs(df_ann$Seg_value)
df_ann$Seg_value2 = 2^df_ann$Seg_value2
df_ann$Seg_value2 = df_ann[, 4] - 1
df_ann$Seg_value2 = df_ann[, 4] * 2
df_ann$Seg_value2 <- with(df_ann, sign(Seg_value) * Seg_value2)
df_ann <- df_ann[ , c("Ensembl_ID", "Seg_value")]
df_ann$Seg_value <- rescale(df_ann$Seg_value, to = c(-1, 1))
df_ann1 <- read.table("/Basefile/genesforcomp.txt", sep="\t", stringsAsFactors=FALSE, header=TRUE)
df <- rbind.data.frame(df_ann, df_ann1)
df <- df[!duplicated(df$Ensembl_ID),]
#saving the results into the existing file based on the first column values
df1 <- read.delim("resultCNV.txt", check.names=FALSE, stringsAsFactors=FALSE)
lst <- list(data.frame(df1), data.frame(df))
df2 <- reduce(lst, full_join, by = "Ensembl_ID") %>% replace(., is.na(.), 0);
write.table(df2, file="resultCNV.txt", quote = F, sep = "\t", row.names = F)
}
Here is my data for testing Link. It has two folders: base folder: for once reading and Data: for data.
In the last 4 lines, I am using the full_join function from tidyverse to add the newly analyzed column to the last saved output, matching on the first column (Ensembl_ID). I am running ~200 files each time and it takes almost 2 hours, while running 100 files takes just 30 minutes (the time-vs-number-of-files curve is far from linear). With each cycle the output file grows, from roughly the original 900 KB to about 5 MB, then 11 MB, and so on.
Is it possible to reduce the time, i.e. avoid re-reading the last saved output and instead just merge the new column based on the first column?
Any suggestions or ideas on how to improve the loop would be appreciated.
Thanks in advance!
When I think my loops are too slow, I use the apply family instead. In your case it would be something like this:
e = function(i){
df<- read.table(paste0("BRCA", i, ".txt"), sep="\t", stringsAsFactors=FALSE, header=TRUE)
df$Chromosome <- gsub('X', '23', df$Chromosome)
df$Chromosome <- gsub('Y', '24', df$Chromosome)
colnames(df) <- c("Barcode", "Chr", "Start", "End", "extra1", "extra2")
cnv <- makeGRangesFromDataFrame(df, keep.extra.columns = TRUE)
hits <- findOverlaps(genes_GR, cnv, type="within")
df_ann <- cbind(df[subjectHits(hits),],genes[queryHits(hits),])
df_ann <- unique(df_ann)
df_ann <- df_ann[ , c("GeneSymbol", "Chr", "extra2")]
colnames(df_ann) <- c("Ensembl_ID","Chr","Seg_value")
df_ann$Seg_value2 <- abs(df_ann$Seg_value)
df_ann$Seg_value2 = 2^df_ann$Seg_value2
df_ann$Seg_value2 = df_ann[, 4] - 1
df_ann$Seg_value2 = df_ann[, 4] * 2
df_ann$Seg_value2 <- with(df_ann, sign(Seg_value) * Seg_value2)
df_ann <- df_ann[ , c("Ensembl_ID", "Seg_value")]
df_ann$Seg_value <- rescale(df_ann$Seg_value, to = c(-1, 1))
df_ann1 <- read.table("/home/sumit/Academic/DHR/TCGA/Gene List/Final1/genesbase.txt", sep="\t", stringsAsFactors=FALSE, header=TRUE)
df <- rbind.data.frame(df_ann, df_ann1)
df <- df[!duplicated(df$Ensembl_ID),]
#saving the results into the existing file based on the first column values
df1 <- read.delim("genesforcomp1", check.names=FALSE, stringsAsFactors=FALSE)
lst <- list(data.frame(df1), data.frame(df))
df2 <- reduce(lst, full_join, by = "Ensembl_ID") %>% replace(., is.na(.), 0);
write.table(df2, file="genesforcomp1", quote = F, sep = "\t", row.names = F)
}
lapply(1:4376, e)
In many of my analyses this saved a lot of time; I hope it works as well for yours.
As a little bonus: to get a progress bar and a time estimate for the lapply call, you can use pblapply() from the pbapply package instead.
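For example, a drop-in sketch (pblapply() has the same interface as lapply(), plus a progress bar with an estimated time remaining):
library(pbapply)
pblapply(1:4376, e)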
I hope this helped you.
The data.table functions fread and fwrite are good in this context; they reduce the time by ~80%. Overall, data.table::fread/fwrite (27 sec) outperform readr (93 sec) and read.delim (145 sec), which I think is acceptable.
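For illustration, a minimal sketch of that swap for the last four lines of the loop body, keeping the purrr::reduce/full_join step unchanged (this assumes the same file and column names as above; fread() returns a data.table and fwrite() replaces write.table()):
library(data.table)
# read the previously saved results quickly
df1 <- fread("resultCNV.txt", sep = "\t", header = TRUE)
# merge the new per-file result on Ensembl_ID, as before
df2 <- reduce(list(df1, df), full_join, by = "Ensembl_ID") %>% replace(., is.na(.), 0)
# write the merged table back out
fwrite(df2, "resultCNV.txt", sep = "\t", quote = FALSE)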

Speeding up dplyr pipe including checks with mutate_if and if_else on larger tables

I wrote some code to perform oversampling, meaning that I replicate my observations in a data.frame and add noise to the replicates, so they are no longer exactly the same. I'm quite happy that it works now as intended, but... it is too slow. I'm just learning dplyr and have no clue about data.table, but I hope there is a way to improve my function. I'm running this code in a function for hundreds of data.frames, which may contain about 10,000 columns and 400 rows.
This is some toy data:
library(tidyverse)
train_set1 <- rep(0, 300)
train_set2 <- rep("Factor1", 300)
train_set3 <- data.frame(replicate(1000, sample(0:1, 300, rep = TRUE)))
train_set <- cbind(train_set1, train_set2, train_set3)
row.names(train_set) <- c(paste("Sample", c(1:nrow(train_set)), sep = "_"))
This is the code to replicate each row a given number of times and a function to determine whether the added noise later will be positive or negative:
# replicate each row twice, added row.names contain a "."
train_oversampled <- train_set[rep(seq_len(nrow(train_set)), each = 3), ]
# create a flip function
flip <- function() {
sample(c(-1,1), 1)
}
In the relevant "too slow" piece of code, I'm subsetting the row.names for the added "." to filter for the replicates. Then I select only the numeric columns. I go through those columns row by row and leave the values untouched if they are 0. If not, a certain amount of noise is added (here ±1%). Later on, I combine this data set with the original data set to get my oversampled data.frame.
# add percentage of noise to non-zero values in numerical columns
noised_copies <- train_oversampled %>%
rownames_to_column(var = "rowname") %>%
filter(grepl("\\.", row.names(train_oversampled))) %>%
rowwise() %>%
mutate_if(~ is.numeric(.), ~ if_else(. == 0, 0,. + (. * flip() * 0.01 ))) %>%
ungroup() %>%
column_to_rownames(var = "rowname")
# combine original and oversampled, noised data set
train_noised <- rbind(noised_copies, train_set)
I assume there are faster ways using e.g. data.table, but it was already tough work to get this code running and I have no idea how to improve its performance.
EDIT:
The solution works perfectly fine with fixed values, but when it is called within a for loop I receive "Error in paste(Sample, n, sep = ".") : object 'Sample' not found".
Code to replicate:
library(data.table)
train_set <- data.frame(
x = c(rep(0, 10)),
y = c(0:9),
z = c(rep("Factor1", 10)))
# changing the row name to avoid confusion with "Sample"
row.names(train_set) <- c(paste("Observation", c(1:nrow(train_set)), sep = "_"))
train_list <- list(aa = train_set, bb = train_set, cc = train_set)
for(current_table in train_list) {
setDT(current_table, keep.rownames="Sample")
cols <- names(current_table)[sapply(current_table, is.numeric)]
noised_copies <- lapply(c(1,2), function(n) {
copy(current_table)[,
c("Sample", cols) := c(.(paste(Sample, n, sep=".")),
.SD * sample(c(-1.01, 1.01), .N*ncol(.SD), TRUE)),
.SDcols=cols]
})
train_noised <- rbindlist(c(noised_copies, list(train_set)), use.names=FALSE)
# As this is an example, I did not write anything to actually
# store the results, so I have to remove the object
rm(train_noised)
}
Any ideas why the column Sample can't be found now?
Here is a more vectorized approach using data.table:
library(data.table)
setDT(train_set, keep.rownames="Sample")
cols <- names(train_set)[sapply(train_set, is.numeric)]
noised_copies <- lapply(c(1,2), function(n) {
copy(train_set)[,
c("Sample", cols) := c(.(paste(Sample, n, sep=".")),
.SD * sample(c(-1.01, 1.01), .N*ncol(.SD), TRUE)),
.SDcols=cols]
})
train_noised <- rbindlist(c(noised_copies, list(train_set)), use.names=FALSE)
With data.table version >= 1.12.9, you can pass is.numeric directly to the .SDcols argument, and there may be a shorter way (e.g. (.SD) or names(.SD)) to build the left-hand side of := as well.
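A hedged sketch of that shorter form (it assumes data.table >= 1.12.9, where .SDcols accepts a predicate function; the parenthesised left-hand side of := evaluates to the numeric column names, so the explicit cols vector is no longer needed):
noised_copies <- lapply(c(1, 2), function(n) {
  dt <- copy(train_set)
  dt[, Sample := paste(Sample, n, sep = ".")]   # tag the replicate
  num <- dt[, .SD, .SDcols = is.numeric]        # pick numeric columns via the predicate
  dt[, (names(num)) := num * sample(c(-1.01, 1.01), nrow(num) * ncol(num), TRUE)]
  dt
})
train_noised <- rbindlist(c(noised_copies, list(train_set)), use.names = FALSE)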
To address the OP's updated post:
The issue is that although each data.frame within the list is converted to a data.table, train_list itself is not updated. You can update the list with an lapply call before the for loop:
library(data.table)
train_set <- data.frame(
x = c(rep(0, 10)),
y = c(0:9),
z = c(rep("Factor1", 10)))
# changing the row name to avoid confusion with "Sample"
row.names(train_set) <- c(paste("Observation", c(1:nrow(train_set)), sep = "_"))
train_list <- list(aa = train_set, bb = copy(train_set), cc = copy(train_set))
train_list <- lapply(train_list, setDT, keep.rownames="Sample")
for(current_table in train_list) {
cols <- names(current_table)[sapply(current_table, is.numeric)]
noised_copies <- lapply(c(1,2), function(n) {
copy(current_table)[,
c("Sample", cols) := c(.(paste(Sample, n, sep=".")),
.SD * sample(c(-1.01, 1.01), .N*ncol(.SD), TRUE)),
.SDcols=cols]
})
train_noised <- rbindlist(c(noised_copies, train_list), use.names=FALSE)
# As this is an example, I did not write anything to actually
# store the results, so I have to remove the object
rm(train_noised)
}

Apply a user defined function to a list of data frames

I have a series of data frames structured similarly to this:
df <- data.frame(x = c('notes','year',1995:2005), y = c(NA,'value',11:21))
df2 <- data.frame(x = c('notes','year',1995:2005), y = c(NA,'value',50:60))
In order to clean them I wrote a user defined function with a set of cleaning steps:
clean <- function(df){
colnames(df) <- df[2,]
df <- df[grep('^[0-9]{4}', df$year),]
return(df)
}
I'd now like to put my data frames in a list:
df_list <- list(df,df2)
and clean them all at once. I tried
lapply(df_list, clean)
and
for(df in df_list){
clean(df)
}
But with both methods I get the error:
Error in df[2, ] : incorrect number of dimensions
What's causing this error and how can I fix it? Is my approach to this problem wrong?
You are close, but there is one problem in your code. Since you have text in your data frame's columns, the columns are created as factors rather than characters (the pre-R 4.0.0 default of stringsAsFactors = TRUE). Thus your column naming does not give the expected result.
#need to specify strings to factors as false
df <- data.frame(x = c('notes','year',1995:2005), y = c(NA,'value',11:21), stringsAsFactors = FALSE)
df2 <- data.frame(x = c('notes','year',1995:2005), y = c(NA,'value',50:60), stringsAsFactors = FALSE)
clean <- function(df){
colnames(df) <- df[2,]
#need to specify the column to select the rows
df <- df[grep('^[0-9]{4}', df$year),]
#convert the columns to numeric values
df[, 1:ncol(df)] <- apply(df[, 1:ncol(df)], 2, as.numeric)
return(df)
}
df_list <- list(df,df2)
lapply(df_list, clean)
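A brief usage note: lapply() returns the cleaned data frames as a new list, so assign the result if you want to keep it (the name cleaned_list is just illustrative):
cleaned_list <- lapply(df_list, clean)
cleaned_list[[1]]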

Fast and Efficient Way to merge a lot of dataframe in R

I have a large set of dataframes (around 50,000). Each dataframe has two columns, key and value, with around 100-200 rows. My question is essentially similar to this and this. Following their ideas, I construct a list of dataframes and use the Reduce function:
freq_martix<-Reduce(function(dtf1, dtf2) merge(dtf1, dtf2, by = "key", all = TRUE),
freq_list)
But my code has been running for several days. Is there a more efficient, faster way to merge a large set of dataframes?
This way is pretty fast.
First of all I created 500 tables, each containing 150 key-value pairs.
library(data.table)
library(stringi)
for (i in 1:500) {
set.seed(i)
dfNam <- paste('df', i, sep = '_')
df <- data.frame( cbind(key = tolower(stri_rand_strings(150, 1, pattern = '[A-Za-z]')), value = sample(1:1000, 150, replace = TRUE)) )
assign(dfNam, df)
rm(df)
rm(dfNam)
}
Then I transposed and appended them:
tmp <- data.table()
for (i in ls(pattern = 'df_') ) {
df <- get(i)
dt <- data.table( transpose(df) )
colnames(dt) <- as.character(unlist(dt[1, ]))
dt <- dt[-1, ]
tmp <- rbindlist(list(tmp, dt), use.names = TRUE, fill = TRUE)
}
And transposed them back at the end:
merged_data <- transpose(tmp)
key <- colnames(tmp)
merged_data <- cbind(key, merged_data)
Works like a charm.
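As an alternative, hedged sketch (different from the transpose approach above): if every data.frame really has just a key and a value column, with keys unique within each table, stacking them in long format with a source id and casting once avoids the get()/transpose loop entirely. Here freq_list is assumed to be the OP's list of data.frames, and freq_matrix is just an illustrative result name:
library(data.table)
long <- rbindlist(freq_list, idcol = "src")                   # stack all tables, tagging the source
freq_matrix <- dcast(long, key ~ src, value.var = "value")    # one column per source table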

Subsetting a spacetime::SDFDF by time

How can I subset a spacetime::SDFDF (spatio-temporal data with full space-time grid) by time?
So far, I tried:
library("maps")
library("maptools")
library("spacetime")
library("plm")
states.m <- map("state", plot = FALSE, fill = TRUE)
IDs <- sapply(strsplit(states.m$names, ":"), function(x) x[1])
states <- map2SpatialPolygons(states.m, IDs = IDs)
yrs <- 1970:1986
time <- as.POSIXct(paste(yrs, "-01-01", sep = ""), tz = "GMT")
data("Produc")
Produc.st <- STFDF(states[-8], time, Produc[order(Produc[2], Produc[1]),])
Produc.st@time[c(1,5,17)]
Produc.st[Produc.st@time[c(1,5,17)]]
But that gives me the error: ncol(i) == 2 is not TRUE.
Any ideas?
Please try
Produc.st[, index(Produc.st@time[c(1,5,17)])]
i.e., time selection is done after the comma, and don't select with an xts object (which is what Produc.st@time[c(1,5,17)] is), but with a time (POSIXct) vector.

Resources