I have a data.table with approximately 400 columns and 800,000 rows. The columns represent samples and the rows represent CpG sites. Example data here:
require(data.table)
samples <- replicate(200,replicate(1000,runif(1)))
cpgs <- paste0('cpg',1:1000)
n <- c('cpg',paste0('sample',1:200))
data <- data.table(cbind(cpgs,samples))
colnames(data) <- n
I want to run a wilcox.test() on randomly selected columns of this data 1000 times. I've currently implemented this in the following way, but it's very slow for large numbers of permutations.
cases <- paste0('sample',1:10)
controls <- paste0('sample',30:40)
data[,wilcox_p:=wilcox.test( as.numeric(.SD[,mget(cases)]), as.numeric(.SD[,mget(controls)]) )$p.value,by=cpg]
Is there a more efficient way to do this? My complete use case, where getCpGSites() is the function described above, is here:
iterations_vec <- 1:1000
labels <- paste0('sample',1:200)
permutations <- foreach(i = iterations_vec, .combine = 'rbind', .multicombine = TRUE) %dopar% {
  case_labels <- sample(labels, num_cases, replace = FALSE)
  control_labels <- labels[!labels %in% case_labels]
  signature_cpgs <- getCpGSites(case_labels, control_labels)
  num_signature_cpgs <- length(signature_cpgs)
  out <- data.table('gene' = gene,
                    'iteration' = i,
                    'num_signature_cpgs' = num_signature_cpgs)
  return(out)
}
Here's one approach, based on the tidyverse. First, convert your character sample columns to numeric, rather than delegating the conversion to your function.
library(tidyverse)
# convert every column except the cpg identifier from character to numeric
numericData <- data %>% mutate(across(-cpg, as.numeric))
Now write a function to perform a Wilcoxon test on a randomly selected pair of columns:
randomWilcox <- function(d) {
  cols <- sample(2:ncol(d), size = 2, replace = FALSE)
  d1 <- d %>% select(cpg, all_of(cols))
  tibble(
    col1 = cols[1],
    col2 = cols[2],
    p.value = wilcox.test(d1 %>% pull(2), d1 %>% pull(3))$p.value
  )
}
Now use lapply to run the function 1000 times, with a very crude measure of speed:
startTime <- Sys.time()
lapply(1:1000, function(x) numericData %>% randomWilcox) %>% bind_rows()
endTime <- Sys.time()
# A tibble: 1,000 × 3
col1 col2 p.value
<int> <int> <dbl>
1 15 172 0.124
2 26 58 0.202
3 200 60 0.840
4 124 94 0.344
5 180 200 0.723
6 122 155 0.987
7 122 174 0.173
8 83 146 0.921
9 135 95 0.0605
10 168 174 0.0206
# … with 990 more rows
Each row of the output tibble contains the indices of the two columns selected and the p-value obtained from the corresponding wilcox.test.
The time taken is about 13 seconds on my machine. Is that quick enough?
endTime - startTime
Time difference of 13.1156 secs
Edit
Removing the intermediate data frame reduces the time taken to just over nine seconds:
randomWilcox <- function(d) {
  cols <- sample(2:ncol(d), size = 2, replace = FALSE)
  tibble(
    col1 = cols[1],
    col2 = cols[2],
    p.value = wilcox.test(d %>% pull(cols[1]), d %>% pull(cols[2]))$p.value
  )
}
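For reference, here is a sketch of re-running the same crude benchmark with the revised function (assuming numericData from above is still in scope; timings will of course vary by machine):
# crude benchmark of the revised function
startTime <- Sys.time()
results <- lapply(1:1000, function(x) numericData %>% randomWilcox) %>% bind_rows()
endTime <- Sys.time()
endTime - startTime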
Related
I have temperature profiler (tp) data for date, depth and temperature. The depth for each date is not exactly the same, so I need to unify it to a common set of depths and set the temperature at each depth by linear approximation. I was able to do this with a loop using the ‘approx’ function (see the first part of the enclosed code). But I know I should do it better without a loop (considering I will have about 600,000 rows). I tried to do it with the ‘by’ function but was not successful in transforming the result (a list) into a data frame or matrix (see the second part of the code).
Keep in mind that the number of rounded depths is not always the same as in the example.
The rounded depth is in the Depth2 column; the interpolated temperature goes in Temp2.
What is the ‘right’ way to solve this?
# create df manually
tp <- data.frame(Date=double(31), Depth=double(31), Temperature=double(31))
tp$Date[1:11] <- '2009-12-17' ; tp$Date[12:22] <- '2009-12-18'; tp$Date[23:31] <- '2009-12-19'
tp$Depth <- c(24.92,25.50,25.88,26.33,26.92,27.41,27.93,28.37,28.82,29.38,29.92,25.07,25.56,26.06,26.54,27.04,27.53,28.03,28.52,29.02,29.50,30.01,25.05,25.55,26.04,26.53,27.02,27.52,28.01,28.53,29.01)
tp$Temperature <- c(19.08,19.06,19.06,18.87,18.67,17.27,16.53,16.43,16.30,16.26,16.22,17.62,17.43,17.11,16.72,16.38,16.28,16.20,16.15,16.13,16.11,16.08,17.54,17.43,17.32,17.14,16.89,16.53,16.28,16.20,16.13)
# create rounded depth column
tp$Depth2 <- round(tp$Depth)
# loop on date to calculate linear approximation for rounded depth
dtgrp <- tp[!duplicated(tp[,1]),1]
for (i in dtgrp) {
  x1 <- tp[tp$Date == i, "Depth"]
  y1 <- tp[tp$Date == i, "Temperature"]
  x2 <- tp[tp$Date == i, "Depth2"]
  tpa <- approx(x = x1, y = y1, xout = x2, rule = 2)
  tp[tp$Date == i, "Temp2"] <- tpa$y
}
# reduce result to rounded depth
tp1 <- tp[!duplicated(tp[,-c(2:3)]),-c(2:3)]
# not part of the question, but the end result needs to be a matrix, so this completes it:
library(reshape2)
tpbydt <- acast(tp1, Date~Depth2, value.var="Temp2")
# second part: I tried to use the by function (instead of a loop) but got lost when trying to convert the result to a data frame or matrix
rdpth <- function(x1, y1, x2) {
  tpa <- approx(x = x1, y = y1, xout = x2, rule = 2)
  return(tpa)
}
tp2 <- by(tp, tp$Date,function(tp) rdpth(tp$Depth,tp$Temperature,tp$Depth2), simplify = TRUE)
You're very close with the by call, but remember it returns a list of objects. Therefore, consider building a list of data frames to be row-bound at the very end:
df_list <- by(tp, tp$Date, function(sub) {
  tpa <- approx(x = sub$Depth, y = sub$Temperature, xout = sub$Depth2, rule = 2)
  df <- unique(data.frame(Date = sub$Date,
                          Depth2 = sub$Depth2,
                          Temp2 = tpa$y,
                          stringsAsFactors = FALSE))
  return(df)
})
tp2 <- do.call(rbind, unname(df_list))
tp2
# Date Depth2 Temp2
# 1 2009-12-17 25 19.07724
# 2 2009-12-17 26 19.00933
# 5 2009-12-17 27 18.44143
# 7 2009-12-17 28 16.51409
# 9 2009-12-17 29 16.28714
# 11 2009-12-17 30 16.22000
# 12 2009-12-18 25 17.62000
# 21 2009-12-18 26 17.14840
# 4 2009-12-18 27 16.40720
# 6 2009-12-18 28 16.20480
# 8 2009-12-18 29 16.13080
# 10 2009-12-18 30 16.08059
# 13 2009-12-19 25 17.54000
# 22 2009-12-19 26 17.32898
# 41 2009-12-19 27 16.90020
# 61 2009-12-19 28 16.28510
# 81 2009-12-19 29 16.13146
And if you reset row.names, this is exactly identical to your tp1 output:
identical(data.frame(tp1, row.names = NULL),
data.frame(tp2, row.names = NULL))
# [1] TRUE
I have a large dataframe SYN_data with 150000 rows and 3 columns named SNP, Gene and count. There is a list r with 2545 count values that also includes some duplicates. Now I need to randomly sample 2545 rows without replacement from SYN_data with count values similar to those in the list r. I could successfully do it up to this point using this code:
test1 <- SYN_data[ sample( which( SYN_data$count %in% r ) , 2545 ) , ]
The second condition is that the number of unique Genes should be 1671 across the 2545 sampled rows, meaning that some Genes have more than one SNP. Is there any way I can incorporate this condition into the same code? Any other code meeting all the conditions would also be very helpful. Thanks!
Sample data:
# list of count values
r <- c(1, 7, 3, 14, 9)
SYN_data$SNP <- c('1- 10068526', '1- 10129891', '1- 10200104',
'1- 10200491', '1- 10470141', '1- 10671598')
SYN_data$Gene <- c('AT1G28640', 'AT1G29030', 'AT1G29180',
'AT1G29180', 'AT1G29900', 'AT1G30290')
SYN_data$count <- c('14', '9', '3', '3', '7', '1')
Try using the following:
library(dplyr)
no_of_rows <- 2545
no_of_unique_gene <- 1671
temp <- SYN_data
while (n_distinct(temp$Gene) != no_of_unique_gene) {
  gene <- sample(unique(SYN_data$Gene), no_of_unique_gene)
  # keep rows whose count appears in r and whose Gene is among the sampled genes
  temp <- SYN_data[SYN_data$count %in% unique(r) & SYN_data$Gene %in% gene, ]
}
part1 <- temp %>% group_by(Gene) %>% sample_n(floor(no_of_rows/no_of_unique_gene))
part2 <- temp %>% anti_join(part1) %>% sample_n(no_of_rows - nrow(part1))
final <- bind_rows(part1, part2)
and now check length(unique(final$Gene)).
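A quick sanity check using the targets defined above could look something like this:
nrow(final) == no_of_rows                      # 2545 rows sampled?
n_distinct(final$Gene) == no_of_unique_gene    # 1671 distinct Genes?
all(final$count %in% r)                        # counts restricted to values in r?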
A possible approach is to sample 1671 unique genes first, then subset the dataset to rows with those genes and with a count in the set r. Here is an implementation of this approach in data.table:
library(data.table)
# had to create some dummy data as it is not clear what the real data looks like
set.seed(0L)
nr <- 15e4
nSNP <- 1e3
nGene <- 1e4
ncount <- 1:14
r <- c(1,3,7,9,14)
SYN_data <- data.table(SNP = sample(nSNP, nr, TRUE),
                       Gene = sample(nGene, nr, TRUE),
                       count = sample(ncount, nr, TRUE))
ncnt <- 2545
ng <- 1671
#sample 1671 genes
g <- SYN_data[, sample(unique(Gene), ng)]
#subset and sample the dataset
ix <- SYN_data[Gene %in% g & count %in% r, sample(.I, 1L), Gene]$V1
ans <- rbindlist(list(
SYN_data[ix],
SYN_data[-ix][Gene %in% g & count %in% r][, .SD[sample(.I, ncnt - ng)]]))
ans[, uniqueN(Gene)]
# 1662 - not enough distinct Genes in this dummy dataset to reach 1671
output:
SNP Gene count
1: 816 1261 14
2: 7 8635 1
3: 132 7457 1
4: 22 3625 3
5: 396 7640 7
---
2534: 423 6387 3
2535: 936 3908 7
2536: 346 9654 14
2537: 182 7492 3
2538: 645 635 1
I would like to process some GPS data rows pairwise.
For now, I am doing it in a normal for-loop but I'm sure there is a better and faster way.
n = 100
testdata <- as.data.frame(cbind(runif(n,1,10), runif(n,0,360), runif(n,14,16), runif(n, 46,49)))
colnames(testdata) <- c("speed", "heading", "long", "lat")
head(testdata)
diffmatrix <- as.data.frame(matrix(ncol = 3, nrow = dim(testdata)[1] - 1))
colnames(diffmatrix) <- c("distance","heading_diff","speed_diff")
for (i in 1:(dim(testdata)[1] - 1)) {
  diffmatrix[i, 1] <- spDists(as.matrix(testdata[i:(i+1), c('long','lat')]),
                              longlat = T, segments = T) * 1000
  diffmatrix[i, 2] <- testdata[i+1, ]$heading - testdata[i, ]$heading
  diffmatrix[i, 3] <- testdata[i+1, ]$speed - testdata[i, ]$speed
}
head(diffmatrix)
How would I do that with an apply function?
Or is it even possible to do that calculation in parallel?
Thank you very much!
I'm not sure what you want to do with the end condition but with dplyr you can do all of this without using a for loop.
library(dplyr)
library(sp)  # for spDists()
testdata %>%
  mutate(heading_diff = c(diff(heading), 0),
         speed_diff = c(diff(speed), 0),
         longdiff = c(diff(long), 0),
         latdiff = c(diff(lat), 0)) %>%
  rowwise() %>%
  mutate(spdist = spDists(cbind(c(long, long + longdiff), c(lat, lat + latdiff)),
                          longlat = T, segments = T) * 1000) %>%
  select(heading_diff, speed_diff, distance = spdist)
# heading_diff speed_diff distance
# <dbl> <dbl> <dbl>
# 1 15.9 0.107 326496
# 2 -345 -4.64 55184
# 3 124 -1.16 25256
# 4 85.6 5.24 221885
# 5 53.1 -2.23 17599
# 6 -184 2.33 225746
I will explain each part below:
The pipe operator %>% is essentially a chain that sends the results from one operation into the next. So we start with your test data and send it to the mutate function.
Use mutate to create four new columns holding the difference from one row to the next, padding with 0 at the last row because there is no measurement after the final data point (you could use NA instead; a sketch of that variant follows these notes).
Next, once you have the differences, use rowwise so the spDists function is applied to each row.
Last, we create another column with mutate that uses the four difference columns created earlier.
To return only the three columns you were concerned with, I used a select statement at the end. You can leave this out if you want the entire dataframe.
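If you would rather keep NA in the last row, a possible variant using dplyr's lead() is sketched below; it should give the same distances for all but the final row, though I haven't benchmarked it against the version above:
library(dplyr)
library(sp)

testdata %>%
  mutate(heading_diff = lead(heading) - heading,
         speed_diff = lead(speed) - speed,
         longdiff = lead(long) - long,
         latdiff = lead(lat) - lat) %>%
  rowwise() %>%
  mutate(distance = if (is.na(longdiff)) NA_real_ else
           spDists(cbind(c(long, long + longdiff), c(lat, lat + latdiff)),
                   longlat = T, segments = T) * 1000) %>%
  ungroup() %>%
  select(heading_diff, speed_diff, distance)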
I am writing a generic function in R that takes a dataframe and a column name and returns the dataframe cleaned of outliers.
cooks_dist <- function(dataframe,column){
dataframe <- dataframe %>% select_if(dataframe,is.numeric)
mod <- lm(column ~ ., data=dataframe)
cooksd <- cooks.distance(mod)
influential <- as.numeric(names(cooksd)[(cooksd > 4*mean(cooksd,na.rm=T))]) # influential row numbers
final <- dataframe[-influential,]
return(final)
}
But, when I run this function it says: Error: Can't convert a list to function
Data can be found at
http://ucanalytics.com/blogs/wp-content/uploads/2016/09/Regression-Clean-Data.csv
The error originated from dplyr::select_if(). I believe you want a subset of all numeric columns, so you could alternatively create the subset with sapply(). Note: as your lm() line produced errors, I've inserted the minimal model instead.
So I think you want this:
cooks_dist <- function(dataframe, column){
  dataframe <- dataframe[, sapply(dataframe, is.numeric)]
  mod <- lm(dataframe[, column] ~ 1, data = dataframe)
  cooksd <- cooks.distance(mod)
  influential <- as.numeric(names(cooksd)[(cooksd > 4 * mean(cooksd, na.rm = TRUE))])
  final <- dataframe[-influential, ]
  return(final)
}
df1 <- cooks_dist(df1, 4)
Yields:
> head(df1)
X Observation Dist_Taxi Dist_Market Dist_Hospital Carpet Builtup Rainfall House_Price
2 2 2 8294 8186 12694 1461 1752 210 3982000
3 3 3 11001 14399 16991 1340 1609 720 5401000
4 4 4 8301 11188 12289 1451 1748 620 5373000
5 5 5 10510 12629 13921 1770 2111 450 4662000
7 7 7 13153 11869 17811 1542 1858 1030 7224000
8 8 8 5882 9948 13315 1261 1507 1020 3772000
I used this code, with the threshold for Cook's distance set at 4/n:
orig.mod <- lm(Outcome ~ Exposure, data=origdf)
origdf$cooksd <- cooks.distance(orig.mod)
origdf$cookyn <- ifelse(origdf$cooksd < 4/nrow(origdf), "keep", "no")
minus.df <- subset(origdf, cookyn == "keep")
newmod.minuscooks <- lm(Outcome ~ Exposure, data=minus.df)
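To gauge the effect of dropping the flagged rows, a quick comparison (just a sketch) could be:
nrow(origdf) - nrow(minus.df)            # how many rows were removed
summary(orig.mod)$coefficients           # fit on all rows
summary(newmod.minuscooks)$coefficients  # fit after removing high-influence rows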
I did a replication 10000 times where I took a random sample from a list of IDs and then paired them with another list of IDs. After that I added a column that gives the relatedness of each pair. Then I took the mean of the relatedness for each set of random sampling. So I end up with 10000 values, each representing the mean relatedness of one set of random sampling. However, what I actually want is the mean of the relatedness of the first row (and likewise each other row) across all 10000 sets of random sampling.
An example of what I want:
Lets say I have 10000 sets of 3 random pairings.
Set 1
female_ID male_ID relatedness
0 12-34 23-65 0.034
1 44-62 56-24 0.56
2 76-11 34-22 0.044
Set 2
female_ID male_ID relatedness
0 98-54 53-12 0.022
1 22-43 13-99 0.065
2 09-22 65-22 0.12
etc...
I want the mean of each row's relatedness across sets, so here I would get a list of 3 values: 0.028 (mean of 0.034 and 0.022), 0.3125 (mean of 0.56 and 0.065), and 0.082 (mean of 0.044 and 0.12), except it would be the mean across 10000 sets, not just 2.
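For concreteness, if the two toy sets above were stored as data frames set1 and set2 (hypothetical names), the computation I'm after would be:
# mean of each row's relatedness across the sets
rowMeans(cbind(set1$relatedness, set2$relatedness))
# [1] 0.0280 0.3125 0.0820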
Here's my code so far:
mean_rel <- replicate(10000, {
  random_mal <- sample(list_of_males, 78, replace = TRUE)
  random_pair <- cbind(list_of_females, random_mal)
  random_pair <- data.frame(random_pair)
  random_pair$pair <- with(random_pair, paste(list_of_females, random_mal, sep = " "))
  typeA <- genome$rel[match(random_pair$pair, genome_year$pair1)]
  typeB <- genome$rel[match(random_pair$pair, genome_year$pair2)]
  random_pair$relatedness <- ifelse(is.na(typeA), typeB, typeA)
  random_pair <- na.omit(random_pair)
  mean_random_pair_relatedness <- mean(random_pair$relatedness)
  mean_random_pair_relatedness
})
If you add simplify = FALSE to your replicate call, between the closing } and the ), then mean_rel will be returned as a list. You also want each iteration to return the per-pair data frame itself rather than its overall mean, so that row-wise means can be computed across sets afterwards.
mean_rel <- replicate(10000, {
  random_mal <- sample(list_of_males, 78, replace = TRUE)
  random_pair <- cbind(list_of_females, random_mal)
  random_pair <- data.frame(random_pair)
  random_pair$pair <- with(random_pair, paste(list_of_females, random_mal, sep = " "))
  typeA <- genome$rel[match(random_pair$pair, genome_year$pair1)]
  typeB <- genome$rel[match(random_pair$pair, genome_year$pair2)]
  random_pair$relatedness <- ifelse(is.na(typeA), typeB, typeA)
  random_pair <- na.omit(random_pair)
  # return the per-pair table rather than its overall mean,
  # so that row-wise means can be taken across the 10000 sets below
  random_pair
}, simplify = FALSE)
From there, you can use purrr to add two classification columns and then use dplyr for the rest. Here is how I did it:
library(tidyverse)
mean_rel <- purrr::map2(.x = mean_rel, .y = seq_along(mean_rel),
                        function(x, y){
                          x %>%
                            mutate(set = paste0("set_", y)) %>%
                            # do this so the same row of each set can be compared
                            rownames_to_column(var = "row_number")
                        })
mean_rel_comb <- mean_rel %>%
  do.call(rbind, .) %>%
  as_tibble() %>%
  mutate(relatedness = as.numeric(as.character(relatedness))) %>%
  group_by(row_number) %>%
  summarize(mean = mean(relatedness))
Using your two datasets combined as a list gave me this:
# A tibble: 3 x 2
row_number mean
<chr> <dbl>
1 1 0.0280
2 2 0.3125
3 3 0.0820