Random sampling based on a vector with multiple conditions in R

I have a large dataframe SYN_data with 150000 rows and 3 columns named SNP, Gene and count. There is a list r with 2545 count values that also includes some duplicates. Now I need to randomly sample 2545 rows without replacement from SYN_data with count values similar to those in the list r. I could successfully do it up to here by using this code:
test1 <- SYN_data[ sample( which( SYN_data$count %in% r ) , 2545 ) , ]
The second condition is that there should be 1671 unique Genes across the 2545 sampled rows, meaning some Genes have more than one SNP. Is there any way I can incorporate this condition into the same code? Any other code meeting all conditions would also be very helpful. Thanks!
Sample data:
# list
r <- c(1, 7, 3, 14, 9)
SYN_data$SNP <- c('1- 10068526', '1- 10129891', '1- 10200104',
                  '1- 10200491', '1- 10470141', '1- 10671598')
SYN_data$Gene <- c('AT1G28640', 'AT1G29030', 'AT1G29180',
                   'AT1G29180', 'AT1G29900', 'AT1G30290')
SYN_data$count <- c('14', '9', '3', '3', '7', '1')

Try using the following:
library(dplyr)

no_of_rows <- 2545
no_of_unique_gene <- 1671

temp <- SYN_data
# resample genes until the subset contains exactly 1671 distinct genes
while(n_distinct(temp$Gene) != no_of_unique_gene) {
  gene <- sample(unique(SYN_data$Gene), no_of_unique_gene)
  temp <- SYN_data[SYN_data$count %in% unique(r) & SYN_data$Gene %in% gene, ]
}
# take one row per gene first, then top up to 2545 rows from the remainder
part1 <- temp %>% group_by(Gene) %>% sample_n(floor(no_of_rows / no_of_unique_gene))
part2 <- temp %>% anti_join(part1) %>% sample_n(no_of_rows - nrow(part1))
final <- bind_rows(part1, part2)
and now check length(unique(final$Gene)).
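For example, a quick sanity check over the objects defined above (a minimal sketch; exact values depend on the random draw):
# verify both conditions on the sampled data frame
nrow(final)               # should be 2545
n_distinct(final$Gene)    # should be 1671
all(final$count %in% r)   # should be TRUE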

A possible approach is to sample 1671 unique genes first, then subset the dataset to rows that share those genes and have a count in the set r. Here is an implementation of this approach in data.table:
library(data.table)

# had to create some dummy data as it is not clear what the data is like
set.seed(0L)
nr <- 15e4
nSNP <- 1e3
nGene <- 1e4
ncount <- 1:14
r <- c(1, 3, 7, 9, 14)
SYN_data <- data.table(SNP = sample(nSNP, nr, TRUE),
                       Gene = sample(nGene, nr, TRUE),
                       count = sample(ncount, nr, TRUE))

ncnt <- 2545
ng <- 1671

# sample 1671 genes
g <- SYN_data[, sample(unique(Gene), ng)]

# subset and sample the dataset: one row per sampled gene first,
# then top up with further rows from the same subset
ix <- SYN_data[Gene %in% g & count %in% r, sample(.I, 1L), Gene]$V1
ans <- rbindlist(list(
  SYN_data[ix],
  SYN_data[-ix][Gene %in% g & count %in% r][, .SD[sample(.I, ncnt - ng)]]))
ans[, uniqueN(Gene)]
# 1662  (not enough Genes in this dummy dataset)
output:
SNP Gene count
1: 816 1261 14
2: 7 8635 1
3: 132 7457 1
4: 22 3625 3
5: 396 7640 7
---
2534: 423 6387 3
2535: 936 3908 7
2536: 346 9654 14
2537: 182 7492 3
2538: 645 635 1
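A similar set of checks in data.table syntax, to confirm the sampled subset meets the requirements (a sketch using the objects from the code above):
# row count, number of distinct genes, and whether every count comes from r
ans[, .(rows = .N, genes = uniqueN(Gene), counts_ok = all(count %in% r))]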

Related

wilcox.test() on many combinations of columns

I have a data.table with approximately 400 columns and 800,000 rows. The columns represent samples and the rows represent CpG sites. Example data here:
require(data.table)
samples <- replicate(200,replicate(1000,runif(1)))
cpgs <- paste0('cpg',1:1000)
n <- c('cpg',paste0('sample',1:200))
data <- data.table(cbind(cpgs,samples))
colnames(data) <- n
I want to run a wilcox.test() on randomly selected columns of this data 1000 times. I've currently implemented this the following way, but it's very slow on large numbers of permutations.
cases <- paste0('sample',1:10)
controls <- paste0('sample',30:40)
data[,wilcox_p:=wilcox.test( as.numeric(.SD[,mget(cases)]), as.numeric(.SD[,mget(controls)]) )$p.value,by=cpg]
Is there a more efficient way to do this? My complete use case, where getCpGSites() is the function described above, is here:
iterations_vec <- 1:1000
labels <- paste0('sample', 1:200)
permutations <- foreach(i = iterations_vec, .combine = 'rbind', .multicombine = TRUE) %dopar% {
  case_labels <- sample(labels, num_cases, replace = FALSE)
  control_labels <- labels[!labels %in% case_labels]
  signature_cpgs <- getCpGSites(case_labels, control_labels)
  num_signature_cpgs <- length(signature_cpgs)
  out <- data.table('gene' = gene,
                    'iteration' = i,
                    'num_signature_cpgs' = num_signature_cpgs)
  return(out)
}
Here's one approach, based on the tidyverse. First, convert all your character data to numeric, rather than delegating that to your function.
library(tidyverse)
numericData <- data %>% mutate(across(where(is.character), as.numeric))
Now write a function to perform a Wilcoxon test on a randomly selected pair of columns:
randomWilcox <- function(d) {
  cols <- sample(2:ncol(d), size = 2, replace = FALSE)
  d1 <- d %>% select(cpg, all_of(cols))
  tibble(
    col1 = cols[1],
    col2 = cols[2],
    p.value = wilcox.test(d1 %>% pull(2), d1 %>% pull(3))$p.value
  )
}
Now use lapply to run the function 1000 times, with a very crude measure of speed:
startTime <- Sys.time()
lapply(1:1000, function(x) numericData %>% randomWilcox) %>% bind_rows()
endTime <- Sys.time()
# A tibble: 1,000 × 3
col1 col2 p.value
<int> <int> <dbl>
1 15 172 0.124
2 26 58 0.202
3 200 60 0.840
4 124 94 0.344
5 180 200 0.723
6 122 155 0.987
7 122 174 0.173
8 83 146 0.921
9 135 95 0.0605
10 168 174 0.0206
# … with 990 more rows
Each row of the output tibble contains the indices of the columns selected and the p-value obtained from the corresponding wilcox.test.
The time taken is about 13 seconds on my machine. Is that quick enough?
endTime - startTime
Time difference of 13.1156 secs
Edit
Removing the intermediate data frame reduces the time taken to just over nine seconds:
randomWilcox <- function(d) {
  cols <- sample(2:ncol(d), size = 2, replace = FALSE)
  tibble(
    col1 = cols[1],
    col2 = cols[2],
    p.value = wilcox.test(d %>% pull(cols[1]), d %>% pull(cols[2]))$p.value
  )
}
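Re-running the same crude benchmark with the leaner function looks like this (a sketch; timings will vary by machine):
# same timing harness as before, now using the version without the intermediate tibble
startTime <- Sys.time()
results <- lapply(1:1000, function(x) numericData %>% randomWilcox) %>% bind_rows()
endTime <- Sys.time()
endTime - startTime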

Rerun the same code but changing column names at each loop

I have a script in R to calculate body condition residuals. I would like to apply this code to each column, where each column corresponds to a specific category of individual.
For example, I would run this code to calculate body condition residuals of all individuals that are in category 1:
1) Select rows of interest
Data1 <- RawData %>%
  select(ID, temperature, Bodysize1, mass1, year) %>%
  filter(temperature %in% c(20:29) & Bodysize1 %in% c(20:100) & mass1 %in% c(15:40))
2) Create a new model with the selected data
Model1 <- lmer(log(mass1) ~ log(Bodysize1) + temperature + (1|year), data = Data1)
3) Extract residuals and add ID to the residuals
ResModel1 <- resid(Model1)
ID <- Data1$ID
Res1 <- data.frame(ResModel1, ID)
4) Add residuals to my RawData
RawData2.0 <- merge(RawData, Res1, by = c("ID"), all.x = TRUE)
In order to avoid rerunning this code and manually changing all the 1s to 2s, then all the 2s to 3s, etc., is there a way to make this automatic with loops or the apply family?
My data
ID TEMPERATURE BODYSIZE1 MASS1 BODYSIZE2 MASS2 YEAR
81-012 0.03830645 200 1450 205 1425 1981
84-069 0.26923078 200 1473 205 1498 1984
84-134 0.32692307 209 1448 195 1323 1984
84-145 0.27884614 197 1373 197 1498 1984
84-190 0.31129807 191 1248 195 1323 1984
85-155 0.33056709 198 1637 229 1988 1985
Thanks in advance
Without example data it is tough to say if this will work, but maybe creating a function can simplify your workflow:
library(tidyverse)
library(lme4)

get_resid <- function(df, filters) {
  df_to_model <- df %>% filter({{filters}})
  df_to_keep  <- df %>% filter(!({{filters}}))   # rows excluded from the model
  Data1 <- df_to_model %>%
    select(ID, temperature, Bodysize1, mass1, year)
  Model1 <- lmer(log(mass1) ~ log(Bodysize1) + temperature + (1|year), data = Data1)
  ResModel1 <- resid(Model1)
  ID <- Data1$ID
  Res1 <- data.frame(ResModel1, ID)
  Res1 %>%
    bind_rows(df_to_keep)
}
Then you may use this function in your pipes:
RawData %>%
get_resid(temperature %in% c(20:29) & Bodysize1 %in% c(20:100) & mass1 %in% c(15:40))
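If the aim is also to avoid editing the column names by hand (Bodysize1/mass1, then Bodysize2/mass2, and so on), a variant of the same idea can take the column suffix as an argument. This is only a sketch, assuming lme4 and dplyr are loaded, that the filter thresholds stay the same for every category, and that no rows are dropped for missing values when fitting:
# hypothetical helper: fit the model for the i-th pair of Bodysize/mass columns
get_resid_by_index <- function(df, i) {
  size_col <- paste0("Bodysize", i)
  mass_col <- paste0("mass", i)
  dat <- df %>%
    filter(temperature %in% 20:29,
           .data[[size_col]] %in% 20:100,
           .data[[mass_col]] %in% 15:40)
  # build the formula log(massX) ~ log(BodysizeX) + temperature + (1|year)
  f <- as.formula(paste0("log(", mass_col, ") ~ log(", size_col,
                         ") + temperature + (1|year)"))
  mod <- lmer(f, data = dat)
  data.frame(ID = dat$ID, resid = resid(mod))
}

# residuals for categories 1 and 2, as a list of data frames
res_list <- lapply(1:2, function(i) get_resid_by_index(RawData, i))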
You might try to i) create a tibble and list all dependent variables as strings in the first column,
ii) list your sets of independent variables in the second column,
iii) create a formula in the third column, and
iv) run your model in the fourth column:
# your_data stands for the data frame that contains the model variables
df <- tibble(dep = paste0("log(var", seq(1, 10, 1), ")"),
             x = "~ your_x_vars") %>%
  mutate(formula = map(paste0(dep, x), as.formula)) %>%
  mutate(reg = map(formula, ~ lm(.x, data = your_data)))
Then you can easily extract the residuals.
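For instance, a minimal sketch (assuming the tidyverse is loaded and the models in the reg column fit without error):
# add a list-column of residual vectors, one element per fitted model
df_models <- df %>%
  mutate(residuals = map(reg, resid))

# residuals of the first model
df_models$residuals[[1]]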

Subsetting the matching values (complete rows) within an interval between two or more CSV files

I have two matrices (A and B). I am trying to subset the matching rows from B using an interval value. For example,
Matrix A contains (I have more than 200 compounds):
Name  Mass     RT    Area  ID
Asa   234.032  1.56  6755  Sd323
bda   164.041  4.48  5353  SD424
dsf   353.953  6.53  2535  SD422
fed   535.535  5.14  4542  SD424
Matrix B contains (similarly, the original matrix or CSV contains 5000 compounds):
Name  Mass     RT    Area  chemID  pubID  score
csa   234.031  1.56  4354  frsg    gss    90
bda   164.041  4.78  4346  gsdg    gsf    80
dwf   432.035  9.84  4245  grhr    hfg    99
fsf   535.042  7.01  5353  heth    gww    90
Now I want to subset the matching compounds from matrix B using Mass ± 0.001 and RT ± 0.5 intervals, so that the final matrix looks like:
Name  Mass     RT    Area  chemID  pubID  score
csa   234.031  1.56  4354  frsg    gss    90
bda   164.041  4.78  4346  gsdg    gsf    80
I tried the following commands in R and they didn't work well. Any help is really appreciated.
library(readxl)

# Read in the first table
fname <- "A.csv"
df1 <- read.csv(fname)
# Read in the second table
fname <- "B.xlsx"
df2 <- read_excel(fname, skip = 4)
# Create an empty dataframe
new_df <- setNames(data.frame(matrix(ncol = ncol(df2), nrow = 0)), colnames(df2))
# Set the thresholds for the mass and the retention time
m_ths  <- 1.e-3  # Mass threshold
rt_ths <- 0.5    # Retention time threshold
# Loop over the indices of one of the data frames
for (i in 1:nrow(df1)) {
  # Get the mass and retention time of the current row
  m  <- df1$Mass[i]
  rt <- df1$RT[i]
  # Get boolean vectors of rows within the second table that are within the
  # given tolerance of the current mass (m) and retention time (rt)
  m_cond  <- df2$Mass >= m - m_ths & df2$Mass <= m + m_ths
  rt_cond <- df2$RT >= rt - rt_ths & df2$RT <= rt + rt_ths
  # Get the subset of rows in the second table that meet the required conditions
  tmp_df <- subset(df2, m_cond & rt_cond)
  if (nrow(tmp_df) > 0) {
    # If the subset is not empty, add it to the new_df data frame
    tmp_df$mb_data_index <- i
    new_df <- rbind(new_df, tmp_df)
  }
}
write.csv(new_df, "commoncompounds.csv")
Code:
library('data.table')
# join two data tables and get only the matching rows by Name
df3 <- setDT(df2)[df1, on = 'Name', nomatch = 0]
# subset based on conditions of Mass and RT
df3 <- df3[ (round(abs(Mass - i.Mass), 3) <= 0.001) &
            (round(abs(RT - i.RT), 1) <= 0.5), ]
# remove columns of df1
df3[, `:=` (i.Mass = NULL, i.RT = NULL, i.Area = NULL, ID = NULL)]
df3
# Name Mass RT Area chemID pubID score
# 1: Asa 234.031 1.56 4354 frsg gss 90
# 2: bda 164.041 4.78 4346 gsdg gsf 80
Data:
df1 <- read.table(text =
'Name Mass RT Area ID
Asa 234.032 1.56 6755 Sd323
bda 164.041 4.48 5353 SD424
dsf 353.953 6.53 2535 SD422
fed 535.535 5.14 4542 SD424', header = TRUE, stringsAsFactors = FALSE)
df2 <- read.table(text = 'Name Mass RT Area chemID pubID score
Asa 234.031 1.56 4354 frsg gss 90
bda 164.041 4.78 4346 gsdg gsf 80
dwf 432.035 9.84 4245 grhr hfg 99
fsf 535.042 7.01 5353 heth gww 90', header = TRUE, stringsAsFactors = FALSE)
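If the compound names cannot be relied on to match between the two files, a data.table non-equi join on the Mass and RT windows alone is another option. The following is only a sketch (the window column names mass_lo, mass_hi, rt_lo and rt_hi are made up here), using the df1 and df2 defined above:
library(data.table)
setDT(df1)
setDT(df2)

# tolerance windows on the reference table
df1[, `:=`(mass_lo = Mass - 0.001, mass_hi = Mass + 0.001,
           rt_lo   = RT - 0.5,     rt_hi   = RT + 0.5)]

# keep rows of df2 whose Mass and RT fall inside any window of df1
matched <- df2[df1,
               on = .(Mass >= mass_lo, Mass <= mass_hi,
                      RT >= rt_lo, RT <= rt_hi),
               nomatch = 0,
               .(Name = x.Name, Mass = x.Mass, RT = x.RT, Area = x.Area,
                 chemID, pubID, score)]
matched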

R: Search multiple columns for factors

I have a large dataframe with multiple columns (about 150).
There is a range of columns (Dx1, Dx2..until Dx30) which are diagnosis codes (the codes are numbers, but they are categorical variables that correspond to a medical diagnosis using the ICD-9 coding system).
I have working code to search a single column, but need to search all 30 columns to see if any of the columns contain a code within the specified range (DXrange).
The core dataframe looks like:
Case DX1 DX2 DX3 DX4...DX30
1 123 345 567 99 12
2 234 345 NA NA NA
3 456 567 789 345 34
Here is the working code:
## Defines a range of codes to search for
DXrange <- factor(41000:41091, levels = levels(core$DX1))
## Search for the DXrange codes in column DX1.
core$IndexEvent <- core$DX1 %in% DXrange & substr(core$DX1, 5, 5) != 2
## What is the frequency of the IndexEvent?
cat("Frequency of IndexEvent : \n"); table(core$IndexEvent)
The working code is adapted from "Calculating Nationwide Readmissions Database (NRD) Variances, Report # 2017-01"
I could run this for each DX column and then sum them for a final IndexEvent total, but this is not very efficient.
I would first normalize the data into a long format before searching for the codes, as in the following example:
set.seed(314)
df <- data.frame(id = 1:5,
                 DX1 = sample(1:10, 5),
                 DX2 = sample(1:10, 5),
                 DX3 = sample(1:10, 5))
require(dplyr)
require(tidyr)
df %>%
  gather(key, value, -id) %>%
  filter(value %in% 1:2)
or with just base R
df.long <- do.call(rbind,lapply(df[,2:4],function(x) data.frame(id = df$id, DX = x)))
df.long[df.long$DX %in% 1:2, ]
We could use filter_at with any_vars:
df %>%
  filter_at(vars(matches("DX\\d+")), any_vars(. %in% DXrange))
where
DXrange <- 41000:41091
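In more recent versions of dplyr (where filter_at() and any_vars() are superseded), the equivalent with if_any() would look roughly like this:
# keep rows where any DX column contains a code from DXrange
df %>%
  filter(if_any(matches("DX\\d+"), ~ .x %in% DXrange))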

How to remove outliers using Cook's distance in R

I am writing a generic function in R which takes a dataframe and a column name and returns a clean dataframe without outliers.
cooks_dist <- function(dataframe, column){
  dataframe <- dataframe %>% select_if(dataframe, is.numeric)
  mod <- lm(column ~ ., data = dataframe)
  cooksd <- cooks.distance(mod)
  influential <- as.numeric(names(cooksd)[(cooksd > 4*mean(cooksd, na.rm = T))]) # influential row numbers
  final <- dataframe[-influential, ]
  return(final)
}
But, when I run this function it says: Error: Can't convert a list to function
Data can be found at
http://ucanalytics.com/blogs/wp-content/uploads/2016/09/Regression-Clean-Data.csv
The error originated from dplyr::select_if(). I believe you want a subset of all numeric columns, so you could alternatively create the subset with sapply(). Note: as your lm() line produced errors, I've inserted the minimal model instead.
So I think you want this:
cooks_dist <- function(dataframe, column){
  dataframe <- dataframe[, sapply(dataframe, is.numeric)]
  mod <- lm(dataframe[, column] ~ 1, data = dataframe)
  cooksd <- cooks.distance(mod)
  influential <- as.numeric(names(cooksd)[(cooksd > 4 * mean(cooksd, na.rm = TRUE))])
  final <- dataframe[-influential, ]
  return(final)
}
df1 <- cooks_dist(df1, 4)
Yields:
> head(df1)
X Observation Dist_Taxi Dist_Market Dist_Hospital Carpet Builtup Rainfall House_Price
2 2 2 8294 8186 12694 1461 1752 210 3982000
3 3 3 11001 14399 16991 1340 1609 720 5401000
4 4 4 8301 11188 12289 1451 1748 620 5373000
5 5 5 10510 12629 13921 1770 2111 450 4662000
7 7 7 13153 11869 17811 1542 1858 1030 7224000
8 8 8 5882 9948 13315 1261 1507 1020 3772000
I used this code, with the threshold for Cook's distance set at 4/n:
orig.mod <- lm(Outcome ~ Exposure, data = origdf)
origdf$cooksd <- cooks.distance(orig.mod)
origdf$cookyn <- ifelse(origdf$cooksd < 4/nrow(origdf), "keep", "no")
minus.df <- subset(origdf, cookyn == "keep")
newmod.minuscooks <- lm(Outcome ~ Exposure, data = minus.df)
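The same idea can be wrapped into a small helper; this is a sketch only (the function name is made up, and it assumes the model was fitted on the full data with no rows dropped for missing values):
# drop rows whose Cook's distance exceeds the 4/n cutoff for a fitted lm model
drop_influential <- function(model, data, cutoff = 4 / nrow(data)) {
  cd <- cooks.distance(model)
  data[cd < cutoff, , drop = FALSE]
}

clean_df  <- drop_influential(orig.mod, origdf)
refit.mod <- lm(Outcome ~ Exposure, data = clean_df)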
