I have two data files as below:
head (RNA)
Gene_ID chr start end
1 ENSG00000000003.1 X 99883667 99884983
2 ENSG00000000003.2 X 99885756 99885863
3 ENSG00000000003.3 X 99887482 99887565
4 ENSG00000000003.4 X 99888402 99888536
5 ENSG00000000003.5 X 99888928 99889026
6 ENSG00000000003.6 X 99890175 99890249
head(snp)
chr start end SNP_No
1 1 58812 58812 SNP_1
2 1 67230 67230 SNP_2
3 1 79529 79529 SNP_3
4 1 79595 79595 SNP_4
5 1 85665 85665 SNP_5
6 1 86064 86064 SNP_6
I would like to find the overlaps between the SNP file and the RNA file, so I used the GenomicRanges R package and ran the commands below:
# Build genomic ranges for the RNA features and for the SNPs. Each range is
# named by its identifier so that overlap hits can be mapped back to IDs.
gr_RNA <- GRanges(
  seqnames = RNA$chr,
  ranges = IRanges(start = RNA$start, end = RNA$end, names = RNA$Gene_ID)
)
gr_SNP <- GRanges(
  seqnames = SNP$chr,
  ranges = IRanges(start = SNP$start, end = SNP$end, names = SNP$SNP_No)
)
# All (RNA range, SNP) pairs that overlap, as index pairs into gr_RNA / gr_SNP.
overlaps <- findOverlaps(gr_RNA, gr_SNP)
# The RNA ranges that overlap at least one SNP.
subsetByOver <- subsetByOverlaps(gr_RNA, gr_SNP)
# One row per overlapping pair, translating the hit indices back to IDs.
# The column names are set directly, and FALSE is spelled out because `F` is
# an ordinary variable that can be reassigned.
match_hit <- data.frame(
  Gene_ID = names(gr_RNA)[queryHits(overlaps)],
  SNP = names(gr_SNP)[subjectHits(overlaps)],
  stringsAsFactors = FALSE
)
head(match_hit)
Gene_ID SNP
1 ENSG00000000457.1 SNP_307301
2 ENSG00000000457.2 SNP_307307
3 ENSG00000000457.11 SNP_307365
4 ENSG00000000457.12 SNP_307387
5 ENSG00000000460.1 SNP_306845
6 ENSG00000000460.1 SNP_306846
dim(match_hit)
[1] 12287 2
Then I extended the start and end positions from the RNA file by 100 ("start-100" and "end+100") and ran the commands again as below:
# Same RNA ranges as before, widened by 100 positions on each side.
gr_RNA1 <- GRanges(
  seqnames = RNA$chr,
  ranges = IRanges(
    start = RNA$start - 100,
    end = RNA$end + 100,
    names = RNA$Gene_ID
  )
)
# Overlaps between the widened RNA ranges and the SNPs.
overlaps <- findOverlaps(gr_RNA1, gr_SNP)
subsetByOver <- subsetByOverlaps(gr_RNA1, gr_SNP)
# One row per overlapping pair. The columns are named like those of match_hit
# instead of being left with auto-generated names, and FALSE is spelled out
# because `F` is an ordinary variable that can be reassigned.
match_hit1 <- data.frame(
  Gene_ID = names(gr_RNA1)[queryHits(overlaps)],
  SNP = names(gr_SNP)[subjectHits(overlaps)],
  stringsAsFactors = FALSE
)
dim(match_hit1)
[1] 17976 2
Now, I want to implement a function which takes the RNA table, the SNP table, and the expansion distance, and then gives me the final results.
Functions in R are defined like this:
# Template for defining a function: `parameters` is a placeholder for the
# function's arguments.
myFunction <- function(parameters) {
# The function's own code goes here; it is expected to create `result`.
return(result)  # value handed back to the caller
}
see also
Related
I’m currently following a differential transcript usage (DTU) analysis tutorial (link here) and am using the sample datasets provided by the authors. However, my results stop matching those from the tutorial after I create a dmDSdata object and filter it (I’ve included the code below). Creating the object works fine, but after filtering and estimating model parameters, the results tables I produce show different genes and transcripts from the ones shown in the tutorial:
# Load the DRIMSeq package and create a dmDSdata object with the
# counts and samples data frames
library(DRIMSeq)
dmDS <- dmDSdata(counts = counts, samples = samples)
dmDS # returns information about the number of genes
# Each row of the dmDSdata object contains all the transcripts corresponding
# to a particular gene
methods(class = class(dmDS))
counts(dmDS[1,])[,1:4]
# Filter the dmDS object before estimating model parameters
n <- 12 # the total number of samples
n.small <- 6 # sample size of the smallest group
dmDS <- dmFilter(dmDS,
min_samps_feature_expr = n.small, min_feature_expr = 10 ,
min_samps_feature_prop = n.small, min_feature_prop = 0.1,
min_samps_gene_expr = n, min_gene_expr = 10)
dmDS
# Find out how many of the genes remaining after filtering have N isoforms
# by counting the number of unique gene IDs and tabulating the results
table(table(counts(dmDS)$gene_id))
# Create a design matrix using a design formula as well as the sample
# information contained in the dmDS object (accessed via samples.csv)
design_full <- model.matrix(~condition, data = DRIMSeq::samples(dmDS))
colnames(design_full)
# To accelerate the subsequent steps, subset to the first 250 genes.
# NOTE(review): which genes these are depends on the row order of the filtered
# object, so a differently ordered counts table would put different genes into
# the results -- confirm the order matches the tutorial's.
dmDS <- dmDS[1:250,]
# Estimating model parameters and testing for differential transcript use.
# Fix the random seed before the estimation steps.
set.seed(1)
# system.time() reports how long the three steps take together.
system.time({
# Estimate the precision, which is inversely related to dispersion in the
# Dirichlet Multinomial model
dmDS <- dmPrecision(dmDS, design = design_full )
# Fit regression coefficients
dmDS <- dmFit (dmDS, design = design_full )
# Perform null hypothesis testing on the coefficient of interest
dmDS <- dmTest (dmDS, coef = "condition2")
})
# Tabulate the results, including a p-value per gene or a p-value per transcript
# p-value per gene: is there DTU within this gene?
# p-value per transcript: has the proportion of this transcript changed within
# its parent gene?
results <- DRIMSeq::results(dmDS) # per gene
results.txp <- DRIMSeq::results(dmDS, level = "feature") # per transcript
At this point, the results I should get are as follows:
head(results)
## gene_id lr df pvalue adj_pvalue
## 1 ENSG00000000457.13 1.493561 4 8.277814e-01 9.120246e-01
## 2 ENSG00000000460.16 1.068294 3 7.847330e-01 9.101892e-01
## 3 ENSG00000000938.12 4.366806 2 1.126575e-01 2.750169e-01
## 4 ENSG00000001084.11 1.630085 3 6.525877e-01 8.643316e-01
## 5 ENSG00000001167.14 28.402587 1 9.853354e-08 5.007113e-07
## 6 ENSG00000001461.16 9.815460 1 1.730510e-03 6.732766e-03
head(results.txp)
## gene_id feature_id lr df pvalue adj_pvalue
## 1 ENSG00000000457.13 ENST00000367771.10 0.16587607 1 0.6838032 0.9171007
## 2 ENSG00000000457.13 ENST00000367770.5 0.01666448 1 0.8972856 0.9788571
## 3 ENSG00000000457.13 ENST00000367772.8 1.02668495 1 0.3109386 0.6667146
## 4 ENSG00000000457.13 ENST00000423670.1 0.06046507 1 0.8057624 0.9323782
## 5 ENSG00000000457.13 ENST00000470238.1 0.28905766 1 0.5908250 0.8713427
## 6 ENSG00000000460.16 ENST00000496973.5 0.83415788 1 0.3610730 0.7232298
However, what I see in the R console is the following:
head(results)
## gene_id lr df pvalue adj_pvalue
## 1 ENSG00000237094.12 52.9721358 1 3.383138e-13 2.532227e-12
## 2 ENSG00000237491.8 2.7403807 1 9.784145e-02 3.179847e-01
## 3 ENSG00000228794.8 6.9271154 2 3.131814e-02 1.330626e-01
## 4 ENSG00000187961.13 0.9699384 2 6.157162e-01 8.934371e-01
## 5 ENSG00000217801.9 0.2262070 1 6.343506e-01 8.934371e-01
## 6 ENSG00000131591.17 30.4292202 1 3.462727e-08 2.136131e-07
head(results.txp)
## gene_id feature_id lr df pvalue adj_pvalue
## 1 ENSG00000237094.12 ENST00000599771.6 52.9721358 1 3.383138e-13 3.341499e-12
## 2 ENSG00000237094.12 ENST00000608420.1 52.9721358 1 3.383138e-13 3.341499e-12
## 3 ENSG00000237491.8 ENST00000585826.1 2.7403807 1 9.784145e-02 3.528888e-01
## 4 ENSG00000237491.8 ENST00000592547.1 2.7403807 1 9.784145e-02 3.528888e-01
## 5 ENSG00000228794.8 ENST00000445118.6 0.4788971 1 4.889223e-01 8.378376e-01
## 6 ENSG00000228794.8 ENST00000449005.5 0.5862693 1 4.438654e-01 8.201190e-01
I have tried switching from R version 4.1 and Bioconductor version 3.13 to the older ones used in the tutorial, but I got error messages when trying to download the rnaseqDTU package, which said it was not available for older versions of Bioconductor. As I use RStudio, I also tried clearing my global environment and running the code again, but that did not work either. I’m not sure what to do about this issue and would appreciate any potential solutions! Thanks.
I would like to create a for loop to count if the values in each row are larger than a cutoff value that changes from row to row in another matrix. Currently, my code looks like this:
# For each of the 100 columns of ACT_Allquant2, count how many of the first
# 10000 values in column 1 of cc lie below that column's cutoff
# (ACT_Allquant2[1, i]) and store the count in countALL[1, i].
# NOTE(review): the comparison counts cc values *below* the cutoff; flip it if
# values above the cutoff are what should be counted.
for (i in seq_len(100)) {  # `for (i in 100)` would run only once, with i = 100
  count_Q4_l2 <- 0  # reset for every column; NULL + 1 would give numeric(0)
  for (j in seq_len(10000)) {
    if (ACT_Allquant2[1, i] > cc[j, 1]) {  # the cutoff exceeds this cc value
      count_Q4_l2 <- count_Q4_l2 + 1
    }
  }
  countALL[1, i] <- count_Q4_l2  # save the count for column i
}
The cutoff values are in the ACT_Allquant2 table and they should move forward together with the for loop.
Hope I explained myself clearly and I thank you very much in advance for your help!!
EDIT:
ACT_Allquant2 looks the following way:
X91. X92. X93. X94. X95. X96. X97. X98.
Qfourfac_netlg2 0.7685364 0.8995720 0.9896079 1.014982 1.066362 1.229381
X99. X100.
Qfourfac_netlg2 1.727864 2.318737
While cc is a series of columns:
X1. X2. X3. X4. X5. X6. X7. X8. X9.
2 -2.504816 -2.433826 -2.305134 -2.261871 -2.110741 -1.894405 -1.344805 -1.256876 -1.211396
X10. X11. X12. X13. X14. X15. X16. X17.
2 -1.199943 -1.13323 -1.031908 -1.019844 -1.007079 -0.9932806 -0.9232708 -0.8316696
X18. X19. X20. X21. X22. X23. X24. X25.
2 -0.8052391 -0.7738284 -0.7334976 -0.7126213 -0.6950152 -0.6272749 -0.584775 -0.5540359
X26. X27. X28. X29. X30. X31. X32. X33.
2 -0.5307423 -0.5105184 -0.4107709 -0.4001571 -0.3959766 -0.3607601 -0.329242 -0.2746449
X34. X35. X36. X37. X38. X39. X40. X41.
2 -0.2231775 -0.1799284 -0.1684765 -0.1568755 -0.1446923 -0.1403811 -0.1387818 -0.126637
X42. X43. X44. X45. X46. X47. X48. X49.
2 -0.1082471 -0.08882241 -0.053299 -0.04695731 0.002623936 0.05961787 0.07482258 0.0868524
X50. X51. X52. X53. X54. X55. X56. X57. X58.
2 0.09455113 0.1003998 0.1077676 0.1574778 0.1810591 0.1832488 0.1874931 0.1893803 0.1955026
X59. X60. X61. X62. X63. X64. X65. X66. X67.
2 0.2035948 0.2321749 0.2453042 0.2604033 0.2739561 0.3018942 0.3835822 0.5748584 0.603411
X68. X69. X70. X71. X72. X73. X74. X75. X76.
2 0.6580565 0.6882143 0.7104922 0.7568134 0.7769822 0.7932305 0.8550466 0.876781 1.084851
X77. X78. X79. X80. X81. X82. X83. X84. X85. X86.
2 1.117067 1.196249 1.261902 1.310987 1.423575 1.485869 1.606687 1.678782 1.950923 1.995428
X87. X88. X89. X90. X91. X92. X93. X94. X95. X96.
2 1.99818 2.04422 2.080644 2.205811 2.21738 2.356354 2.469436 2.484198 2.52253 2.564173
X97. X98. X99.
2 2.638286 2.675248 2.768761
I'm not sure I understand, but let's try a simple example:
set.seed(41)
ACT <- data.frame(matrix(rnorm(100), 25, 4))
cc <- rnorm(4, 0, .5)
cc
# [1] 0.03641331 0.59785494 -1.05581599 0.33569523
In each column of ACT you want to count the values that exceed the value in cc, e.g. for column 1 the number that exceed 0.03641331, for column 2 the number that exceed 0.59785494? If that is so, you do not need any loops:
Comp <- sweep(ACT, 2, cc, ">")
Count <- colSums(Comp)
Count
# X1 X2 X3 X4
# 16 8 22 10
You can extract the values that exceed the cc value for each column, but you cannot put them into a data frame since the number of values in each column is different. You can create a data frame with the coordinates of the larger values or a list with the values for each column:
Larger <- data.frame(which(Comp, arr.ind=TRUE), ACT[Comp])
head(Larger)
# row col ACT.Comp.
# 1 2 1 0.1972575
# 2 3 1 1.0017043
# 3 4 1 1.2888254
# 4 5 1 0.9057534
# 5 6 1 0.4936675
# 6 7 1 0.5992858
LargerByCol <- split(Larger$ACT.Comp, Larger$col)
LargerByCol[[1]]
# [1] 0.1972575 1.0017043 1.2888254 0.9057534 0.4936675 0.5992858 . . . 16 values
I have a small issue regarding a dataset I am using. Suppose I have a dataset called mergedData2 defined using those command lines from a subset of mergedData:
# Stack the test and training sets into a single table.
mergedData <- rbind(test_set, training_set)
# Positions of the columns whose names contain the literal text "mean()" ...
lookformean <- which(grepl("mean()", names(mergedData), fixed = TRUE))
# ... and of those whose names contain the literal text "std()".
lookforstd <- which(grepl("std()", names(mergedData), fixed = TRUE))
# Both sets of positions, in ascending column order.
varsofinterests <- sort(c(lookformean, lookforstd))
# Keep the first two columns plus the selected measurement columns.
mergedData2 <- mergedData[, c(1L, 2L, varsofinterests)]
If I do names(mergedData2), I get:
[1] "volunteer_identifier" "type_of_experiment"
[3] "body_acceleration_mean()-X" "body_acceleration_mean()-Y"
[5] "body_acceleration_mean()-Z" "body_acceleration_std()-X"
(I take these first 6 names as an MWE, but I have a vector of 68 names.)
Now, suppose I want to take the average of each of the measurements per volunteer_identifier and type_of_experiment. For this, I used a combination of split and lapply:
# Split the rows into one data frame per (volunteer, experiment type) pair.
mylist<-split(mergedData2,list(mergedData2$volunteer_identifier,mergedData2$type_of_experiment))
# Column means of every piece, kept as a list with one numeric vector per group.
average_activities<-lapply(mylist,function(x) colMeans(x))
# Bind the group means as columns of a data frame, then transpose so that each
# group becomes a row. t() returns a matrix rather than a data frame, so the
# column labels are read with colnames(); names() gives NULL for a matrix.
average_dataset<-t(as.data.frame(average_activities))
As average_activities is a list, I converted it into a data frame and transposed this data frame to keep the same format as mergedData and mergedData2. The problem now is the following: when I call names(average_dataset), it returns NULL! But, more strangely, when I call head(average_dataset), it returns:
volunteer_identifier type_of_experiment body_acceleration_mean()-X body_acceleration_mean()-Y
1 1 0.2773308 -0.01738382
2 1 0.2764266 -0.01859492
3 1 0.2755675 -0.01717678
4 1 0.2785820 -0.01483995
5 1 0.2778423 -0.01728503
6 1 0.2836589 -0.01689542
This is just a small sample of the output, to say that the names of the variables are there. So why names(average_dataset) returns NULL ?
Thanks in advance for your reply, best
EDIT: Here is an MWE for mergedData2:
volunteer_identifier type_of_experiment body_acceleration_mean()-X body_acceleration_mean()-Y
1 2 5 0.2571778 -0.02328523
2 2 5 0.2860267 -0.01316336
3 2 5 0.2754848 -0.02605042
4 2 5 0.2702982 -0.03261387
5 2 5 0.2748330 -0.02784779
6 2 5 0.2792199 -0.01862040
body_acceleration_mean()-Z body_acceleration_std()-X body_acceleration_std()-Y body_acceleration_std()-Z
1 -0.01465376 -0.9384040 -0.9200908 -0.6676833
2 -0.11908252 -0.9754147 -0.9674579 -0.9449582
3 -0.11815167 -0.9938190 -0.9699255 -0.9627480
4 -0.11752018 -0.9947428 -0.9732676 -0.9670907
5 -0.12952716 -0.9938525 -0.9674455 -0.9782950
6 -0.11390197 -0.9944552 -0.9704169 -0.9653163
gravity_acceleration_mean()-X gravity_acceleration_mean()-Y gravity_acceleration_mean()-Z
1 0.9364893 -0.2827192 0.1152882
2 0.9274036 -0.2892151 0.1525683
3 0.9299150 -0.2875128 0.1460856
4 0.9288814 -0.2933958 0.1429259
5 0.9265997 -0.3029609 0.1383067
6 0.9256632 -0.3089397 0.1305608
gravity_acceleration_std()-X gravity_acceleration_std()-Y gravity_acceleration_std()-Z
1 -0.9254273 -0.9370141 -0.5642884
2 -0.9890571 -0.9838872 -0.9647811
3 -0.9959365 -0.9882505 -0.9815796
4 -0.9931392 -0.9704192 -0.9915917
5 -0.9955746 -0.9709604 -0.9680853
6 -0.9988423 -0.9907387 -0.9712319
My task is to produce this average_dataset, i.e. a dataset which contains the average value of each physical quantity (column 3 onwards) for each volunteer and type of experiment, e.g.
1 1 mean1 mean2 mean3...mean68
2 1 mean1 mean2 mean3...mean68, etc.
After this I will have to export it as a txt file (so I think using write.table with row.names=F, and col.names=T). Note that for now, if I do this and import the dataset generated using read.table, I don't recover the names of the columns of the dataset; even while specifying col.names=T.
Hi, this is a sample of a data.frame / list with two columns, X and Y. My problem is that when I call subset, the decimal part is cut off. Can you help me figure out why?
(row.names | X | Y)
> var
...
9150 4246838.57 5785639.07
9152 4462019.15 5756344.11
9153 4671745.07 5791092.53
9154 4825699.93 5767058.37
9155 4935126.99 5839357.55
> typeof(var)
[1] "list"
> var = subset(var, Y>10980116 & X>3217133)
...
6569 15163607 11323070
6572 15102381 11079465
6573 16462260 11272569
6577 19028175 11095784
It's the same when I use:
> var = var[var$Y>10980116 & var$X>3217133,]
Thank you for your help.
This is not a subsetting issue, it's a formatting/presentation issue. You're in the first circle of Burns's R Inferno ("[i]f you are using R and you think you’re in hell, this is a map for you"):
another aspect of virtuous pagan beliefs—what is printed is all
that there is
If we just print this bit of the data frame exactly as entered, we "lose" digits.
> df <- read.table(text="
4246838.57 5785639.07
4462019.15 5756344.11
4671745.07 5791092.53
4825699.93 5767058.37
4935126.99 5839357.55",
header=FALSE)
> df
## V1 V2
## 1 4246839 5785639
## 2 4462019 5756344
## 3 4671745 5791093
## 4 4825700 5767058
## 5 4935127 5839358
Tell R you want to see more precision:
> print(df,digits=10)
## V1 V2
## 1 4246838.57 5785639.07
## 2 4462019.15 5756344.11
## 3 4671745.07 5791092.53
## 4 4825699.93 5767058.37
## 5 4935126.99 5839357.55
Or you can set options(digits=10) (the default is 7).
I'm trying to create a .csv or .txt file from a data.frame object that also contains some lines describing more detail about the variables.
Here's my first attempt:
Head <- "
#variables:
#sal - Salinity [PSU]
#temp - Temperature [degrees Celsius]
"
n <- 10
df <- data.frame(sal=runif(n, 30, 37), temp=runif(n, 15, 17))
df
sink("data.txt")
Head
df
sink()
which results in this:
[1] "\n#variables:\n#sal - Salinity [PSU]\n#temp - Temperature [degrees Celsius]\n"
sal temp
1 32.11494 15.35176
2 30.57537 16.80972
3 32.90651 15.95174
4 30.62192 15.73436
5 31.43069 15.45873
6 34.38173 15.69713
7 31.27954 15.01126
8 32.77093 16.22493
9 35.99510 15.10123
10 35.52409 15.49084
but, I would really like it to look like this:
#variables:
#sal - Salinity [PSU]
#temp - Temperature [degrees Celsius]
sal temp
1 32.11494 15.35176
2 30.57537 16.80972
3 32.90651 15.95174
4 30.62192 15.73436
5 31.43069 15.45873
6 34.38173 15.69713
7 31.27954 15.01126
8 32.77093 16.22493
9 35.99510 15.10123
10 35.52409 15.49084
Use cat instead of letting R call the object's print method.
# Write the header text followed by the data frame to data.txt.
sink("data.txt")
# cat() emits the string as-is, so the "\n" escapes become real line breaks
# and no [1] index or surrounding quotes are added.
cat(Head)
# Print explicitly: a bare `df` is only auto-printed at top level, not when
# the script is run through source().
print(df)
sink()