My data is 450K (DNA methylation) data, and the results below come from a regional analysis. The region table contains three columns: the chromosome number, the start position, and the end position:
# Target regions: one row per region, giving its chromosome (chr) and its
# start and end positions.
region <- structure(list(chr = c(2L, 2L, 2L, 3L, 4L, 5L, 5L, 5L, 6L, 6L, 7L, 8L, 10L, 11L, 12L, 15L, 16L, 18L, 18L, 21L, 22L), start = c(95663987L, 80531500L, 154334651L, 24536765L, 187476837L, 16179633L, 2751822L, 63461803L, 133562246L, 29521568L, 49813031L, 24772270L, 128593922L, 30038286L, 6649733L, 65913660L, 51184152L, 6414602L, 5543801L, 22370347L, 24890330L), end = c(95664360L, 80531899L, 154334652L, 24537302L, 187476838L, 16180267L, 2752602L, 63461931L, 133562777L, 29521715L, 49813487L, 24772351L, 128594418L, 30038311L, 6649995L, 65913661L, 51184887L, 6415253L, 5543946L, 22370759L, 24891142L)), class = "data.frame", row.names = c(4L, 12L, 15L, 14L, 20L,8L, 10L, 18L, 1L, 16L, 5L, 6L, 2L, 21L, 9L, 17L, 13L, 7L, 19L, 11L, 3L))
The distribution in my region is:
# Tabulate how many regions fall on each chromosome.
table(region[["chr"]])
The first chromosome is chr2, which contains three regions here.
Now I have another file, a probe file, which lists probes with their chromosomes and positions. What I want to do is extract the probes that are located in my target regions. Here is the probe file:
# Probe annotation: one row per probe, giving its chromosome (chr) and
# position (pos); the row names hold the probe IDs.
probe <- structure(list(chr = c(6L, 12L, 16L, 1L, 13L, 17L, 16L, 13L, 3L, 17L, 20L, 8L, 12L, 17L, 8L, 6L, 15L, 16L, 16L, 16L, 6L, 1L, 7L, 18L, 2L, 8L, 16L, 10L, 11L, 12L, 1L, 15L, 1L, 11L, 13L, 13L, 6L, 6L, 9L, 12L, 1L, 12L, 13L, 13L, 6L, 1L, 2L, 3L, 11L, 22L, 15L, 11L, 19L, 19L, 1L, 6L, 10L, 3L, 4L, 17L, 10L, 8L, 6L, 2L, 8L, 16L, 1L, 2L, 16L, 9L, 6L, 19L, 10L, 4L, 4L, 17L, 11L, 4L, 1L, 1L, 5L, 3L, 12L, 16L, 7L, 11L, 4L, 6L, 19L, 14L, 17L, 1L, 4L, 7L, 11L, 5L, 5L, 2L, 2L, 8L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), pos = c(159064992L, 114367005L, 28835671L, 200003800L, 42692969L, 73780663L, 65236094L, 114057675L, 23713773L, 56326765L, 44142512L, 103668081L, 111806472L, 4437077L, 8871457L, 143771621L, 29993498L, 696801L, 79623625L, 69385761L, 30685686L, 76190435L, 14031049L, 3732002L, 32853151L, 146233339L, 71757240L, 131844944L, 128424176L, 89749142L, 27693242L, 57138252L, 43123399L, 57407842L, 29067224L, 53191387L, 30921630L, 107971593L, 125133314L, 109915400L, 46668882L, 14720858L, 67804654L, 23500367L, 170398571L, 150241781L, 85843232L, 15106710L, 33758223L, 44350860L, 83726483L, 76814245L, 3789435L, 55013663L, 166846008L, 150289488L, 3187835L, 169684620L, 1340602L, 35297146L, 61569177L, 122954569L, 71276472L, 9563665L, 9952926L, 81040735L, 15392793L, 55183957L, 27228679L, 139334396L, 44090748L, 3979938L, 125425262L, 10687769L, 503198L, 55191642L, 19735701L, 184244831L, 10738664L, 17446073L, 140739501L, 49384054L, 56618196L, 71324066L, 27221689L, 8041137L, 149033953L, 169224907L, 3933591L, 76450658L, 46152449L, 93250590L, 1025591L, 37024552L, 1360335L, 156277860L, 157098423L, 85980756L, 2575755L, 142138643L, 80531898L, 80531597L, 80531656L, 95664233L, 95664359L, 95664243L, 80531645L, 80531599L, 80531500L, 80531842L, 95663987L, 80531751L, 154334651L, 80531633L)), row.names = c("cg13598865", "cg02666265", "cg16662787", "cg10513702", "cg10970751", "cg08536977", "cg09084496", "cg08794696", "cg18648917", "cg20272962", "cg03013946", "cg07028608", 
"cg10361696", "cg06618629", "cg25307778", "cg00888489", "cg21092551", "cg07760369", "cg04317962", "cg08627125", "cg18512512", "cg13901901", "cg13524180", "cg18761756", "cg23633993", "cg07013148", "cg06190759", "cg14070745", "cg11552868", "cg26635451", "cg03201274", "cg25063425", "cg04482817", "cg05082527", "cg24850711", "cg25194273", "cg18964706", "cg01485362", "cg14154487", "cg22511293", "cg01431908", "cg20219035", "cg18855836", "cg06743703", "cg07489447", "cg16269716", "cg12737876", "cg00001245", "cg24871046", "cg07065008", "cg02104456", "cg13466901", "cg17880816", "cg23352067", "cg26870903", "cg12489846", "cg04144333", "cg02399652", "cg24269412", "cg03146993", "cg17307051", "cg20129534", "cg07968224", "cg07814910", "cg02192555", "cg07629951", "cg13322252", "cg18456312", "cg02871891", "cg07874283", "cg26371345", "cg07663404", "cg07036530", "cg17677988", "cg16619777", "cg25182165", "cg20686479", "cg04184793", "cg22513691", "cg17183414", "cg04246144", "cg05383531", "cg25245322", "cg02244933", "cg05516617", "cg11111132", "cg07760722", "cg05357093", "cg08248181", "cg00780666", "cg26932693", "cg14681854", "cg23853026", "cg08044454", "cg22317004", "cg05907764", "cg05482973", "cg03128635", "cg01968492", "cg03460049", "cg00465284", "cg00549910", "cg02856109", "cg03445516", "cg06816651", "cg09409539", "cg09482777", "cg11231249", "cg12078605", "cg21621248", "cg24871414", "cg26355577", "cg26649384", "cg27629977"), class = "data.frame")
Below was what I tried: extracted probes chromosome by chromosome, and region by region. Let's take the chr2 for example.
# Keep the chr2 probes whose position lies inside any of the chr2 regions
# (bounds inclusive). The result is assigned to `chr2`; the original
# `chr2 %>% probe` tried to call `probe` as a function and failed.
chr2 <- probe %>%
  subset(chr == 2) %>%
  subset(
    (pos >= 95663987 & pos <= 95664360) |
      (pos >= 80531500 & pos <= 80531899) |
      (pos >= 154334651 & pos <= 154334652)
  )
# Print the selected probes.
chr2
It worked well and showed the 14 probes that are located in these three regions. However, my real region file has many more regions within each chromosome, so it would be time-consuming to type every "start" and "end" number into the code. I would therefore like simpler code to extract the probes, at least chromosome by chromosome.
Below was what I tried:
# NOTE(review): this attempt does not test interval membership. `pos` (chr2
# probes only) is compared element-by-element with the full region$start and
# region$end vectors (all chromosomes, recycled to matching length), rather
# than each position being checked against every chr2 region.
chr2.df <- probe %>% subset(chr==2) %>% subset(pos >= region$start & pos <= region$end)
It showed no regions...
Can anyone help me with this: how can I extract the probes without typing the individual "start" and "end" numbers from the region file into the code?
Thank you so much.
If your goal is to identify probes that lie in each chromosomal region, then I think that this code will suffice:
library(magrittr)
# Carry the probe IDs (the row names of `probe`) along as an ordinary column,
# because tibbles do not keep row names.
pdf <- tibble::as_tibble(probe) %>%
  dplyr::mutate(probe = rownames(probe))

# Pair every region with all probes on the same chromosome, then keep only
# the pairs whose probe position lies inside the region. The bounds are
# inclusive so that a probe sitting exactly on a region's start or end is
# retained. Recent dplyr versions warn about the many-to-many match on `chr`;
# that fan-out is intended here.
region %>%
  tibble::as_tibble() %>%
  dplyr::left_join(pdf, by = "chr") %>%
  dplyr::filter(pos >= start, pos <= end)
I first load the package magrittr, which lets me use the "pipe" function, %>%. I then create a tibble (a data frame) with probe as a (new) column. This reflects my preference to not use rownames with data frames.
I then convert region to a tibble (a type of data frame) before piping it to the left_join function from dplyr package. This function "merges" or "joins" the two data frames by common values of "chr". Since there are repeated values of "chr" in both region and pdf, we get multiple lines with, for example, "chr" value of 2.
Lastly, I use the function filter from dplyr to choose only those rows that have a pos value between start and end.
I hope that this helps.
I've worked with the openair and hexbin packages to create two scatter plots with the help of the scatter plot function commands:
# One hexbin scatter plot per model, each against the observed values.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned.
scatterPlot(mydata, x = "Observed", y = "Model1", xlab = 10, ylab = 10,
            method = "hexbin", mod.line = TRUE, auto.text = FALSE,
            col = "jet", xbin = 30)
scatterPlot(mydata, x = "Observed", y = "Model2", xlab = 10, ylab = 10,
            method = "hexbin", mod.line = TRUE, auto.text = FALSE,
            col = "jet", xbin = 30)
I've got the two scatter plots, but I would like to put them into one plot with a single colour key for the counts, to get something similar to the image linked below. How should I proceed?
please refer to this link to view the image : https://ibb.co/rF148kp
You could reorganize your data frame so that it has three columns - "Observed", "Modeled", and "Model Type". Example -
# Example data in long format: one row per (observation, model) pair.
# It is bound to `mydata` so that the plotting call below can use it.
mydata <- data.frame(
  observed = rep(2L * (1:10), each = 2L),
  modelled = c(
    1L, 5L, 7L, 2L, 5L, 9L, 13L, 15L, 16L, 14L,
    18L, 17L, 10L, 21L, 26L, 24L, 22L, 28L, 27L, 30L
  ),
  # Rows alternate between the two models.
  model_type = factor(
    rep(c("Model 1", "Model 2"), times = 10L),
    levels = c("Model 1", "Model 2")
  )
)
This way, you can then use the following code -
# One panel per level of `model_type`, laid out side by side (2 columns,
# 1 row). TRUE/FALSE are spelled out because T and F can be reassigned.
scatterPlot(mydata, x = "observed", y = "modelled", type = "model_type",
            method = "hexbin", mod.line = TRUE, auto.text = FALSE,
            col = "jet", xbin = 5,
            linear = TRUE, layout = c(2, 1))
To create a plot containing the two scatter plots. Note, the above code sets xbin to 5 purely for the reason that I have used a small data set for testing purposes. Also, excuse the spelling error in the y-axis and code ("modelled" should be "modeled")!
I have the following data frame summary created with dplyr
# Frequency of each `maxrep` value per division, two rows (Premier, Second)
# per `maxrep`. It is bound to `df`, the name the plotting code uses, and
# built as a plain data frame: the pasted literal carried a "grouped_df"
# class without any grouping metadata.
df <- data.frame(
  maxrep = rep(c(7:24, 26L), each = 2L),
  div = factor(
    rep(c("Premier Division", "Second Division"), times = 19L),
    levels = c("Premier Division", "Second Division")
  ),
  freq = c(
    1L, 10L, 4L, 39L, 26L, 89L, 73L, 146L, 107L, 162L, 117L, 133L, 121L,
    125L, 116L, 91L, 110L, 65L, 95L, 43L, 75L, 38L, 43L, 24L, 38L, 16L,
    36L, 5L, 15L, 2L, 9L, 7L, 9L, 1L, 3L, 3L, 2L, 1L
  )
)
My intention is to use ggplot2 to plot line graphs of 2 lines with different colour with text labels for each value.
What I did was
# One line per division, with every value printed slightly above its point.
ggplot(data = df, mapping = aes(x = maxrep, y = freq, colour = div)) +
  geom_line() +
  geom_text(mapping = aes(label = freq), vjust = -0.5)
The result was
Now my question: All the labels in the chart are above the points in respective lines. I want to have the labels for the different colours to be in different relative position, e.g. labels for cyan above the line, and labels for red below the line (i.e. variable vjust). Is there a way to do that?
Also, is there a way to get rid of the letter a in the colour legend on the right?
What about plotting the labels for each line separately, with differing vjust values? You can get rid of the a in the legend by setting show_guide = FALSE (in ggplot2 2.0.0 and later this argument is called show.legend).
# Draw each division's labels in its own text layer so that each layer can
# have a different vertical offset: below the line for "Second Division"
# (vjust = 2) and above it for "Premier Division" (vjust = -2).
# show.legend = FALSE keeps the text layers out of the legend (this removes
# the letter "a" from the key); it replaces the deprecated `show_guide`.
ggplot(df, aes(x = maxrep, y = freq, colour = div, label = freq)) +
  geom_line() +
  geom_text(
    data = df[df$div == "Second Division", ],
    vjust = 2,
    show.legend = FALSE
  ) +
  geom_text(
    data = df[df$div == "Premier Division", ],
    vjust = -2,
    show.legend = FALSE
  )
Which returns:
Create a new variable in the data.frame holding the vjust adjustment parameter:
# Per-row vertical offset for the labels: -2 for "Premier Division" rows,
# 2 for every other row.
df$pos <- ifelse(df$div == "Premier Division", -2, 2)
And you could call vjust inside aes with the new pos vector:
# Map the per-row offset column `pos` to the vjust aesthetic so that each
# label takes its own vertical adjustment.
ggplot(data = df, mapping = aes(x = maxrep, y = freq, colour = div)) +
  geom_line() +
  geom_text(mapping = aes(label = freq, vjust = pos))
I want to do the looping for the following data. The output for a single iteration is a data.frame. My code is:
# Design parameters, one row per design: columns v, b, r, k and lambda.
Data <- data.frame(
  v = rep(
    c(15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 25L),
    times = c(4L, 3L, 2L, 1L, 3L, 2L, 2L, 2L, 2L)
  ),
  b = c(
    35L, 70L, 42L, 35L, 20L, 48L, 16L, 68L, 68L, 51L, 57L,
    57L, 57L, 95L, 76L, 70L, 21L, 77L, 77L, 100L, 30L
  ),
  r = c(
    7L, 14L, 14L, 14L, 5L, 15L, 6L, 16L, 20L, 17L, 9L,
    12L, 18L, 19L, 19L, 10L, 5L, 14L, 21L, 12L, 6L
  ),
  k = c(
    3L, 3L, 5L, 6L, 4L, 5L, 6L, 4L, 5L, 6L, 3L,
    4L, 6L, 4L, 5L, 3L, 5L, 4L, 6L, 3L, 5L
  ),
  lambda = c(
    1L, 2L, 4L, 5L, 1L, 4L, 2L, 3L, 5L, 5L, 1L,
    2L, 5L, 3L, 4L, 1L, 1L, 2L, 5L, 1L, 1L
  )
)
library(AlgDesign)
# Build one design per row of `Data`. For row i, optBlock() is asked for b
# blocks of size k over the v levels 1..v; its $Blocks component is turned
# into a data frame with rows 1..k and columns "Block 1" .. "Block b".
# The result list is preallocated, and seq_len() keeps the loop from running
# when `Data` has no rows (1:0 would iterate over 1 and 0).
BIB <- vector("list", nrow(Data))
for (i in seq_len(nrow(Data))) {
  v <- Data[i, "v"]
  b <- Data[i, "b"]
  k <- Data[i, "k"]
  BIB[[i]] <- data.frame(
    optBlock(~., withinData = factor(seq_len(v)), blocksize = rep(k, b))$Blocks
  )
  dimnames(BIB[[i]]) <- list(seq_len(k), paste("Block", seq_len(b), sep = " "))
}
BIB
Is there an easy way to accomplish the same task?
# Build one design per row of `Data` with lapply(), which returns the list
# directly instead of growing it inside a loop. For row i, optBlock() is
# asked for b blocks of size k over the v levels 1..v; its $Blocks component
# is turned into a data frame with rows 1..k and columns "Block_1" ..
# "Block_b". seq_len() makes the call a no-op when `Data` has no rows.
BIB <- lapply(seq_len(nrow(Data)), function(i) {
  v <- Data[i, "v"]
  b <- Data[i, "b"]
  k <- Data[i, "k"]
  design <- data.frame(
    optBlock(~., withinData = factor(seq_len(v)), blocksize = rep(k, b))$Blocks
  )
  dimnames(design) <- list(seq_len(k), paste("Block", seq_len(b), sep = "_"))
  design
})
print(BIB)