Related
I have a manhattan plot of genetic information:
It was generated using the qqman package https://cran.r-project.org/web/packages/qqman/vignettes/qqman.html) in R which takes a dataframe of P-values, chromosome position and a gene position (for any biologists reading, this is a per gene manhattan hence the sparsity of signal). The data looks like this (with an example dataset below:
SNP P CHR BP
ABC 1.1e-300 16 875849
AAS 1.2e-150 4 2343
JTL 4.2e-07 3 436544
LKS 4.1e-06 2 23565
JKSA 0.000432 1 98043
LKF 0.0032 22 387235
A20 0.0054 10 3252
AKLF 0.0235 4 4543543
structure(list(Gene = c("ABC1", "HGT2", "SLC34A3_ENSG00000198569",
"OR9K2_ENSG00000170605", "NFKB2_ENSG00000077150", "EFR3A_ENSG00000132294",
"SLC7A9_ENSG00000021488", "SEMG1_ENSG00000124233", "EWSR1_ENSG00000182944",
"ATP5PD_ENSG00000167863", "MAST3_ENSG00000099308", "KRT31_ENSG00000094796",
"FOXI1_ENSG00000168269", "CHCHD7_ENSG00000170791", "MAPK6_ENSG00000069956",
"SPRYD3_ENSG00000167778", "HOXB13_ENSG00000159184", "SLC12A9_ENSG00000146828",
"EXOC2_ENSG00000112685", "KCNJ15_ENSG00000157551", "SLC22A18_ENSG00000110628",
"ARID4A_ENSG00000032219", "SKP2_ENSG00000145604", "ZNF831_ENSG00000124203",
"ZNF275_ENSG00000063587", "SLC16A2_ENSG00000147100", "ADRB1_ENSG00000043591",
"DSCAM_ENSG00000171587", "PPM1H_ENSG00000111110", "IFNA14_ENSG00000228083",
"STX2_ENSG00000111450", "VPS54_ENSG00000143952", "ANXA7_ENSG00000138279",
"MAP3K12_ENSG00000139625", "MED13L_ENSG00000123066", "CHRM2_ENSG00000181072",
"RBP7_ENSG00000162444", "DRD1_ENSG00000184845", "CCDC121_ENSG00000176714",
"HMG20B_ENSG00000064961", "POU5F1B_ENSG00000212993", "SESN1_ENSG00000080546",
"DNASE1_ENSG00000213918", "FBXO24_ENSG00000106336", "RAG2_ENSG00000175097",
"UTS2_ENSG00000049247", "KMT2B_ENSG00000272333", "RBM33_ENSG00000184863",
"SNRPB2_ENSG00000125870", "FOXO4_ENSG00000184481", "NBPF3_ENSG00000142794",
"PPL_ENSG00000118898", "LYPD6B_ENSG00000150556", "POLD3_ENSG00000077514",
"PIK3CB_ENSG00000051382", "BCL2L12_ENSG00000126453", "CDC45_ENSG00000093009",
"DUXA_ENSG00000258873", "MCM3_ENSG00000112118", "CAPN3_ENSG00000092529",
"FMO4_ENSG00000076258", "B3GALT2_ENSG00000162630", "MICB_ENSG00000204516",
"CCL22_ENSG00000102962", "JKAMP_ENSG00000050130", "GSDME_ENSG00000105928",
"IZUMO4_ENSG00000099840", "NCKAP5L_ENSG00000167566", "ZRANB1_ENSG00000019995",
"TAL1_ENSG00000162367", "SLTM_ENSG00000137776", "SPC25_ENSG00000152253",
"GAP43_ENSG00000172020", "FGD3_ENSG00000127084", "PTCD3_ENSG00000132300",
"PAH_ENSG00000171759", "MMP8_ENSG00000118113", "RSBN1L_ENSG00000187257",
"AC026740.3_ENSG00000286094", "FAM189A2_ENSG00000135063", "TMEM245_ENSG00000106771",
"DDX50_ENSG00000107625", "SP140_ENSG00000079263", "C21orf91_ENSG00000154642",
"MEIKIN_ENSG00000239642", "TNFRSF8_ENSG00000120949", "RNF24_ENSG00000101236",
"CDK5_ENSG00000164885", "HINT1_ENSG00000169567", "TYRO3_ENSG00000092445",
"KRT75_ENSG00000170454", "RBM44_ENSG00000177483", "MYH8_ENSG00000133020",
"UBXN11_ENSG00000158062", "APOL3_ENSG00000128284", "NRXN3_ENSG00000021645",
"PRSS16_ENSG00000112812", "BST1_ENSG00000109743", "FAM49A_ENSG00000197872",
"SLC3A2_ENSG00000168003", "OR1C1_ENSG00000221888", "MYMK_ENSG00000187616",
"RASSF1_ENSG00000068028", "ARID5A_ENSG00000196843", "UAP1L1_ENSG00000197355",
"DPH2_ENSG00000132768", "G6PC_ENSG00000131482", "SH2B1_ENSG00000178188",
"RELL1_ENSG00000181826", "ABCC5_ENSG00000114770", "ZNF333_ENSG00000160961",
"NIF3L1_ENSG00000196290", "COMMD2_ENSG00000114744", "ZCCHC14_ENSG00000140948",
"P3H1_ENSG00000117385", "KRT14_ENSG00000186847", "SPG7_ENSG00000197912",
"ERCC6L_ENSG00000186871", "UPF1_ENSG00000005007", "FCGR3A_ENSG00000203747",
"SLC39A13_ENSG00000165915", "ACYP2_ENSG00000170634", "AL162596.1_ENSG00000285946",
"MEF2D_ENSG00000116604", "ATPAF1_ENSG00000123472", "DNAL4_ENSG00000100246",
"ADRA2A_ENSG00000150594", "ALDH3B2_ENSG00000132746", "L3MBTL3_ENSG00000198945",
"NR2E1_ENSG00000112333", "OTUD1_ENSG00000165312", "MCMDC2_ENSG00000178460",
"TXNL1_ENSG00000091164", "CES5A_ENSG00000159398", "CCL16_ENSG00000275152",
"ZBTB12_ENSG00000204366", "OGDHL_ENSG00000197444", "ARHGEF7_ENSG00000102606",
"RBM20_ENSG00000203867", "SELENOK_ENSG00000113811", "HBB_ENSG00000244734",
"WDR3_ENSG00000065183", "MAPKBP1_ENSG00000137802", "LTB4R2_ENSG00000213906",
"SLC25A15_ENSG00000102743", "ZBTB26_ENSG00000171448", "FDX2_ENSG00000267673",
"HSD3B7_ENSG00000099377", "RBFOX3_ENSG00000167281"), Pvalue = c(1.4e-300,
2.4e-150, 2.6089114579797e-07, 2.0296620694138e-06, 0.000147497259292417,
0.000229023886289315, 0.000245084674285079, 0.000256308708221289,
0.000261527824152563, 0.000288694716678695, 0.000290173032394758,
0.000320594572326915, 0.000346135729902497, 0.000355400110852,
0.000365256352980237, 0.000409731023356175, 0.000434204786603609,
0.000439775242591978, 0.000489192731765176, 0.000496753250110893,
0.00049911036273298, 0.000570787086811797, 0.000817460863988795,
0.000909350865229142, 0.000939159281654778, 0.00101875263711804,
0.00104161722087825, 0.00104642519111031, 0.0011025121215934,
0.00110797190460954, 0.00115516532029414, 0.00119237737210043,
0.00122886113380205, 0.00123316670384388, 0.00126924175390097,
0.00133083135434398, 0.00135900612361495, 0.00139601886941515,
0.00140034988031684, 0.00144667154281775, 0.00152488013161856,
0.00163920217629621, 0.00165121328565765, 0.00174281606991877,
0.00177541992540164, 0.00190567015024483, 0.00197012178338563,
0.00201154365191081, 0.00217761616500045, 0.00218849598206619,
0.00219107805420338, 0.00219952638949095, 0.0022100400174857,
0.00224988976742913, 0.00227842036080439, 0.00231351589815465,
0.00233840710255306, 0.00239368490047076, 0.00240800589782486,
0.00243072813003242, 0.00244930354205075, 0.00250643393459327,
0.00251262640919065, 0.00251308387281417, 0.00263512458389692,
0.00278748971622167, 0.00285692531240396, 0.00294631292976411,
0.0029855292366705, 0.00300042887433971, 0.00303321747691876,
0.00303431537337207, 0.00303655747990805, 0.00305247991142066,
0.00305779719421262, 0.0030773769185013, 0.00309595279588104,
0.00320602521859303, 0.00332374190234568, 0.00335845666631385,
0.00343476781423846, 0.00352132856036713, 0.0035370791144882,
0.00361921945446442, 0.00362829729460107, 0.00362925899436917,
0.00371857751928739, 0.00379170913533391, 0.00381786051662956,
0.00384603142808415, 0.0040621114920355, 0.00409131954647834,
0.00421076475281379, 0.00426968726537658, 0.00434706101829539,
0.00440972006588558, 0.00441860470852284, 0.00442578968523244,
0.00442716922579578, 0.00452215526426547, 0.00455658711791962,
0.00456768818316559, 0.00459525378983388, 0.00470562811526665,
0.00479427416502232, 0.00480697291736709, 0.00487609777383424,
0.00487626066774249, 0.0048982035968409, 0.00495106368869058,
0.00495974901689888, 0.0051182254688722, 0.00511868853158659,
0.00517459699358158, 0.0051863728177568, 0.0052533748441207,
0.0053048513357663, 0.00535144603215779, 0.00536294574878726,
0.00551084451782391, 0.00554884846488313, 0.0057184975334863,
0.00579274777888456, 0.00589230566622367, 0.00598698264647979,
0.00611781183554826, 0.00620691435617104, 0.00623285869674561,
0.00627192651777919, 0.00631120768525961, 0.00638288332792991,
0.00640000445930411, 0.00640676243762089, 0.00651734394089964,
0.0065624463096069, 0.00663922011120555, 0.00664879787639161,
0.00670461778135323, 0.00687266504207529, 0.00695679654393111,
0.00703352727799, 0.0070826001238915, 0.00709135444023445, 0.007142701991454,
0.00715597471729579, 0.00717318609326256, 0.00717726401691021,
0.00723420182380741, 0.00734437099984853), CHR = c(16L, 4L, 4L,
1L, 14L, 16L, 5L, 6L, 20L, 9L, 9L, 7L, 22L, 3L, 14L, 3L, 8L,
8L, 21L, 16L, 4L, 16L, 12L, 14L, 4L, 1L, 12L, 15L, 5L, 4L, 21L,
22L, 1L, 1L, 14L, 6L, 15L, 9L, 20L, 20L, 17L, 7L, 15L, 6L, 20L,
7L, 8L, 9L, 1L, 13L, 11L, 12L, 4L, 7L, 20L, 12L, 7L, 5L, 12L,
21L, 5L, 8L, 14L, 9L, 10L, 17L, 21L, 19L, 4L, 21L, 18L, 21L,
7L, 12L, 21L, 2L, 15L, 7L, 14L, 15L, 4L, 12L, 5L, 14L, 21L, 8L,
21L, 15L, 18L, 12L, 11L, 20L, 2L, 22L, 14L, 17L, 3L, 4L, 14L,
15L, 9L, 7L, 20L, 15L, 18L, 15L, 19L, 13L, 15L, 6L, 7L, 8L, 3L,
4L, 21L, 7L, 18L, 4L, 13L, 16L, 14L, 22L, 2L, 2L, 6L, 16L, 15L,
8L, 7L, 19L, 13L, 6L, 21L, 8L, 18L, 22L, 19L, 21L, 16L, 2L, 4L,
5L, 15L, 6L, 3L, 21L, 15L, 4L, 11L), POS = c(40665L, 197088L,
107291L, 210681L, 43546L, 79324L, 84342L, 184478L, 153093L, 180926L,
186110L, 117933L, 40682L, 54752L, 42758L, 61354L, 60378L, 157811L,
154466L, 126398L, 31037L, 115113L, 151914L, 10177L, 149587L,
79681L, 199754L, 129963L, 127032L, 175940L, 213708L, 51165L,
2584L, 166487L, 56259L, 130923L, 89219L, 170034L, 178967L, 102826L,
16982L, 188528L, 185007L, 6373L, 23298L, 199514L, 10429L, 58720L,
124518L, 210323L, 52212L, 186662L, 166963L, 58802L, 97157L, 14448L,
205795L, 70401L, 41824L, 93825L, 107954L, 207638L, 58648L, 64942L,
184005L, 19239L, 326L, 167713L, 106774L, 9145L, 174348L, 116079L,
38916L, 561L, 140433L, 123765L, 92497L, 187902L, 32027L, 63696L,
141286L, 67825L, 131698L, 120443L, 72621L, 165143L, 188862L,
52376L, 16769L, 77430L, 38655L, 145317L, 188469L, 113143L, 198322L,
26732L, 165043L, 25287L, 72392L, 12505L, 134208L, 126649L, 86308L,
199525L, 204348L, 103538L, 78610L, 176290L, 175950L, 73590L,
148494L, 151769L, 135252L, 141200L, 73351L, 45244L, 136493L,
33343L, 11165L, 915L, 80714L, 164700L, 142935L, 137224L, 554L,
92823L, 143083L, 166581L, 121459L, 19037L, 325L, 59959L, 155468L,
20896L, 33721L, 4468L, 113639L, 17103L, 184481L, 164337L, 174760L,
96405L, 207423L, 46590L, 168811L, 205743L, 74180L, 178456L, 126892L
)), row.names = c(NA, -149L), class = c("data.table", "data.frame"
), .internal.selfref = <pointer: 0x55a80de817a0>)
In reality there are around 20,000 lines for each gene in the human genome.
Using qqman, one uses:
manhttahn(gwas_data...)
To get the plot.
I would like the same plot but with the axis broken between 8-149 and then again from 149-300 so that the bottom part isn't all compressed. qqman is unable to do this.
I have tried modifying the script from this website: https://danielroelfs.com/blog/how-i-create-manhattan-plots-using-ggplot/
And my code looks like this:
table above: gwas_data
data_cum <- gwas_data %>%
group_by(CHR) %>%
summarise(max_bp = max(BP)) %>%
mutate(bp_add = lag(cumsum(max_bp), default = 0)) %>%
select(CHR, bp_add)
gwas_data <- gwas_data %>%
inner_join(data_cum, by = "CHR") %>%
mutate(bp_cum = bp + bp_add)
axis_set <- gwas_data %>%
group_by(CHR) %>%
summarize(center = mean(bp_cum))
ylim <- gwas_data %>%
filter(P == min(P)) %>%
mutate(ylim = abs(floor(log10(P))) + 2) %>%
pull(ylim)
sig <- 0.05/length(gwas_data$P) #this is a bonferroni correction
manhplot <- ggplot(gwas_data, aes(x = bp_cum, y = -log10(P),
color = as_factor(CHR), size = -log10(P))) +
geom_hline(yintercept = -log10(sig), color = "grey40", linetype = "dashed") +
geom_point(alpha = 0.75) +
scale_x_continuous(label = axis_set$chr, breaks = axis_set$center) +
scale_y_continuous(expand = c(0,0), limits = c(0, ylim)) +
scale_color_manual(values = rep(c("#276FBF", "#183059"), unique(length(axis_set$chr)))) +
scale_size_continuous(range = c(0.5,3)) +
labs(x = NULL,
y = "-log<sub>10</sub>(p)") +
theme_minimal() +
theme(
legend.position = "none",
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
axis.title.y = element_markdown(),
axis.text.x = element_text(angle = 60, size = 8, vjust = 0.5)
)
This gives me:
Which is wrong. However, if I try and then cut the axis using the ggbreak package with:
t <- manhplot +scale_y_cut(break=c(10,140))
t+ scale_y_cut(break=c(140,300))
Which gives me:
How would I sort the chromosome x-axis and the breaks out so it looks like the qqman plot but with the y-axis compressed?
Many thanks
I am trying to write a code that finds the 3 consecutives months that are the coldest.
For now I have written a code for the 3 first months (1,2,3) but then it should be applied to (4,5,6), (7,8,9), (10,11,12), (2,3,4), (5,6,7), (8,9,10), (11,12,1), (3,4,5), (6,7,8), (9,10,11) and (12,1,2) which are all the possible combinations of 3 consecutives months.
The code I wrote is here :
cold <- data_example %>%
group_by(Site) %>%
filter(Month %in% c(1,2,3)) %>%
mutate(mean_temperature = mean(t_q)) %>%
dplyr::select(-c(t_q,Month)) %>%
distinct(Site, mean_temperature)
average_temp_month_1_2_3 <- cold$mean_temperature
Then I replaced the c(1,2,3) by all possiblities, I have created a new column for each output.
I end up with a dataset with row corresponding to Site and columns are all the possibilities of 3 consecutive months.
After I took the min value for each row using the function apply() and min() and it gives me the coldest quarter for each Site.
I am looking for a way to generalize it, like creating a loop on the possiblities.
The structure of data_example is as follow :
structure(list(Site = c(4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 13L,
13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 14L, 14L,
14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 15L, 15L, 15L,
15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 16L, 16L, 16L, 16L,
16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 17L, 17L, 17L, 17L, 17L,
17L, 17L, 17L, 17L, 17L, 17L, 17L, 18L, 18L, 18L, 18L, 18L, 18L,
18L, 18L, 18L, 18L, 18L, 18L, 25L, 25L, 25L, 25L, 25L, 25L, 25L,
25L, 25L, 25L, 25L, 25L, 26L, 26L, 26L, 26L, 26L, 26L, 26L, 26L,
26L, 26L, 26L, 26L), Month = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L,
9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L,
12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L,
9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L,
12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 10L, 11L, 12L), t_q = c(9.67754848470332, -6.74555496540183,
5.67969761460384, 12.537207581471, -9.4899105618945, 21.0747672424502,
15.2643039243614, -3.62839910494421, 11.3919028351455, 1.69988257436554,
4.22015024307287, 11.7045830784212, 8.91437673833493, 0.579081429509138,
-10.8207481229903, 7.05356868592628, 13.0911580912516, 17.2032089167605,
-2.47642708849114, -11.2105599344486, 33.986736305027, 17.8578689773214,
-14.9114468266335, 14.4681380389141, 0.568074240873411, 7.65458408777801,
1.91368344556659, 6.01571556896127, 11.4858297513536, 2.2608458985328,
-2.08200762781776, 12.1540989284163, 20.9941815285413, 0.375777604316208,
-2.7137027317614, -6.17690210400591, 11.2549857164403, 17.447156776654,
-6.96565197389579, -5.41542361226991, 11.1680111873065, 16.2266522778922,
-11.4503938582433, 5.93300314835716, -18.2818398656237, 16.2930210946949,
9.80219192652316, -0.48237356523527, 7.72680942503686, 5.84113084181759,
9.66129413490096, -4.53018262186904, 7.42187509892118, 9.2559478576895,
8.25120948667013, 8.18182063263247, 16.3703081943971, 19.5469951420341,
3.71888263185773, -0.150179891749435, 1.32057298670562, -5.63556532224354,
21.3918542474341, 4.58752188336035, 5.49430262894033, 5.99587512047837,
-3.76459024109216, -8.53522098071824, 8.01805680562232, 26.2227490426066,
8.90822434139878, 5.04259034084471, 6.89740304247746, 11.9484584922927,
-11.5085102739471, 30.4526759119379, 21.878533782357, -5.39936677076962,
-9.83965056853816, 19.3083455159472, 7.90653548036154, 3.11876660277767,
-8.85027083180008, -9.9225496831988, 5.97307112581907, -2.83528336599284,
-2.75758002814396, 4.68388181004449, 6.61649031537118, -6.65988084338133,
-0.981075313384259, 5.84898952305179, -5.20962191660178, 0.416662319713158,
-10.5336993269853, 19.5350642296553, 26.9696625385792, 15.3291059661081,
15.0799591208354, 13.2310653499033, 7.2053382722482, -7.87288386491102,
20.8083797469715, 6.16664220270041, 8.3360949793043, -14.4000921795463,
-10.5503025782944, 14.3185205291177, 5.83802399796341, 2.49660818997943,
15.7399297014092, -0.834086173817971, 12.4883230222372, 6.73548467376379,
7.7988835803825, -5.13583355913738, 7.51054162811707, 11.6610602814336,
-11.8864185954223, 4.2704440943851)), row.names = c(NA, -120L
), groups = structure(list(Site = c(4L, 5L, 13L, 14L, 15L, 16L,
17L, 18L, 25L, 26L), .rows = structure(list(1:12, 13:24, 25:36,
37:48, 49:60, 61:72, 73:84, 85:96, 97:108, 109:120), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -10L), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
You can use raster::movingFun to do a moving average with circular data, then use slice_min to get the minimum value per group.
library(dplyr)
circ <- function(x, by) ifelse(x%%by == 0, by, x%%by)
df %>%
group_by(Site) %>%
mutate(rolmean = raster::movingFun(t_q, n = 3, fun = mean, circular = TRUE)) %>%
slice_min(rolmean) %>%
mutate(coldest = toString(circ(c(Month-1, Month, Month+1), 12)))
output
# A tibble: 10 × 5
# Groups: Site [10]
Site Month t_q rolmean coldest
<int> <int> <dbl> <dbl> <chr>
1 4 2 -6.75 2.87 1, 2, 3
2 5 3 -10.8 -1.06 2, 3, 4
3 13 11 -2.71 -2.84 10, 11, 12
4 14 8 5.93 -7.93 7, 8, 9
5 15 3 9.66 3.66 2, 3, 4
6 16 7 -3.76 -2.10 6, 7, 8
7 17 11 -8.85 -5.22 10, 11, 12
8 18 10 0.417 -5.11 9, 10, 11
9 25 10 -14.4 -5.54 9, 10, 11
10 26 12 4.27 -0.593 11, 12, 1
Using which.min in aggregate on a moving average window.
aggregate(t_q ~ Site, dat, \(s) {
win <- 3 ## window length
sq <- Map(seq, 1:(length(s) - win + 1), win:length(s))
toString(sq[[which.min(sapply(sq, \(sq) mean(s[sq])))]])
})
# Site t_q
# 1 4 1, 2, 3
# 2 5 2, 3, 4
# 3 13 10, 11, 12
# 4 14 7, 8, 9
# 5 15 2, 3, 4
# 6 16 6, 7, 8
# 7 17 10, 11, 12
# 8 18 9, 10, 11
# 9 25 9, 10, 11
# 10 26 10, 11, 12
My data was 450K (DNA methylation data). And the results below are from regional analysis. It contains three columns: the chromosome number, start position, and end position:
region <- structure(list(chr = c(2L, 2L, 2L, 3L, 4L, 5L, 5L, 5L, 6L, 6L, 7L, 8L, 10L, 11L, 12L, 15L, 16L, 18L, 18L, 21L, 22L), start = c(95663987L, 80531500L, 154334651L, 24536765L, 187476837L, 16179633L, 2751822L, 63461803L, 133562246L, 29521568L, 49813031L, 24772270L, 128593922L, 30038286L, 6649733L, 65913660L, 51184152L, 6414602L, 5543801L, 22370347L, 24890330L), end = c(95664360L, 80531899L, 154334652L, 24537302L, 187476838L, 16180267L, 2752602L, 63461931L, 133562777L, 29521715L, 49813487L, 24772351L, 128594418L, 30038311L, 6649995L, 65913661L, 51184887L, 6415253L, 5543946L, 22370759L, 24891142L)), class = "data.frame", row.names = c(4L, 12L, 15L, 14L, 20L,8L, 10L, 18L, 1L, 16L, 5L, 6L, 2L, 21L, 9L, 17L, 13L, 7L, 19L, 11L, 3L))
The distribution in my region is:
table(region$chr)
The first chromosome is chr2, chich contains four regions here.
Now I have another probe file, which contains probes with their chromosomes and positions. What I want to do is to extract the probes that are lociated in my target regions. Here is probe file:
probe <- structure(list(chr = c(6L, 12L, 16L, 1L, 13L, 17L, 16L, 13L, 3L, 17L, 20L, 8L, 12L, 17L, 8L, 6L, 15L, 16L, 16L, 16L, 6L, 1L, 7L, 18L, 2L, 8L, 16L, 10L, 11L, 12L, 1L, 15L, 1L, 11L, 13L, 13L, 6L, 6L, 9L, 12L, 1L, 12L, 13L, 13L, 6L, 1L, 2L, 3L, 11L, 22L, 15L, 11L, 19L, 19L, 1L, 6L, 10L, 3L, 4L, 17L, 10L, 8L, 6L, 2L, 8L, 16L, 1L, 2L, 16L, 9L, 6L, 19L, 10L, 4L, 4L, 17L, 11L, 4L, 1L, 1L, 5L, 3L, 12L, 16L, 7L, 11L, 4L, 6L, 19L, 14L, 17L, 1L, 4L, 7L, 11L, 5L, 5L, 2L, 2L, 8L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), pos = c(159064992L, 114367005L, 28835671L, 200003800L, 42692969L, 73780663L, 65236094L, 114057675L, 23713773L, 56326765L, 44142512L, 103668081L, 111806472L, 4437077L, 8871457L, 143771621L, 29993498L, 696801L, 79623625L, 69385761L, 30685686L, 76190435L, 14031049L, 3732002L, 32853151L, 146233339L, 71757240L, 131844944L, 128424176L, 89749142L, 27693242L, 57138252L, 43123399L, 57407842L, 29067224L, 53191387L, 30921630L, 107971593L, 125133314L, 109915400L, 46668882L, 14720858L, 67804654L, 23500367L, 170398571L, 150241781L, 85843232L, 15106710L, 33758223L, 44350860L, 83726483L, 76814245L, 3789435L, 55013663L, 166846008L, 150289488L, 3187835L, 169684620L, 1340602L, 35297146L, 61569177L, 122954569L, 71276472L, 9563665L, 9952926L, 81040735L, 15392793L, 55183957L, 27228679L, 139334396L, 44090748L, 3979938L, 125425262L, 10687769L, 503198L, 55191642L, 19735701L, 184244831L, 10738664L, 17446073L, 140739501L, 49384054L, 56618196L, 71324066L, 27221689L, 8041137L, 149033953L, 169224907L, 3933591L, 76450658L, 46152449L, 93250590L, 1025591L, 37024552L, 1360335L, 156277860L, 157098423L, 85980756L, 2575755L, 142138643L, 80531898L, 80531597L, 80531656L, 95664233L, 95664359L, 95664243L, 80531645L, 80531599L, 80531500L, 80531842L, 95663987L, 80531751L, 154334651L, 80531633L)), row.names = c("cg13598865", "cg02666265", "cg16662787", "cg10513702", "cg10970751", "cg08536977", "cg09084496", "cg08794696", "cg18648917", "cg20272962", "cg03013946", "cg07028608", "cg10361696", "cg06618629", "cg25307778", "cg00888489", "cg21092551", "cg07760369", "cg04317962", "cg08627125", "cg18512512", "cg13901901", "cg13524180", "cg18761756", "cg23633993", "cg07013148", "cg06190759", "cg14070745", "cg11552868", "cg26635451", "cg03201274", "cg25063425", "cg04482817", "cg05082527", "cg24850711", "cg25194273", "cg18964706", "cg01485362", "cg14154487", "cg22511293", "cg01431908", "cg20219035", "cg18855836", "cg06743703", "cg07489447", "cg16269716", "cg12737876", "cg00001245", "cg24871046", "cg07065008", "cg02104456", "cg13466901", "cg17880816", "cg23352067", "cg26870903", "cg12489846", "cg04144333", "cg02399652", "cg24269412", "cg03146993", "cg17307051", "cg20129534", "cg07968224", "cg07814910", "cg02192555", "cg07629951", "cg13322252", "cg18456312", "cg02871891", "cg07874283", "cg26371345", "cg07663404", "cg07036530", "cg17677988", "cg16619777", "cg25182165", "cg20686479", "cg04184793", "cg22513691", "cg17183414", "cg04246144", "cg05383531", "cg25245322", "cg02244933", "cg05516617", "cg11111132", "cg07760722", "cg05357093", "cg08248181", "cg00780666", "cg26932693", "cg14681854", "cg23853026", "cg08044454", "cg22317004", "cg05907764", "cg05482973", "cg03128635", "cg01968492", "cg03460049", "cg00465284", "cg00549910", "cg02856109", "cg03445516", "cg06816651", "cg09409539", "cg09482777", "cg11231249", "cg12078605", "cg21621248", "cg24871414", "cg26355577", "cg26649384", "cg27629977"), class = "data.frame")
Below was what I tried: extracted probes chromosome by chromosome, and region by region. Let's take the chr2 for example.
chr2 %>% probe %>% subset(chr==2) %>% subset(pos >= 95663987 & pos <= 95664360 | pos >= 80531500 & pos <= 80531899 | pos >= 154334651 & pos <= 154334652)
It worked well and showed 14 probes that are located in these four regions. However, my real region file have many more regions whitin each chromosome. It will be time comsuming to put all the "start" and "end" number in the code. So I want to have a easier code to extract the probes, at least chromosome by chromosome.
Below was what I tried:
chr2.df <- probe %>% subset(chr==2) %>% subset(pos >= region$start & pos <= region$end)
It showed no regions...
Can anyone help me with it - how to extract the probes not by using the detail "start" and "end" number in the region file?
Thank you so much.
If your goal is to identify probes that lie in each chromosomal region, then I think that this code will suffice:
library(magrittr)
pdf <- tibble::as_tibble(probe ) %>% dplyr::mutate(probe = rownames(probe))
region %>%
tibble::as_tibble() %>%
dplyr::left_join(pdf, by = "chr") %>%
dplyr::filter(pos < end, pos > start)
I first load the package magrittr, which lets me use the "pipe" function, %>%. I then create a tibble (a data frame) with probe as a (new) column. This reflects my preference to not use rownames with data frames.
I then convert region to a tibble (a type of data frame) before piping it to the left_join function from dplyr package. This function "merges" or "joins" the two data frames by common values of "chr". Since there are repeated values of "chr" in both region and pdf, we get multiple lines with, for example, "chr" value of 2.
Lastly, I use the function filter from dplyr to choose only those rows that have a pos value between start and end.
I hope that this helps.
Given a dataframe as follows:
structure(list(date = structure(1:24, .Label = c("2010Y1-01m",
"2010Y1-02m", "2010Y1-03m", "2010Y1-04m", "2010Y1-05m", "2010Y1-06m",
"2010Y1-07m", "2010Y1-08m", "2010Y1-09m", "2010Y1-10m", "2010Y1-11m",
"2010Y1-12m", "2011Y1-01m", "2011Y1-02m", "2011Y1-03m", "2011Y1-04m",
"2011Y1-05m", "2011Y1-06m", "2011Y1-07m", "2011Y1-08m", "2011Y1-09m",
"2011Y1-10m", "2011Y1-11m", "2011Y1-12m"), class = "factor"),
a = structure(c(1L, 18L, 19L, 20L, 22L, 23L, 2L, 4L, 5L,
7L, 8L, 10L, 1L, 21L, 3L, 6L, 9L, 11L, 12L, 13L, 14L, 15L,
16L, 17L), .Label = c("--", "10159.28", "10295.69", "10580.82",
"10995.65", "11245.84", "11327.23", "11621.99", "12046.63",
"12139.78", "12848.27", "13398.26", "13962.6", "14559.72",
"14982.58", "15518.64", "15949.87", "7363.45", "8237.71",
"8830.99", "9309.47", "9316.56", "9795.77"), class = "factor"),
b = structure(c(2L, 16L, 23L, 24L, 4L, 6L, 7L, 9L, 10L, 12L,
14L, 17L, 1L, 22L, 3L, 5L, 8L, 11L, 13L, 15L, 18L, 19L, 20L,
21L), .Label = c("-", "--", "1058.18", "1455.6", "1539.01",
"1867.07", "2036.92", "2102.23", "2372.84", "2693.96", "2769.65",
"2973.04", "3146.88", "3227.23", "3604.71", "365.07", "3678.01",
"4043.18", "4438.55", "4860.76", "5360.94", "555.51", "653.19",
"980.72"), class = "factor"), c = structure(c(2L, 6L, 10L,
11L, 13L, 15L, 16L, 18L, 20L, 22L, 24L, 7L, 1L, 9L, 12L,
14L, 17L, 19L, 21L, 23L, 3L, 4L, 5L, 8L), .Label = c("-",
"--", "1092.73", "1222.48", "1409.07", "158.18", "1748.44",
"2179.42", "227.68", "268.53", "331.81", "366.95", "434.19",
"486.41", "538.49", "606.62", "614.75", "651.46", "729.44",
"736.55", "836.46", "890.81", "929.72", "981.65"), class = "factor")), class = "data.frame", row.names = c(NA,
-24L))
How could I replace -- and - in only columns a and b with NA? Thanks.
You can use :
cols <- c('a', 'b')
df[cols][df[cols] == '--' | df[cols] == '-'] <- NA
Or using dplyr :
library(dplyr)
df %>% mutate(across(c(a, b), ~replace(., . %in% c('--', '-'), NA)))
I think it's better to try to avoid the data being read in like this in the first place, but if you need to correct it after, you can try using the na.strings argument in type.convert. Notice that it's na.strings with an "s" -- it's plural, so more than one value can be used to represent NA values.
df[c("a", "b")] <- lapply(df[c("a", "b")], type.convert, na.strings = c("--", "-"))
str(df)
# 'data.frame': 24 obs. of 4 variables:
# $ date: Factor w/ 24 levels "2010Y1-01m","2010Y1-02m",..: 1 2 3 4 5 6 7 8 9 10 ...
# $ a : num NA 7363 8238 8831 9317 ...
# $ b : num NA 365 653 981 1456 ...
# $ c : Factor w/ 24 levels "-","--","1092.73",..: 2 6 10 11 13 15 16 18 20 22 ...
head(df)
# date a b c
# 1 2010Y1-01m NA NA --
# 2 2010Y1-02m 7363.45 365.07 158.18
# 3 2010Y1-03m 8237.71 653.19 268.53
# 4 2010Y1-04m 8830.99 980.72 331.81
# 5 2010Y1-05m 9316.56 1455.60 434.19
# 6 2010Y1-06m 9795.77 1867.07 538.49
Note that in this particular case, you could also use the side effect of as.numeric(as.character(...)) converting anything that can't be coerced to numeric to NA, but keep in mind that you will get a warning for each column that you use this approach on.
lapply(df[c("a", "b")], function(x) as.numeric(as.character(x)))
I am trying to evaluate if different populations reach different asymptotes using NLS, in R. Here I have two data.frames df1 has only one population (Represented by Site)
df1<- structure(list(Site = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("ALT01",
"ALT02", "ALT03", "Cotton", "Deep", "Eckhardt", "Green", "Johnson",
"Kissinger", "Marsh", "Sand", "Shypoke", "Sora", "Spike", "Tamora",
"WRP01", "WRP05", "WRP08", "WRP10", "WRP11", "WRP12", "WRP14",
"WRP15", "WRP18"), class = "factor"), Nets = 1:18, Cumulative.spp = c(12L,
13L, 15L, 17L, 17L, 17L, 17L, 19L, 19L, 19L, 19L, 20L, 22L, 22L,
22L, 22L, 22L, 22L)), .Names = c("Site", "Nets", "Cumulative.spp"
), row.names = c(NA, 18L), class = "data.frame")
and df2 has to populations (Again represented by Site)
df2 <- structure(list(Site = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("ALT01",
"ALT02", "ALT03", "Cotton", "Deep", "Eckhardt", "Green", "Johnson",
"Kissinger", "Marsh", "Sand", "Shypoke", "Sora", "Spike", "Tamora",
"WRP01", "WRP05", "WRP08", "WRP10", "WRP11", "WRP12", "WRP14",
"WRP15", "WRP18"), class = "factor"), Nets = c(1L, 2L, 3L, 4L,
5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L,
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L,
15L, 16L, 17L, 18L), Cumulative.spp = c(12L, 13L, 15L, 17L, 17L,
17L, 17L, 19L, 19L, 19L, 19L, 20L, 22L, 22L, 22L, 22L, 22L, 22L,
7L, 10L, 11L, 12L, 13L, 14L, 14L, 14L, 15L, 15L, 16L, 16L, 16L,
16L, 16L, 17L, 17L, 17L)), .Names = c("Site", "Nets", "Cumulative.spp"
), row.names = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L,
12L, 13L, 14L, 15L, 16L, 17L, 18L, 25L, 26L, 27L, 28L, 29L, 30L,
31L, 32L, 33L, 34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L, 42L), class = "data.frame")
When I model for one population everything looks great:
Model1<-nls(Cumulative.spp ~ SSasympOff(Nets, A, lrc, c0), data = df1)
What I am trying to do is see if I can add several populations to the same model and add a Site Variable, I have tried this:
Model2<-nls(Cumulative.spp ~ SSasympOff(Nets, A, lrc, c0) + Site , data = df2)
and this:
Model2<-nls(Cumulative.spp ~ SSasympOff(Nets + Site , A, lrc, c0), data = df2)
But no luck so far, any help would be appreciated.
We assume that you want to have different Asym parameters for the two populations but common lrc and c0 parameters.
First in (1) we show how to modify the solution in the question to get the answer. Most of the code in (1) is just to get starting values but the actual fit is only one line of code -- two lines if you count the fact that we defined the formula in a separate line.
Then in (2) we show how to simplify (1) by using algorithm "plinear" eliminating the need to get starting values for the linear parameters. In (2a) we show a further simplification which extends more readily to more sites and in (2b) we simplify that further under the condition that all sites are present (which is not the case in the question but may be the case in the real data).
1) default algorithm We can get starting values in nls by fitting each population separately (fm1, fm2) and together (fm3). Finally fit the model with different Asym parameters (fm4).
# get starting values
fo <- Cumulative.spp ~ SSasympOff(Nets, A, lrc, c0)
fm1 <- nls(fo, df2, subset = Site == "ALT01")
fm2 <- nls(fo, df2, subset = Site == "ALT03")
fm3 <- nls(fo, df2)
st <- c(A1 = coef(fm1)[["A"]], A2 = coef(fm2)[["A"]], coef(fm3)[c("lrc", "c0")])
# fit
fo4 <- Cumulative.spp ~ SSasympOff(Nets, A1*(Site=="ALT01")+A2*(Site=="ALT03"), lrc, c0)
fm4 <- nls(fo4, data = df2, start = st)
plot(Cumulative.spp ~ Nets, df2, col = Site)
points(fitted(fm4) ~ Nets, df2, col = "red", pch = 20)
2) plinear Actually Asym is special since the model is linear in it and we can use this to simplify the above as we don't need starting values for the linear parameters if we switch to algorithm="plinear". This eliminates the need to run fm1 and fm2. We only need fm3 to generate starting values. Note that "plinear" requires that the RHS of the formula be a matrix with each column multiplying the coefficient of one linear parameter. Here we have two linear parameters (the Asym for each Site) so it is a two-column matrix.
# get starting values
fo <- Cumulative.spp ~ SSasympOff(Nets, A, lrc, c0)
fm3 <- nls(fo, df2)
st5 <- coef(fm3)[c("lrc", "c0")]
# fit
mm <- with(df2, cbind(Site=="ALT01", Site=="ALT03"))
fo5 <- Cumulative.spp ~ mm * SSasympOff(Nets,1,lrc,c0)
fm5 <- nls(fo5, data = df2, start = st5, algorithm = "plinear")
2a) mm could alternately be written like this which has the advantage that it extends to more sites:
mm <- model.matrix(~ Site - 1, transform(df2, Site = droplevels(Site)))
2b) If all levels of the Site factor are represented in the data then we could simplify even further as droplevels(Site) (which drops the unused levels) could then be simply Site allowing us to write:
mm <- model.matrix(~ Site - 1, df2)
Update: Some fixes and improvements.