Related
I have a manhattan plot of genetic information:
It was generated using the qqman package (https://cran.r-project.org/web/packages/qqman/vignettes/qqman.html) in R, which takes a data frame of P-values, chromosomes and gene positions (for any biologists reading, this is a per-gene Manhattan plot, hence the sparsity of signal). The data look like this (with an example dataset below):
SNP P CHR BP
ABC 1.1e-300 16 875849
AAS 1.2e-150 4 2343
JTL 4.2e-07 3 436544
LKS 4.1e-06 2 23565
JKSA 0.000432 1 98043
LKF 0.0032 22 387235
A20 0.0054 10 3252
AKLF 0.0235 4 4543543
structure(list(Gene = c("ABC1", "HGT2", "SLC34A3_ENSG00000198569",
"OR9K2_ENSG00000170605", "NFKB2_ENSG00000077150", "EFR3A_ENSG00000132294",
"SLC7A9_ENSG00000021488", "SEMG1_ENSG00000124233", "EWSR1_ENSG00000182944",
"ATP5PD_ENSG00000167863", "MAST3_ENSG00000099308", "KRT31_ENSG00000094796",
"FOXI1_ENSG00000168269", "CHCHD7_ENSG00000170791", "MAPK6_ENSG00000069956",
"SPRYD3_ENSG00000167778", "HOXB13_ENSG00000159184", "SLC12A9_ENSG00000146828",
"EXOC2_ENSG00000112685", "KCNJ15_ENSG00000157551", "SLC22A18_ENSG00000110628",
"ARID4A_ENSG00000032219", "SKP2_ENSG00000145604", "ZNF831_ENSG00000124203",
"ZNF275_ENSG00000063587", "SLC16A2_ENSG00000147100", "ADRB1_ENSG00000043591",
"DSCAM_ENSG00000171587", "PPM1H_ENSG00000111110", "IFNA14_ENSG00000228083",
"STX2_ENSG00000111450", "VPS54_ENSG00000143952", "ANXA7_ENSG00000138279",
"MAP3K12_ENSG00000139625", "MED13L_ENSG00000123066", "CHRM2_ENSG00000181072",
"RBP7_ENSG00000162444", "DRD1_ENSG00000184845", "CCDC121_ENSG00000176714",
"HMG20B_ENSG00000064961", "POU5F1B_ENSG00000212993", "SESN1_ENSG00000080546",
"DNASE1_ENSG00000213918", "FBXO24_ENSG00000106336", "RAG2_ENSG00000175097",
"UTS2_ENSG00000049247", "KMT2B_ENSG00000272333", "RBM33_ENSG00000184863",
"SNRPB2_ENSG00000125870", "FOXO4_ENSG00000184481", "NBPF3_ENSG00000142794",
"PPL_ENSG00000118898", "LYPD6B_ENSG00000150556", "POLD3_ENSG00000077514",
"PIK3CB_ENSG00000051382", "BCL2L12_ENSG00000126453", "CDC45_ENSG00000093009",
"DUXA_ENSG00000258873", "MCM3_ENSG00000112118", "CAPN3_ENSG00000092529",
"FMO4_ENSG00000076258", "B3GALT2_ENSG00000162630", "MICB_ENSG00000204516",
"CCL22_ENSG00000102962", "JKAMP_ENSG00000050130", "GSDME_ENSG00000105928",
"IZUMO4_ENSG00000099840", "NCKAP5L_ENSG00000167566", "ZRANB1_ENSG00000019995",
"TAL1_ENSG00000162367", "SLTM_ENSG00000137776", "SPC25_ENSG00000152253",
"GAP43_ENSG00000172020", "FGD3_ENSG00000127084", "PTCD3_ENSG00000132300",
"PAH_ENSG00000171759", "MMP8_ENSG00000118113", "RSBN1L_ENSG00000187257",
"AC026740.3_ENSG00000286094", "FAM189A2_ENSG00000135063", "TMEM245_ENSG00000106771",
"DDX50_ENSG00000107625", "SP140_ENSG00000079263", "C21orf91_ENSG00000154642",
"MEIKIN_ENSG00000239642", "TNFRSF8_ENSG00000120949", "RNF24_ENSG00000101236",
"CDK5_ENSG00000164885", "HINT1_ENSG00000169567", "TYRO3_ENSG00000092445",
"KRT75_ENSG00000170454", "RBM44_ENSG00000177483", "MYH8_ENSG00000133020",
"UBXN11_ENSG00000158062", "APOL3_ENSG00000128284", "NRXN3_ENSG00000021645",
"PRSS16_ENSG00000112812", "BST1_ENSG00000109743", "FAM49A_ENSG00000197872",
"SLC3A2_ENSG00000168003", "OR1C1_ENSG00000221888", "MYMK_ENSG00000187616",
"RASSF1_ENSG00000068028", "ARID5A_ENSG00000196843", "UAP1L1_ENSG00000197355",
"DPH2_ENSG00000132768", "G6PC_ENSG00000131482", "SH2B1_ENSG00000178188",
"RELL1_ENSG00000181826", "ABCC5_ENSG00000114770", "ZNF333_ENSG00000160961",
"NIF3L1_ENSG00000196290", "COMMD2_ENSG00000114744", "ZCCHC14_ENSG00000140948",
"P3H1_ENSG00000117385", "KRT14_ENSG00000186847", "SPG7_ENSG00000197912",
"ERCC6L_ENSG00000186871", "UPF1_ENSG00000005007", "FCGR3A_ENSG00000203747",
"SLC39A13_ENSG00000165915", "ACYP2_ENSG00000170634", "AL162596.1_ENSG00000285946",
"MEF2D_ENSG00000116604", "ATPAF1_ENSG00000123472", "DNAL4_ENSG00000100246",
"ADRA2A_ENSG00000150594", "ALDH3B2_ENSG00000132746", "L3MBTL3_ENSG00000198945",
"NR2E1_ENSG00000112333", "OTUD1_ENSG00000165312", "MCMDC2_ENSG00000178460",
"TXNL1_ENSG00000091164", "CES5A_ENSG00000159398", "CCL16_ENSG00000275152",
"ZBTB12_ENSG00000204366", "OGDHL_ENSG00000197444", "ARHGEF7_ENSG00000102606",
"RBM20_ENSG00000203867", "SELENOK_ENSG00000113811", "HBB_ENSG00000244734",
"WDR3_ENSG00000065183", "MAPKBP1_ENSG00000137802", "LTB4R2_ENSG00000213906",
"SLC25A15_ENSG00000102743", "ZBTB26_ENSG00000171448", "FDX2_ENSG00000267673",
"HSD3B7_ENSG00000099377", "RBFOX3_ENSG00000167281"), Pvalue = c(1.4e-300,
2.4e-150, 2.6089114579797e-07, 2.0296620694138e-06, 0.000147497259292417,
0.000229023886289315, 0.000245084674285079, 0.000256308708221289,
0.000261527824152563, 0.000288694716678695, 0.000290173032394758,
0.000320594572326915, 0.000346135729902497, 0.000355400110852,
0.000365256352980237, 0.000409731023356175, 0.000434204786603609,
0.000439775242591978, 0.000489192731765176, 0.000496753250110893,
0.00049911036273298, 0.000570787086811797, 0.000817460863988795,
0.000909350865229142, 0.000939159281654778, 0.00101875263711804,
0.00104161722087825, 0.00104642519111031, 0.0011025121215934,
0.00110797190460954, 0.00115516532029414, 0.00119237737210043,
0.00122886113380205, 0.00123316670384388, 0.00126924175390097,
0.00133083135434398, 0.00135900612361495, 0.00139601886941515,
0.00140034988031684, 0.00144667154281775, 0.00152488013161856,
0.00163920217629621, 0.00165121328565765, 0.00174281606991877,
0.00177541992540164, 0.00190567015024483, 0.00197012178338563,
0.00201154365191081, 0.00217761616500045, 0.00218849598206619,
0.00219107805420338, 0.00219952638949095, 0.0022100400174857,
0.00224988976742913, 0.00227842036080439, 0.00231351589815465,
0.00233840710255306, 0.00239368490047076, 0.00240800589782486,
0.00243072813003242, 0.00244930354205075, 0.00250643393459327,
0.00251262640919065, 0.00251308387281417, 0.00263512458389692,
0.00278748971622167, 0.00285692531240396, 0.00294631292976411,
0.0029855292366705, 0.00300042887433971, 0.00303321747691876,
0.00303431537337207, 0.00303655747990805, 0.00305247991142066,
0.00305779719421262, 0.0030773769185013, 0.00309595279588104,
0.00320602521859303, 0.00332374190234568, 0.00335845666631385,
0.00343476781423846, 0.00352132856036713, 0.0035370791144882,
0.00361921945446442, 0.00362829729460107, 0.00362925899436917,
0.00371857751928739, 0.00379170913533391, 0.00381786051662956,
0.00384603142808415, 0.0040621114920355, 0.00409131954647834,
0.00421076475281379, 0.00426968726537658, 0.00434706101829539,
0.00440972006588558, 0.00441860470852284, 0.00442578968523244,
0.00442716922579578, 0.00452215526426547, 0.00455658711791962,
0.00456768818316559, 0.00459525378983388, 0.00470562811526665,
0.00479427416502232, 0.00480697291736709, 0.00487609777383424,
0.00487626066774249, 0.0048982035968409, 0.00495106368869058,
0.00495974901689888, 0.0051182254688722, 0.00511868853158659,
0.00517459699358158, 0.0051863728177568, 0.0052533748441207,
0.0053048513357663, 0.00535144603215779, 0.00536294574878726,
0.00551084451782391, 0.00554884846488313, 0.0057184975334863,
0.00579274777888456, 0.00589230566622367, 0.00598698264647979,
0.00611781183554826, 0.00620691435617104, 0.00623285869674561,
0.00627192651777919, 0.00631120768525961, 0.00638288332792991,
0.00640000445930411, 0.00640676243762089, 0.00651734394089964,
0.0065624463096069, 0.00663922011120555, 0.00664879787639161,
0.00670461778135323, 0.00687266504207529, 0.00695679654393111,
0.00703352727799, 0.0070826001238915, 0.00709135444023445, 0.007142701991454,
0.00715597471729579, 0.00717318609326256, 0.00717726401691021,
0.00723420182380741, 0.00734437099984853), CHR = c(16L, 4L, 4L,
1L, 14L, 16L, 5L, 6L, 20L, 9L, 9L, 7L, 22L, 3L, 14L, 3L, 8L,
8L, 21L, 16L, 4L, 16L, 12L, 14L, 4L, 1L, 12L, 15L, 5L, 4L, 21L,
22L, 1L, 1L, 14L, 6L, 15L, 9L, 20L, 20L, 17L, 7L, 15L, 6L, 20L,
7L, 8L, 9L, 1L, 13L, 11L, 12L, 4L, 7L, 20L, 12L, 7L, 5L, 12L,
21L, 5L, 8L, 14L, 9L, 10L, 17L, 21L, 19L, 4L, 21L, 18L, 21L,
7L, 12L, 21L, 2L, 15L, 7L, 14L, 15L, 4L, 12L, 5L, 14L, 21L, 8L,
21L, 15L, 18L, 12L, 11L, 20L, 2L, 22L, 14L, 17L, 3L, 4L, 14L,
15L, 9L, 7L, 20L, 15L, 18L, 15L, 19L, 13L, 15L, 6L, 7L, 8L, 3L,
4L, 21L, 7L, 18L, 4L, 13L, 16L, 14L, 22L, 2L, 2L, 6L, 16L, 15L,
8L, 7L, 19L, 13L, 6L, 21L, 8L, 18L, 22L, 19L, 21L, 16L, 2L, 4L,
5L, 15L, 6L, 3L, 21L, 15L, 4L, 11L), POS = c(40665L, 197088L,
107291L, 210681L, 43546L, 79324L, 84342L, 184478L, 153093L, 180926L,
186110L, 117933L, 40682L, 54752L, 42758L, 61354L, 60378L, 157811L,
154466L, 126398L, 31037L, 115113L, 151914L, 10177L, 149587L,
79681L, 199754L, 129963L, 127032L, 175940L, 213708L, 51165L,
2584L, 166487L, 56259L, 130923L, 89219L, 170034L, 178967L, 102826L,
16982L, 188528L, 185007L, 6373L, 23298L, 199514L, 10429L, 58720L,
124518L, 210323L, 52212L, 186662L, 166963L, 58802L, 97157L, 14448L,
205795L, 70401L, 41824L, 93825L, 107954L, 207638L, 58648L, 64942L,
184005L, 19239L, 326L, 167713L, 106774L, 9145L, 174348L, 116079L,
38916L, 561L, 140433L, 123765L, 92497L, 187902L, 32027L, 63696L,
141286L, 67825L, 131698L, 120443L, 72621L, 165143L, 188862L,
52376L, 16769L, 77430L, 38655L, 145317L, 188469L, 113143L, 198322L,
26732L, 165043L, 25287L, 72392L, 12505L, 134208L, 126649L, 86308L,
199525L, 204348L, 103538L, 78610L, 176290L, 175950L, 73590L,
148494L, 151769L, 135252L, 141200L, 73351L, 45244L, 136493L,
33343L, 11165L, 915L, 80714L, 164700L, 142935L, 137224L, 554L,
92823L, 143083L, 166581L, 121459L, 19037L, 325L, 59959L, 155468L,
20896L, 33721L, 4468L, 113639L, 17103L, 184481L, 164337L, 174760L,
96405L, 207423L, 46590L, 168811L, 205743L, 74180L, 178456L, 126892L
)), row.names = c(NA, -149L), class = c("data.table", "data.frame"
), .internal.selfref = <pointer: 0x55a80de817a0>)
In reality there are around 20,000 lines for each gene in the human genome.
Using qqman, one uses:
# Draw the Manhattan plot with qqman; the column names match the table above
# (SNP, P, CHR, BP).
manhattan(gwas_data, chr = "CHR", bp = "BP", p = "P", snp = "SNP")
To get the plot.
I would like the same plot but with the axis broken between 8-149 and then again from 149-300 so that the bottom part isn't all compressed. qqman is unable to do this.
I have tried modifying the script from this website: https://danielroelfs.com/blog/how-i-create-manhattan-plots-using-ggplot/
And my code looks like this:
table above: gwas_data
# Build a Manhattan plot from `gwas_data` (columns SNP, P, CHR, BP).
# Uses dplyr verbs, ggplot2, forcats::as_factor() and
# ggtext::element_markdown().

# Per-chromosome offset: the summed maximum BP of all preceding chromosomes,
# so that chromosomes are laid out side by side on one continuous x axis.
# BP is converted to double first because cumsum() on integers overflows
# (returns NA) once the running total exceeds .Machine$integer.max.
data_cum <- gwas_data %>%
  group_by(CHR) %>%
  summarise(max_bp = max(as.numeric(BP))) %>%
  mutate(bp_add = lag(cumsum(max_bp), default = 0)) %>%
  select(CHR, bp_add)

# Cumulative x position of every row (column is `BP`, not `bp`).
gwas_data <- gwas_data %>%
  inner_join(data_cum, by = "CHR") %>%
  mutate(bp_cum = BP + bp_add)

# One axis tick per chromosome, placed at the mean cumulative position.
axis_set <- gwas_data %>%
  group_by(CHR) %>%
  summarise(center = mean(bp_cum))

# Upper y limit: magnitude of the smallest P-value's exponent plus headroom.
# Computed from min(P) directly so that ties still give a single number.
ylim <- abs(floor(log10(min(gwas_data$P)))) + 2

sig <- 0.05 / nrow(gwas_data) # this is a bonferroni correction

manhplot <- ggplot(
  gwas_data,
  aes(
    x = bp_cum, y = -log10(P),
    color = as_factor(CHR), size = -log10(P)
  )
) +
  geom_hline(
    yintercept = -log10(sig), color = "grey40", linetype = "dashed"
  ) +
  geom_point(alpha = 0.75) +
  # Label the ticks with the chromosome column (`CHR`, not `chr`).
  scale_x_continuous(labels = axis_set$CHR, breaks = axis_set$center) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, ylim)) +
  # Alternate the two colours, one entry per chromosome.
  scale_color_manual(
    values = rep(c("#276FBF", "#183059"), length.out = nrow(axis_set))
  ) +
  scale_size_continuous(range = c(0.5, 3)) +
  labs(
    x = NULL,
    y = "-log<sub>10</sub>(p)"
  ) +
  theme_minimal() +
  theme(
    legend.position = "none",
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    axis.title.y = element_markdown(),
    axis.text.x = element_text(angle = 60, size = 8, vjust = 0.5)
  )
This gives me:
This is wrong. However, if I then try to cut the axis using the ggbreak package with:
# Cut the y axis of the Manhattan plot with ggbreak. The argument is `breaks`;
# `break` is a reserved word in R and cannot be used as an argument name.
t <- manhplot + scale_y_cut(breaks = c(10, 140))
t + scale_y_cut(breaks = c(140, 300))
Which gives me:
How would I sort the chromosome x-axis and the breaks out so it looks like the qqman plot but with the y-axis compressed?
Many thanks
I need some help regarding transforming a geom_bar into a geom_area plot. This is my df:
dput(df)
df <- structure(list(new_day = c(-25L, 3L, 7L, -7L, 3L, 7L, -7L, 0L,
-25L, 3L, 7L, -7L, 0L, -25L, 3L, 7L, -7L, 0L, -25L, 3L, 7L, -7L,
0L, -25L, 3L, 7L, -7L, 0L, -25L, 3L, 7L, -7L, 0L, 3L, 7L, -7L,
0L, -25L, 3L, 7L, -7L, 0L, -25L, 3L, 7L, -7L, 0L, -25L, 3L, 7L,
-25L, 3L, 7L, -7L, 0L, 3L, -7L, 0L, -25L, 7L, 3L, 7L, -7L, 0L,
-25L, 3L, 7L, -7L, 0L, -25L, 3L, 7L, 3L, 7L, -7L, 0L, -25L, 3L,
7L, -7L, 0L, 7L, -25L, 3L, 7L, -7L, 0L, 3L, 7L, -25L, -25L, -25L,
-25L, -25L, -25L, -25L), order = structure(c(8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 11L, 11L, 11L, 11L, 11L, 13L, 13L, 13L, 13L,
13L, 10L, 10L, 10L, 10L, 10L, 7L, 7L, 7L, 7L, 7L, 2L, 2L, 2L,
2L, 2L, 7L, 7L, 7L, 7L, 9L, 9L, 9L, 9L, 9L, 1L, 1L, 1L, 1L, 1L,
9L, 9L, 9L, 2L, 2L, 2L, 2L, 2L, 4L, 4L, 4L, 13L, 13L, 14L, 14L,
14L, 14L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 13L, 13L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 2L, 2L, 2L, 2L, 2L, 6L, 6L, 1L, 7L, 5L, 2L,
12L, 2L, 2L), .Label = c("Alteromonadales", "Betaproteobacteriales",
"Caulobacterales", "Chitinophagales", "Flavobacteriales", "Parvibaculales",
"Pseudomonadales", "Rhizobiales", "Rhodobacterales", "Rhodospirillales",
"Sneathiellales", "Sphingobacteriales", "Sphingomonadales", "Thalassobaculales"
), class = "factor"), family = structure(c(13L, 13L, 13L, 13L,
12L, 12L, 12L, 12L, 15L, 15L, 15L, 15L, 15L, 16L, 16L, 16L, 16L,
16L, 7L, 7L, 7L, 7L, 7L, 11L, 11L, 11L, 11L, 11L, 1L, 1L, 1L,
1L, 1L, 11L, 11L, 11L, 11L, 14L, 14L, 14L, 14L, 14L, 4L, 4L,
4L, 4L, 4L, 14L, 14L, 14L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 16L,
16L, 17L, 17L, 17L, 17L, 8L, 8L, 8L, 8L, 8L, 5L, 5L, 5L, 16L,
16L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 8L, 8L, 8L, 8L,
8L, 10L, 10L, 6L, 11L, 3L, 1L, 9L, 1L, 1L), .Label = c("Burkholderiaceae",
"Chitinophagaceae", "Flavobacteriaceae", "Gallaecimonadaceae",
"Hyphomonadaceae", "Idiomarinaceae", "Magnetospiraceae", "Methylophilaceae",
"NS11-12_marine_group", "Parvibaculaceae", "Pseudomonadaceae",
"Rhizobiaceae", "Rhizobiales_unclassified", "Rhodobacteraceae",
"Sneathiellaceae", "Sphingomonadaceae", "Thalassobaculaceae"), class = "factor"),
genus = structure(c(16L, 16L, 16L, 16L, 7L, 7L, 7L, 7L, 3L,
3L, 3L, 3L, 3L, 19L, 19L, 19L, 19L, 19L, 24L, 24L, 24L, 24L,
24L, 14L, 14L, 14L, 14L, 14L, 17L, 17L, 17L, 17L, 17L, 14L,
14L, 14L, 14L, 15L, 15L, 15L, 15L, 15L, 5L, 5L, 5L, 5L, 5L,
10L, 10L, 10L, 2L, 2L, 2L, 2L, 2L, 22L, 22L, 22L, 20L, 20L,
23L, 23L, 23L, 23L, 11L, 11L, 11L, 11L, 11L, 8L, 8L, 8L,
21L, 21L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 11L, 11L,
11L, 11L, 11L, 13L, 13L, 9L, 14L, 4L, 6L, 12L, 1L, 18L), .Label = c("Burkholderiaceae_unclassified",
"Cupriavidus", "Ferrovibrio", "Flavobacteriaceae_unclassified",
"Gallaecimonas", "GKS98_freshwater_group", "Hoeflea", "Hyphomonas",
"Idiomarina", "Marivivens", "Methylotenera", "NS11-12_marine_group_ge",
"Parvibaculum", "Pseudomonas", "Pseudorhodobacter", "Rhizobiales_unclassified",
"Rhodoferax", "RS62_marine_group", "Sphingomonadaceae_unclassified",
"Sphingopyxis", "Sphingorhabdus", "Terrimonas", "Thalassobaculum",
"uncultured"), class = "factor"), Abundance = c(0.758296593899054,
0.728046713738242, 0.421798852637834, 0.185971692147469,
7.36584152568739, 11.0004160226707, 1.93134577450352, 19.7144376530921,
46.2350237547082, 25.8715062086956, 22.1549641486618, 34.4112477828867,
20.4937613394223, 3.73518219692229, 15.9295990367068, 13.8490383262387,
13.3481723220855, 20.3866145291388, 0.165618346100574, 8.86991024549668,
8.5330814375361, 6.86819004205197, 5.72129192186814, 1.04512973253723,
3.77880217461655, 6.47871112880127, 1.12084852451492, 0.903754246093232,
19.0854333497858, 15.7152146349298, 12.3768753373503, 15.8790763239117,
10.2875187327705, 2.82159106304821, 4.22393981370602, 8.82452898193968,
4.8507226701533, 6.19619716749583, 8.28477594908417, 8.05201189383953,
9.7404731686272, 9.84535225459449, 1.7940554465653, 2.62276259756813,
2.74008811315788, 0.543937440677315, 0.55325167765205, 0.910457573040239,
0.451385497886567, 0.655661306732001, 6.59400178917785, 1.92570846362683,
2.62192443054515, 2.10049053655497, 2.13139299576524, 0.20799245164738,
0.324291631088576, 0.369492771993701, 1.52162438803598, 0.151864202275619,
0.420953084533189, 0.391517677365401, 0.29116200940885, 0.232440441774702,
4.21428798609281, 0.859779996836882, 1.33107018783728, 1.013155122065,
0.447286602320585, 0.165001492967355, 0.285983094976304,
0.377758692391269, 0.21556919104275, 0.314057858254493, 0.354649793637887,
0.338799824269294, 0.218027624939685, 0.914324162324944,
1.22932824654674, 0.731649603629864, 0.566393265064962, 0.247942012186621,
1.73171328618728, 0.636597714441988, 0.505393049999761, 0.491318560043637,
0.990988961717433, 0.195417142399681, 0.210412739808352,
0.476107780140271, 0.936663899397428, 0.251540964619117,
0.963667386912928, 0.504905545701818, 0.296220086916766,
0.240809811677774)), class = "data.frame", row.names = c(52L,
68L, 72L, 93L, 165L, 169L, 190L, 194L, 246L, 262L, 266L, 287L,
291L, 343L, 359L, 363L, 384L, 388L, 440L, 456L, 460L, 481L, 485L,
634L, 650L, 654L, 675L, 679L, 731L, 747L, 751L, 772L, 776L, 844L,
848L, 869L, 873L, 925L, 941L, 945L, 966L, 970L, 1022L, 1038L,
1042L, 1063L, 1067L, 1216L, 1232L, 1236L, 1313L, 1329L, 1333L,
1354L, 1358L, 1426L, 1451L, 1455L, 1507L, 1527L, 1717L, 1721L,
1742L, 1746L, 2186L, 2202L, 2206L, 2227L, 2231L, 2380L, 2396L,
2400L, 3075L, 3079L, 3294L, 3298L, 3350L, 3366L, 3370L, 3391L,
3395L, 3467L, 4223L, 4239L, 4243L, 4264L, 4268L, 4433L, 4437L,
4708L, 4805L, 4902L, 5193L, 5969L, 7909L, 8006L))
and this is the structure:
> str(df)
'data.frame': 96 obs. of 5 variables:
$ new_day : int -25 3 7 -7 3 7 -7 0 -25 3 ...
$ order : Factor w/ 14 levels "Alteromonadales",..: 8 8 8 8 8 8 8 8 11 11 ...
$ family : Factor w/ 17 levels "Burkholderiaceae",..: 13 13 13 13 12 12 12 12 15 15 ...
$ genus : Factor w/ 24 levels "Burkholderiaceae_unclassified",..: 16 16 16 16 7 7 7 7 3 3 ...
$ Abundance: num 0.758 0.728 0.422 0.186 7.366 ...
my data is about relative abundances of species over time, I removed rare species so it doesn't add up to 100 % anymore,
but that is fine, it is about 98 % per date. However, I get these weird free polygons and triangles which I recognize from incorrect grouping etc., but the group parameter did not change anything here. I also tried several position and stat arguments, which did not help. Maybe it is about the order of factors or something?
What I'm looking for is a stacked plot of the abundances of cumulated orders without empty spaces in between etc. Create proportional geom_area plot directly in ggplot2
# Stacked area chart of Abundance against new_day, filled by taxonomic order;
# a dashed vertical line is drawn at new_day = 0.
ggplot(data = df) +
  aes(x = new_day, y = Abundance, fill = order) +
  geom_area(stat = "identity") +
  geom_vline(mapping = aes(xintercept = 0), size = 1.2, linetype = "dashed")
I get fewer weird shapes when going to a more detailed hierarchical level (genus instead of order)
# Same stacked area chart, filled by genus instead of order;
# a dashed vertical line is drawn at new_day = 0.
ggplot(data = df) +
  aes(x = new_day, y = Abundance, fill = genus) +
  geom_area(position = "stack", stat = "identity") +
  geom_vline(mapping = aes(xintercept = 0), size = 1.2, linetype = "dashed")
but there are still more blank areas than there should be, given the sum of abundances for a given time
# total abundance per day
sum(subset(df, new_day == -25)$Abundance)
[1] 98.03997
Any suggestions on how to fix this?
The problem is that you sometimes have several abundance values for one new_day, even with more detailed hierarchical levels.
This is what creates discontinuities in the area plot. You need to have only one unique value for each new_day. In my example below, I just take the first abundance value after grouping by new_day and order, but that is probably not what you want to show. (You may want to take the mean, or attribute these values to other new_day points in between, whatever you need.)
The remaining little gaps are caused by the missing abundance values, since as you said, it does not add up to 100%. This is not a big deal, but you can probably fix it by replacing the missing values by 0.
EDIT : Now doing the sum of abundance values as you mentioned, and removing the small remaining gaps by replacing missing values by 0.
library(tidyverse)
# Collapse to one abundance value per (new_day, order), then make every
# (new_day, order) combination explicit so each order has a value at every
# time point.
data <- df %>%
  # Sum abundance values, to only keep one per point
  group_by(new_day, order) %>%
  summarise(abundance = sum(Abundance)) %>%
  ungroup() %>%
  # Replace missing values by 0: widen to one column per order (filling the
  # absent combinations with 0), then return to long format
  pivot_wider(
    names_from = order,
    values_from = abundance,
    values_fill = list(abundance = 0)
  ) %>%
  pivot_longer(
    cols = -new_day,
    names_to = "order",
    values_to = "abundance"
  )

ggplot(data, aes(x = new_day, y = abundance, fill = order)) +
  geom_area(stat = "identity") +
  # Constant intercept set outside aes() so a single line is drawn instead of
  # one per data row
  geom_vline(xintercept = 0, linetype = "dashed", size = 1.2)
Using arules, I have two sets of itemsets, and I want to subtract the support values of itemsets that appear in both sets.
> inspect(fsets_model_test)
items support count
[1] {SURFSKINTEMP=6,MODIS_LST=1} 0.01235235 663
[2] {TOTCO=13,MODIS_LST=1} 0.01373104 737
[3] {TOTCO=6,MODIS_LST=1} 0.01393598 748
[4] {TOTO3=15,MODIS_LST=1} 0.01265045 679
[5] {TOTH2OVAP=6,MODIS_LST=1} 0.01548236 831
[6] {TOTH2OVAP=1,MODIS_LST=1} 0.01565004 840
> inspect(fsets_nonsesmic_test)
items support count
[1] {TOTCO=6,MODIS_LST=1} 0.02192761 10013
[2] {TOTCO=13,MODIS_LST=1} 0.02261524 10327
[3] {TOTO3=15,MODIS_LST=1} 0.02432556 11108
[4] {SURFAIRTEMP=3,TOTH2OVAP=1,MODIS_LST=1} 0.01772735 8095
[5] {TOTH2OVAP=1,MODIS_LST=1} 0.02873605 13122
[6] {SURFAIRTEMP=3,TOTH2OVAP=1} 0.01856828 8479
You can see that fsets_model_test and fsets_nonsesmic_test share some itemsets, for example {TOTO3=15,MODIS_LST=1}.
What I want to do is subtract the support of matching itemsets; in the above case that is
0.02432556 - 0.01265045 = 0.01167511, and then get a new set of itemsets.
How can I implement this in arules? Thanks.
following are the example itemsets
one itemsets
fsets_model_test <- new("itemsets"
, items = new("itemMatrix"
, data = new("ngCMatrix"
, i = c(5L, 121L, 74L, 121L, 67L, 121L, 59L, 121L, 33L, 121L, 28L,
121L)
, p = c(0L, 2L, 4L, 6L, 8L, 10L, 12L)
, Dim = c(125L, 6L)
, Dimnames = list(NULL, NULL)
, factors = list()
)
, itemInfo = structure(list(labels = c("SURFSKINTEMP=1", "SURFSKINTEMP=2",
"SURFSKINTEMP=3", "SURFSKINTEMP=4", "SURFSKINTEMP=5", "SURFSKINTEMP=6",
"SURFSKINTEMP=7", "SURFSKINTEMP=8", "SURFSKINTEMP=9", "SURFSKINTEMP=10",
"SURFSKINTEMP=11", "SURFSKINTEMP=12", "SURFSKINTEMP=13", "SURFSKINTEMP=14",
"SURFSKINTEMP=15", "SURFSKINTEMP=16", "SURFAIRTEMP=1", "SURFAIRTEMP=2",
"SURFAIRTEMP=3", "SURFAIRTEMP=4", "SURFAIRTEMP=5", "SURFAIRTEMP=6",
"SURFAIRTEMP=7", "SURFAIRTEMP=8", "SURFAIRTEMP=9", "SURFAIRTEMP=10",
"SURFAIRTEMP=11", "SURFAIRTEMP=12", "TOTH2OVAP=1", "TOTH2OVAP=2",
"TOTH2OVAP=3", "TOTH2OVAP=4", "TOTH2OVAP=5", "TOTH2OVAP=6", "TOTH2OVAP=7",
"TOTH2OVAP=8", "TOTH2OVAP=9", "TOTH2OVAP=10", "TOTH2OVAP=11",
"TOTH2OVAP=12", "TOTH2OVAP=13", "TOTH2OVAP=14", "TOTH2OVAP=15",
"TOTH2OVAP=16", "TOTH2OVAP=17", "TOTO3=1", "TOTO3=2", "TOTO3=3",
"TOTO3=4", "TOTO3=5", "TOTO3=6", "TOTO3=7", "TOTO3=8", "TOTO3=9",
"TOTO3=10", "TOTO3=11", "TOTO3=12", "TOTO3=13", "TOTO3=14", "TOTO3=15",
"TOTO3=16", "TOTO3=17", "TOTCO=1", "TOTCO=2", "TOTCO=3", "TOTCO=4",
"TOTCO=5", "TOTCO=6", "TOTCO=7", "TOTCO=8", "TOTCO=9", "TOTCO=10",
"TOTCO=11", "TOTCO=12", "TOTCO=13", "TOTCO=14", "TOTCO=15", "TOTCH4=1",
"TOTCH4=2", "TOTCH4=3", "TOTCH4=4", "TOTCH4=5", "TOTCH4=6", "TOTCH4=7",
"TOTCH4=8", "TOTCH4=9", "TOTCH4=10", "TOTCH4=11", "TOTCH4=12",
"TOTCH4=13", "TOTCH4=14", "OLR_ARIS=1", "OLR_ARIS=2", "OLR_ARIS=3",
"OLR_ARIS=4", "OLR_ARIS=5", "OLR_ARIS=6", "OLR_ARIS=7", "OLR_ARIS=8",
"OLR_ARIS=9", "OLR_ARIS=10", "CLROLR_ARIS=1", "CLROLR_ARIS=2",
"CLROLR_ARIS=3", "CLROLR_ARIS=4", "CLROLR_ARIS=5", "CLROLR_ARIS=6",
"CLROLR_ARIS=7", "CLROLR_ARIS=8", "CLROLR_ARIS=9", "CLROLR_ARIS=10",
"OLR_NOAA=1", "OLR_NOAA=2", "OLR_NOAA=3", "OLR_NOAA=4", "OLR_NOAA=5",
"OLR_NOAA=6", "OLR_NOAA=7", "OLR_NOAA=8", "OLR_NOAA=9", "OLR_NOAA=10",
"MODIS_LST=1", "MODIS_LST=2", "MODIS_LST=3", "MODIS_LST=4"),
variables = structure(c(6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L,
9L, 9L, 9L, 9L, 9L, 9L, 10L, 10L, 10L, 10L, 10L, 10L, 10L,
10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 2L, 2L, 2L,
2L), .Label = c("CLROLR_ARIS", "MODIS_LST", "OLR_ARIS", "OLR_NOAA",
"SURFAIRTEMP", "SURFSKINTEMP", "TOTCH4", "TOTCO", "TOTH2OVAP",
"TOTO3"), class = "factor"), levels = structure(c(1L, 10L,
11L, 12L, 13L, 14L, 15L, 16L, 17L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 1L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 2L, 3L, 4L,
1L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 1L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 1L, 10L, 11L, 12L, 13L, 14L,
15L, 16L, 17L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 10L, 11L, 12L,
13L, 14L, 15L, 16L, 17L, 2L, 3L, 4L, 5L, 6L, 1L, 10L, 11L,
12L, 13L, 14L, 15L, 16L, 17L, 2L, 1L, 10L, 11L, 12L, 13L,
14L, 15L, 16L, 17L, 2L, 1L, 10L, 11L, 12L, 13L, 14L, 15L,
16L, 17L, 2L, 1L, 10L, 11L, 12L), .Label = c("1", "10", "11",
"12", "13", "14", "15", "16", "17", "2", "3", "4", "5", "6",
"7", "8", "9"), class = "factor")), .Names = c("labels",
"variables", "levels"), row.names = c(NA, -125L), class = "data.frame")
, itemsetInfo = structure(list(), .Names = character(0), row.names = integer(0), class = "data.frame")
)
, tidLists = NULL
, quality = structure(list(support = c(0.0123523493684093, 0.0137310429630734,
0.0139359839028207, 0.0126504452807691, 0.0154823564481872, 0.0156500353988896
), count = c(663, 737, 748, 679, 831, 840)), .Names = c("support",
"count"), row.names = c(NA, 6L), class = "data.frame")
, info = structure(list(data = model_data_tr, ntransactions = 53674L,
support = 0.01), .Names = c("data", "ntransactions", "support"
))
)
another itemsets is:
fsets_nonsesmic_test <- new("itemsets"
, items = new("itemMatrix"
, data = new("ngCMatrix"
, i = c(67L, 121L, 74L, 121L, 59L, 121L, 18L, 28L, 121L, 28L, 121L,
18L, 28L)
, p = c(0L, 2L, 4L, 6L, 9L, 11L, 13L)
, Dim = c(125L, 6L)
, Dimnames = list(NULL, NULL)
, factors = list()
)
, itemInfo = structure(list(labels = c("SURFSKINTEMP=1", "SURFSKINTEMP=2",
"SURFSKINTEMP=3", "SURFSKINTEMP=4", "SURFSKINTEMP=5", "SURFSKINTEMP=6",
"SURFSKINTEMP=7", "SURFSKINTEMP=8", "SURFSKINTEMP=9", "SURFSKINTEMP=10",
"SURFSKINTEMP=11", "SURFSKINTEMP=12", "SURFSKINTEMP=13", "SURFSKINTEMP=14",
"SURFSKINTEMP=15", "SURFSKINTEMP=16", "SURFAIRTEMP=1", "SURFAIRTEMP=2",
"SURFAIRTEMP=3", "SURFAIRTEMP=4", "SURFAIRTEMP=5", "SURFAIRTEMP=6",
"SURFAIRTEMP=7", "SURFAIRTEMP=8", "SURFAIRTEMP=9", "SURFAIRTEMP=10",
"SURFAIRTEMP=11", "SURFAIRTEMP=12", "TOTH2OVAP=1", "TOTH2OVAP=2",
"TOTH2OVAP=3", "TOTH2OVAP=4", "TOTH2OVAP=5", "TOTH2OVAP=6", "TOTH2OVAP=7",
"TOTH2OVAP=8", "TOTH2OVAP=9", "TOTH2OVAP=10", "TOTH2OVAP=11",
"TOTH2OVAP=12", "TOTH2OVAP=13", "TOTH2OVAP=14", "TOTH2OVAP=15",
"TOTH2OVAP=16", "TOTH2OVAP=17", "TOTO3=1", "TOTO3=2", "TOTO3=3",
"TOTO3=4", "TOTO3=5", "TOTO3=6", "TOTO3=7", "TOTO3=8", "TOTO3=9",
"TOTO3=10", "TOTO3=11", "TOTO3=12", "TOTO3=13", "TOTO3=14", "TOTO3=15",
"TOTO3=16", "TOTO3=17", "TOTCO=1", "TOTCO=2", "TOTCO=3", "TOTCO=4",
"TOTCO=5", "TOTCO=6", "TOTCO=7", "TOTCO=8", "TOTCO=9", "TOTCO=10",
"TOTCO=11", "TOTCO=12", "TOTCO=13", "TOTCO=14", "TOTCO=15", "TOTCH4=1",
"TOTCH4=2", "TOTCH4=3", "TOTCH4=4", "TOTCH4=5", "TOTCH4=6", "TOTCH4=7",
"TOTCH4=8", "TOTCH4=9", "TOTCH4=10", "TOTCH4=11", "TOTCH4=12",
"TOTCH4=13", "TOTCH4=14", "OLR_ARIS=1", "OLR_ARIS=2", "OLR_ARIS=3",
"OLR_ARIS=4", "OLR_ARIS=5", "OLR_ARIS=6", "OLR_ARIS=7", "OLR_ARIS=8",
"OLR_ARIS=9", "OLR_ARIS=10", "CLROLR_ARIS=1", "CLROLR_ARIS=2",
"CLROLR_ARIS=3", "CLROLR_ARIS=4", "CLROLR_ARIS=5", "CLROLR_ARIS=6",
"CLROLR_ARIS=7", "CLROLR_ARIS=8", "CLROLR_ARIS=9", "CLROLR_ARIS=10",
"OLR_NOAA=1", "OLR_NOAA=2", "OLR_NOAA=3", "OLR_NOAA=4", "OLR_NOAA=5",
"OLR_NOAA=6", "OLR_NOAA=7", "OLR_NOAA=8", "OLR_NOAA=9", "OLR_NOAA=10",
"MODIS_LST=1", "MODIS_LST=2", "MODIS_LST=3", "MODIS_LST=4"),
variables = structure(c(6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L,
9L, 9L, 9L, 9L, 9L, 9L, 10L, 10L, 10L, 10L, 10L, 10L, 10L,
10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 2L, 2L, 2L,
2L), .Label = c("CLROLR_ARIS", "MODIS_LST", "OLR_ARIS", "OLR_NOAA",
"SURFAIRTEMP", "SURFSKINTEMP", "TOTCH4", "TOTCO", "TOTH2OVAP",
"TOTO3"), class = "factor"), levels = structure(c(1L, 10L,
11L, 12L, 13L, 14L, 15L, 16L, 17L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 1L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 2L, 3L, 4L,
1L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 1L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 1L, 10L, 11L, 12L, 13L, 14L,
15L, 16L, 17L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 10L, 11L, 12L,
13L, 14L, 15L, 16L, 17L, 2L, 3L, 4L, 5L, 6L, 1L, 10L, 11L,
12L, 13L, 14L, 15L, 16L, 17L, 2L, 1L, 10L, 11L, 12L, 13L,
14L, 15L, 16L, 17L, 2L, 1L, 10L, 11L, 12L, 13L, 14L, 15L,
16L, 17L, 2L, 1L, 10L, 11L, 12L), .Label = c("1", "10", "11",
"12", "13", "14", "15", "16", "17", "2", "3", "4", "5", "6",
"7", "8", "9"), class = "factor")), .Names = c("labels",
"variables", "levels"), row.names = c(NA, -125L), class = "data.frame")
, itemsetInfo = structure(list(), .Names = character(0), row.names = integer(0), class = "data.frame")
)
, tidLists = NULL
, quality = structure(list(support = c(0.0219276058330541, 0.0226152387334415,
0.024325561329628, 0.0177273513650827, 0.0287360475123675, 0.0185682782241552
), count = c(10013, 10327, 11108, 8095, 13122, 8479)), .Names = c("support",
"count"), row.names = c(NA, 6L), class = "data.frame")
, info = structure(list(data = nonsesmic_data_tr, ntransactions = 456639L,
support = 0.01), .Names = c("data", "ntransactions", "support"
))
)
If the two sets come from transaction data that are compatible (see ? itemCoding) then you can use match to find matching itemsets in the two sets. After that, it should be easy to subtract the support.
I have a data set with 4 columns, 2 of which are numeric, 1 is categorical and 1 is the label. The label has 13 levels (A to M). I tried to use knncat package in R to do classification, but every time I ran the code, I got the following error message:
Error in `[<-.data.frame`(`*tmp*`, factor.vars, value = c("M", "J", "K", :
replacement has 45500 rows, data has 1
The following is the code I used:
# Read the data, split the rows 70/30 into training and test sets, and fit a
# knncat model on the training rows.
data <- read.csv("mosaic_data2.csv", header = TRUE)
num <- nrow(data)
library(sampling)
set.seed(1234)
# seq_len() is safe when the data has zero rows, unlike seq(1, num, 1)
train_index <- sample(seq_len(num), floor(num * 0.7), replace = FALSE)
test_index <- setdiff(seq_len(num), train_index)
train_data <- data[train_index, ]
test_data <- data[test_index, ]
library(knncat)
# NOTE(review): classcol = 2 presumably selects the second column as the class
# label, but dput(head(data, 100)) lists latitude, longitude, mosaic_group in
# that order, so the 13-level label looks like column 3. Confirm.
model <- knncat(train_data, classcol = 2)
Could anyone please take a look at the code and advise how I could eliminate this bug? Thank you very much!
The output of dput(head(data,100)) is as follows:
structure(list(latitude = c(52.7326028, 52.74287543, 52.82107841,
52.82025363, 52.81980596, 52.81721897, 52.81274172, 52.81274172,
52.8089586, 52.81424219, 52.8089586, 52.74007929, 52.77394023,
52.73659034, 52.73672518, 52.73764626, 52.73753744, 52.73659034,
52.73815233, 52.73679388, 52.73890319, 52.71697237, 52.63730282,
52.62720385, 52.63730282, 52.63543017, 52.63768035, 52.63510366,
52.6346578, 52.6346578, 52.6346578, 52.63447454, 52.63576418,
52.63447454, 52.6346578, 52.63447454, 52.69820719, 52.69603926,
52.68246919, 52.54600173, 52.54210198, 52.60628983, 52.61003275,
52.60278236, 52.60239604, 52.60348688, 52.60239604, 52.60382146,
52.60315644, 52.86047938, 52.86576353, 52.86954228, 52.81039471,
52.82094872, 52.82395073, 52.82444705, 52.88098384, 52.88469208,
52.88469208, 52.84979201, 52.84720159, 52.84831759, 52.82435938,
52.82319493, 52.82168337, 52.8230402, 52.8230402, 52.82513486,
52.82472379, 52.82756385, 52.82475438, 52.82434902, 52.82166611,
52.823712, 52.82401481, 52.82483489, 52.82103704, 52.82060763,
52.8208682, 52.82211317, 52.81868547, 52.8198332, 52.82023595,
52.81989134, 52.8196971, 52.82051066, 52.82463338, 52.82539131,
52.82580625, 52.82509199, 52.83759415, 52.83946254, 52.83946254,
52.83891871, 52.83821538, 52.84757879, 52.84663773, 52.8449371,
52.84592185, 52.84331619), longitude = c(-6.892397941, -6.915346343,
-6.922554014, -6.924997835, -6.926099967, -6.883340697, -6.897757597,
-6.897757597, -6.895500952, -6.883129556, -6.895500952, -6.703781864,
-6.680851783, -6.771845364, -6.773301282, -6.772958488, -6.77484647,
-6.771845364, -6.773422218, -6.772164896, -6.770622695, -6.784187251,
-6.901922588, -6.905109015, -6.901922588, -6.976679508, -6.973114498,
-6.974753462, -6.947990431, -6.947990431, -6.947990431, -6.976921427,
-6.958295227, -6.976921427, -6.947990431, -6.976921427, -6.902010609,
-6.915233457, -6.871160885, -6.832461149, -6.862126342, -6.943925285,
-6.93813643, -6.925128034, -6.932247524, -6.93461305, -6.932247524,
-6.934657053, -6.929283954, -6.845259603, -6.861188287, -6.866476268,
-6.940851164, -6.939203401, -6.930506188, -6.933317462, -6.929441954,
-6.922589037, -6.922589037, -6.926037258, -6.929423169, -6.917829279,
-6.938211918, -6.940658091, -6.940651748, -6.940107883, -6.940107883,
-6.938704642, -6.939084526, -6.933331264, -6.937496468, -6.937678962,
-6.940276221, -6.94018054, -6.939876475, -6.938983181, -6.934235666,
-6.93387209, -6.933134226, -6.934193569, -6.934383596, -6.933832641,
-6.937454656, -6.933818238, -6.93443811, -6.936913947, -6.920030341,
-6.920400963, -6.92215006, -6.910771124, -6.901500591, -6.899018998,
-6.899018998, -6.903007684, -6.90119821, -6.91063672, -6.909935672,
-6.90240965, -6.900066763, -6.901411136), mosaic_group = structure(c(10L,
10L, 8L, 8L, 8L, 7L, 7L, 7L, 7L, 7L, 7L, 10L, 10L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L,
12L, 12L, 12L, 12L, 12L, 12L, 10L, 10L, 10L, 13L, 13L, 13L, 13L,
9L, 6L, 6L, 6L, 6L, 6L, 10L, 8L, 8L, 9L, 9L, 9L, 9L, 7L, 7L,
7L, 9L, 9L, 9L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 8L, 8L, 8L, 8L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 8L,
6L, 6L, 6L, 6L, 6L, 8L, 8L, 10L, 10L, 10L), .Label = c("A", "B",
"C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M"), class = "factor"),
small_code = c(1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L,
4L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 8L, 8L, 8L, 9L,
9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 10L, 10L, 10L, 11L,
11L, 12L, 12L, 13L, 14L, 14L, 14L, 14L, 14L, 15L, 16L, 16L,
17L, 17L, 18L, 18L, 19L, 19L, 19L, 20L, 20L, 20L, 21L, 21L,
21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L,
22L, 22L, 22L, 22L, 23L, 23L, 23L, 23L, 23L, 23L, 24L, 24L,
24L, 25L, 26L, 26L, 26L, 26L, 26L, 27L, 27L, 28L, 28L, 28L
)), .Names = c("latitude", "longitude", "mosaic_group", "small_code"
), row.names = c(NA, 100L), class = "data.frame")
The function knncat::knncat accepts the argument classcol which is defined as:
Column with classification in it. Default: 1.
You have a data set with structure:
latitude longitude mosaic_group small_code
1 52.73260 -6.892398 J 1
2 52.74288 -6.915346 J 1
3 52.82108 -6.922554 H 2
4 52.82025 -6.924998 H 2
5 52.81981 -6.926100 H 2
6 52.81722 -6.883341 G 3
Therefore your argument should presumably be classcol = 3 (mosaic_group, the factor column with levels A to M), or possibly classcol = 4 (small_code). It certainly shouldn't be classcol = 2, which is the longitude column.
I am trying to manually change the color of only the first item of a legend in a ggplot2 line plot.
I have several observations of a variable that I am displaying in a line plot, just like this:
# Baseline plot: one coloured line with point markers per level of `variable`.
ggplot(
  tmp1,
  aes(x = factor(month), y = value, group = variable, colour = variable)
) +
  geom_line(size = 1) +
  geom_point(size = 2.5) +
  theme_grey(base_size = 18) +
  labs(x = NULL, y = "%", title = "a) Cloud fraction") +
  theme(
    legend.title = element_blank(),
    axis.text.x = element_blank(),
    plot.title = element_text(hjust = 0)
  )
However, the first variable (CRU) is my reference and I would like to show its line in black. I managed to do this by adding one extra geom_line with the condition variable=='CRU':
# Same plot, with the CRU series drawn again in black on top of the
# colour-mapped layers.
ggplot(
  tmp1,
  aes(x = factor(month), y = value, group = variable, colour = variable)
) +
  geom_line(size = 1) +
  geom_point(size = 2.5) +
  theme_grey(base_size = 18) +
  # Fixed colour outside aes(): these two layers ignore the colour mapping.
  geom_line(
    data = subset(tmp1, variable == "CRU"),
    colour = "black",
    linetype = "solid",
    size = 1
  ) +
  geom_point(
    data = subset(tmp1, variable == "CRU"),
    colour = "black",
    size = 2.5
  ) +
  labs(x = NULL, y = "%", title = "a) Cloud fraction") +
  theme(
    legend.title = element_blank(),
    axis.text.x = element_blank(),
    plot.title = element_text(hjust = 0)
  )
which works for the line, but the legend keeps the old colour.
How can I change the color of just the first element of the legend, in order to match the new black line?
This is an example of my data:
library(ggplot2)
tmp1 <- structure(list(month = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
11L, 12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L,
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L, 6L,
7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L,
12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L,
9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L,
12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 10L, 11L, 12L), .Label = c("Jan", "Feb", "Mar",
"Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
), class = "factor"), variable = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L,
9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 10L, 10L, 10L, 10L, 10L, 10L,
10L, 10L, 10L, 10L, 10L, 10L, 11L, 11L, 11L, 11L, 11L, 11L, 11L,
11L, 11L, 11L, 11L, 11L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L,
12L, 12L, 12L, 12L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L,
13L, 13L, 13L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L,
14L, 14L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L,
15L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L
), .Label = c("CRU", "CanESM2", "GFDL-ESM2M", "GISS-E2-H", "GISS-E2-R-CC",
"GISS-E2-R", "HadGEM2-AO", "HadGEM2-CC", "IPSL-CM5A-MR", "IPSL-CM5B-LR",
"MIROC4h", "MRI-CGCM3", "NorESM1-M", "bcc-csm1-1-m", "bcc-csm1-1",
"inmcm4"), class = "factor"), value = c(68.9226631460789, 68.2418796877392,
68.3045372212868, 66.5727907036073, 64.278360290491, 60.6452267972856,
56.4079999829923, 57.4384828307567, 60.874295882443, 63.70427487797,
65.9934520468731, 68.9723871966257, 69.0959015590216, 68.6126351492122,
65.9106136896166, 65.790169283913, 64.6320994816801, 63.894111784301,
62.0459530253135, 60.0455773681386, 59.4195693791228, 59.8531302282566,
62.8877658601921, 66.4625078340445, 63.4659654507164, 64.5466810117518,
63.6412932878715, 61.5786848043378, 60.6491980933614, 63.5160886052168,
62.739218138279, 60.8826348052995, 60.1196738813257, 59.0451443027396,
58.9044684656519, 61.5033887899156, 62.442928703121, 61.9933297554931,
61.686560285787, 62.1675956585161, 63.0625380934021, 63.3192922622326,
62.6727899590586, 60.9706714311941, 59.4656895840826, 59.8689092461429,
60.7585523645951, 62.2374164636759, 62.2586495696979, 62.3005886556949,
62.0719314334763, 61.7786313583016, 62.1037020616999, 62.5919637033876,
60.7746642298107, 58.7307471416832, 57.6602849809809, 57.3379551651851,
59.8210398283061, 61.5997238276034, 62.1190176575675, 62.2214930174241,
61.607539296931, 61.836536870373, 61.8298589429815, 62.0478835210295,
60.8165122782774, 59.224498365607, 57.5387307267022, 56.8641846144649,
59.6779581588162, 61.5822371331742, 56.9625864272884, 55.0519081266715,
53.9161532646461, 52.0847886852487, 54.1855963059705, 54.1565901942167,
53.8164314129289, 53.3013959169719, 52.1283494730607, 49.9814907883562,
51.0053330490513, 54.1758812796363, 54.1947459143536, 53.2985061657513,
51.5351727215781, 51.2131541342776, 53.040182168441, 53.4657505459587,
52.8257974728027, 52.8523832284788, 51.2527233914323, 48.0999294191007,
48.3915726340961, 50.9305288780026, 65.3647375158419, 64.6894843930494,
62.2700707798592, 60.2848148985731, 59.0797813854392, 58.6641353922813,
60.36671822738, 61.0883458866571, 60.3963355506111, 60.989444946264,
62.1570976843054, 64.0549504714623, 63.043822206253, 61.5388900651697,
61.0125502971802, 60.4999006674972, 60.9554692113674, 61.2665703834057,
61.1470225339614, 61.4827838311531, 60.0397138517742, 61.6503963603034,
62.7421837830534, 63.9911949044232, 55.7117557057576, 55.0687784028633,
51.7447044604762, 50.5160095376821, 51.7744811245234, 52.6710116909617,
52.9126480516047, 51.6347065362984, 50.6773480024225, 48.8928054774924,
50.3505731163001, 53.7488684714513, 61.558109087334, 61.6673093977654,
61.008465555097, 58.5478578294864, 57.4119260976748, 57.9275733769477,
56.9129774651439, 55.6494927089111, 52.0222406797903, 51.9215916366208,
53.4679949695072, 58.2128251869788, 64.7955701998493, 62.8319013929061,
60.8391061131818, 56.1759467734789, 55.4331550199683, 55.8437923896573,
54.998540828777, 54.7840203124691, 54.3853750266133, 52.7590435522892,
56.1409799671355, 62.0047140533332, 57.5185465474672, 57.2532289998115,
55.9911913829976, 54.6479285609432, 53.1659722964534, 53.3609799276622,
51.321452599498, 49.6933914680193, 48.6718229103421, 49.5393207890844,
52.8096091918065, 56.1667672797739, 60.7380412023987, 60.1791897430251,
58.7798069796932, 58.061108119255, 59.7770862278418, 60.2070273632675,
59.074898814382, 55.5571990297011, 53.8564792650491, 54.0753885029223,
56.2369958393563, 58.9062125901571, 70.7538119957697, 69.4271857400385,
67.3954189057409, 66.9262104442679, 67.1558044757422, 65.8848885390536,
65.3092556552615, 64.3799468889004, 64.9999333535186, 65.6493831700943,
69.2646980549075, 70.6342115226731)), row.names = c(NA, -192L
), .Names = c("month", "variable", "value"), class = "data.frame")
Instead of splitting up the data and plotting two geom_lines, you can simply supply a custom colour palette in which CRU is mapped to black.
If you want to keep the default colours for the other variables, you first need to define a little helper function to retrieve them the way ggplot2 does it.
# Generate `n` colours evenly spaced around the HCL hue wheel
# (luminance 65, chroma 100), starting at hue 15.
#
# n: number of colours wanted (a non-negative whole number).
# Returns a character vector of `n` hex colour strings; empty when n is 0.
gg_color_hue <- function(n) {
  # n + 1 evenly spaced hues: the last one (375) wraps round to the first (15),
  # so only the first n are kept.
  hues <- seq(15, 375, length.out = n + 1)
  # seq_len(n) rather than 1:n, so that n = 0 yields no colours instead of one.
  hcl(h = hues, l = 65, c = 100)[seq_len(n)]
}
Then create a custom colour palette vector, combining the standard palette and black. Since CRU is the first level of your factor variable (with 16 levels in total), this is simply
# "#000000" (black) comes first, followed by 15 hues from gg_color_hue().
custom_palette <- c("#000000", gg_color_hue(15))
The following then produces your desired plot:
# Line plot per level of `variable`, coloured from the manual palette
# `custom_palette` instead of the default colour scale.
ggplot(
  tmp1,
  aes(x = factor(month), y = value, group = variable, colour = variable)
) +
  geom_line(size = 1) +
  geom_point(size = 2.5) +
  scale_colour_manual(values = custom_palette) +
  theme_grey(base_size = 18) +
  labs(x = NULL, y = "%", title = "a) Cloud fraction") +
  theme(
    legend.title = element_blank(),
    axis.text.x = element_blank(),
    plot.title = element_text(hjust = 0)
  )