Formatting x-axis with histogram in R - r

I want the x-axis to run from 1 to 20 and the y-axis from 1 to 6.
My data:
structure(list(HEI.ID = structure(c(12L, 9L, 14L, 19L, 20L, 1L,
7L, 5L, 11L, 3L, 10L, 18L, 2L, 8L, 6L, 15L, 13L, 17L, 4L, 16L
), .Label = c("BF", "CC", "DC", "ER", "IM", "MC", "ME ",
"MM", "MO", "OC", "OM", "OP", "SB", "SD", "SH", "SL", "SN", "TH",
"UN", "WS"), class = "factor"), X2007 = c(18L, 14L, 15L, 20L,
12L, 6L, 17L, 2L, 4L, 11L, 16L, 1L, 9L, 8L, 13L, 4L, 10L, 6L,
3L, 19L), X2008 = c(20L, 9L, 16L, 18L, 8L, 17L, 15L, 6L, 3L,
14L, 19L, 1L, 2L, 12L, 5L, 13L, 11L, 7L, 4L, 10L), X2009 = c(20L,
13L, 17L, 8L, 4L, 9L, 19L, 12L, 2L, 11L, 16L, 1L, 2L, 7L, 6L,
18L, 5L, 15L, 9L, 14L), X2010 = c(20L, 13L, 16L, 13L, 7L, 15L,
19L, 8L, 3L, 9L, 18L, 1L, 5L, 11L, 12L, 6L, 10L, 4L, 2L, 17L),
X2011 = c(20L, 2L, 16L, 14L, 6L, 10L, 17L, 8L, 3L, 15L, 19L,
1L, 4L, 18L, 13L, 11L, 8L, 12L, 4L, 7L), X2012 = c(20L, 12L,
19L, 13L, 8L, 14L, 15L, 10L, 11L, 9L, 17L, 2L, 7L, 18L, 5L,
16L, 3L, 4L, 6L, 1L)), .Names = c("HEI.ID", "X2007", "X2008",
"X2009", "X2010", "X2011", "X2012"), row.names = c(NA, -20L), class = "data.frame")
I use the following commands to draw histograms:
# Lay the plots out on a 3 x 4 grid (12 panels per page).
par(mfrow = c(3,4))
# One histogram per row of HEIrank11: 20 iterations, i.e. more plots than
# the 12 panels of a single grid page.
for(i in 1:20){
print(i)
# Histogram of row i with its first column dropped; the x-axis label is
# taken from STOF[i,1]. No xlim/ylim is supplied, so each panel's axis
# ranges are chosen automatically from that row's values.
hist(as.numeric(HEIrank11[i,-1]),nclass=12,,main='students/faculty',
xlab = STOF[i,1],cex.lab=1, cex.axis=1, cex.main=1, cex.sub=1)
}
But after using the above commands, I get different numbers on the x-axis and y-axis.

I don't understand what your plot should look like. It's not clear from your question and the data provided.
I've tried to plot it. Please comment if you think it's the way to go.
Considering dt is your data.frame
# Packages: reshape for melt(), ggplot2 for the plot.
library(reshape)
library(ggplot2)

# Reshape the wide data frame to long format. The plot below relies on the
# resulting `variable` (former column name) and `value` columns.
dt <- melt(dt)

# Stacked bars: one bar per HEI.ID, one coloured segment per `variable`.
ggplot(data = dt, mapping = aes(x = HEI.ID, y = value, fill = variable)) +
  geom_bar(stat = 'identity')
or
# Same bars, split into one facet row per `variable`.
# Plots the melted data frame `dt` (the original referenced an undefined
# object, `dt1`).
ggplot(aes(x = HEI.ID, y = value, fill = variable), data = dt) +
  geom_bar(stat = 'identity') +
  facet_grid(variable ~ .)

You could use xlim and ylim parameters in the hist function and control the axes using
axis:
# 3 x 4 grid: exactly 12 panels, matching the 12 iterations below.
par(mfrow = c(3, 4))
for (i in 1:12) {
  print(i)
  # Use the same axis ranges in every panel and suppress the default axes
  # (xaxt/yaxt = 'n') so that they can be drawn explicitly afterwards.
  hist(as.numeric(HEIrank11[i, -1]), nclass = 12, main = 'students/faculty',
       xlim = c(0, 21), ylim = c(0, 6), xaxt = 'n', yaxt = 'n')
  axis(1, at = c(0, 10, 20))  # x-axis ticks at 0, 10 and 20
  axis(2, at = 0:6)           # y-axis ticks at every integer from 0 to 6
}
Do you really want your y-axis to go from 1 to 6? This will cut off parts of the bars.
Also, you iterate over all 20 rows for a grid with 12 plots. The code above gives the following plot:

Related

y-axis breaks with ggplot2 for a manhattan plot

I have a manhattan plot of genetic information:
It was generated using the qqman package (https://cran.r-project.org/web/packages/qqman/vignettes/qqman.html) in R, which takes a data frame of P-values, chromosome position and a gene position (for any biologists reading, this is a per-gene Manhattan plot, hence the sparsity of signal). The data looks like this (with an example dataset below):
SNP P CHR BP
ABC 1.1e-300 16 875849
AAS 1.2e-150 4 2343
JTL 4.2e-07 3 436544
LKS 4.1e-06 2 23565
JKSA 0.000432 1 98043
LKF 0.0032 22 387235
A20 0.0054 10 3252
AKLF 0.0235 4 4543543
structure(list(Gene = c("ABC1", "HGT2", "SLC34A3_ENSG00000198569",
"OR9K2_ENSG00000170605", "NFKB2_ENSG00000077150", "EFR3A_ENSG00000132294",
"SLC7A9_ENSG00000021488", "SEMG1_ENSG00000124233", "EWSR1_ENSG00000182944",
"ATP5PD_ENSG00000167863", "MAST3_ENSG00000099308", "KRT31_ENSG00000094796",
"FOXI1_ENSG00000168269", "CHCHD7_ENSG00000170791", "MAPK6_ENSG00000069956",
"SPRYD3_ENSG00000167778", "HOXB13_ENSG00000159184", "SLC12A9_ENSG00000146828",
"EXOC2_ENSG00000112685", "KCNJ15_ENSG00000157551", "SLC22A18_ENSG00000110628",
"ARID4A_ENSG00000032219", "SKP2_ENSG00000145604", "ZNF831_ENSG00000124203",
"ZNF275_ENSG00000063587", "SLC16A2_ENSG00000147100", "ADRB1_ENSG00000043591",
"DSCAM_ENSG00000171587", "PPM1H_ENSG00000111110", "IFNA14_ENSG00000228083",
"STX2_ENSG00000111450", "VPS54_ENSG00000143952", "ANXA7_ENSG00000138279",
"MAP3K12_ENSG00000139625", "MED13L_ENSG00000123066", "CHRM2_ENSG00000181072",
"RBP7_ENSG00000162444", "DRD1_ENSG00000184845", "CCDC121_ENSG00000176714",
"HMG20B_ENSG00000064961", "POU5F1B_ENSG00000212993", "SESN1_ENSG00000080546",
"DNASE1_ENSG00000213918", "FBXO24_ENSG00000106336", "RAG2_ENSG00000175097",
"UTS2_ENSG00000049247", "KMT2B_ENSG00000272333", "RBM33_ENSG00000184863",
"SNRPB2_ENSG00000125870", "FOXO4_ENSG00000184481", "NBPF3_ENSG00000142794",
"PPL_ENSG00000118898", "LYPD6B_ENSG00000150556", "POLD3_ENSG00000077514",
"PIK3CB_ENSG00000051382", "BCL2L12_ENSG00000126453", "CDC45_ENSG00000093009",
"DUXA_ENSG00000258873", "MCM3_ENSG00000112118", "CAPN3_ENSG00000092529",
"FMO4_ENSG00000076258", "B3GALT2_ENSG00000162630", "MICB_ENSG00000204516",
"CCL22_ENSG00000102962", "JKAMP_ENSG00000050130", "GSDME_ENSG00000105928",
"IZUMO4_ENSG00000099840", "NCKAP5L_ENSG00000167566", "ZRANB1_ENSG00000019995",
"TAL1_ENSG00000162367", "SLTM_ENSG00000137776", "SPC25_ENSG00000152253",
"GAP43_ENSG00000172020", "FGD3_ENSG00000127084", "PTCD3_ENSG00000132300",
"PAH_ENSG00000171759", "MMP8_ENSG00000118113", "RSBN1L_ENSG00000187257",
"AC026740.3_ENSG00000286094", "FAM189A2_ENSG00000135063", "TMEM245_ENSG00000106771",
"DDX50_ENSG00000107625", "SP140_ENSG00000079263", "C21orf91_ENSG00000154642",
"MEIKIN_ENSG00000239642", "TNFRSF8_ENSG00000120949", "RNF24_ENSG00000101236",
"CDK5_ENSG00000164885", "HINT1_ENSG00000169567", "TYRO3_ENSG00000092445",
"KRT75_ENSG00000170454", "RBM44_ENSG00000177483", "MYH8_ENSG00000133020",
"UBXN11_ENSG00000158062", "APOL3_ENSG00000128284", "NRXN3_ENSG00000021645",
"PRSS16_ENSG00000112812", "BST1_ENSG00000109743", "FAM49A_ENSG00000197872",
"SLC3A2_ENSG00000168003", "OR1C1_ENSG00000221888", "MYMK_ENSG00000187616",
"RASSF1_ENSG00000068028", "ARID5A_ENSG00000196843", "UAP1L1_ENSG00000197355",
"DPH2_ENSG00000132768", "G6PC_ENSG00000131482", "SH2B1_ENSG00000178188",
"RELL1_ENSG00000181826", "ABCC5_ENSG00000114770", "ZNF333_ENSG00000160961",
"NIF3L1_ENSG00000196290", "COMMD2_ENSG00000114744", "ZCCHC14_ENSG00000140948",
"P3H1_ENSG00000117385", "KRT14_ENSG00000186847", "SPG7_ENSG00000197912",
"ERCC6L_ENSG00000186871", "UPF1_ENSG00000005007", "FCGR3A_ENSG00000203747",
"SLC39A13_ENSG00000165915", "ACYP2_ENSG00000170634", "AL162596.1_ENSG00000285946",
"MEF2D_ENSG00000116604", "ATPAF1_ENSG00000123472", "DNAL4_ENSG00000100246",
"ADRA2A_ENSG00000150594", "ALDH3B2_ENSG00000132746", "L3MBTL3_ENSG00000198945",
"NR2E1_ENSG00000112333", "OTUD1_ENSG00000165312", "MCMDC2_ENSG00000178460",
"TXNL1_ENSG00000091164", "CES5A_ENSG00000159398", "CCL16_ENSG00000275152",
"ZBTB12_ENSG00000204366", "OGDHL_ENSG00000197444", "ARHGEF7_ENSG00000102606",
"RBM20_ENSG00000203867", "SELENOK_ENSG00000113811", "HBB_ENSG00000244734",
"WDR3_ENSG00000065183", "MAPKBP1_ENSG00000137802", "LTB4R2_ENSG00000213906",
"SLC25A15_ENSG00000102743", "ZBTB26_ENSG00000171448", "FDX2_ENSG00000267673",
"HSD3B7_ENSG00000099377", "RBFOX3_ENSG00000167281"), Pvalue = c(1.4e-300,
2.4e-150, 2.6089114579797e-07, 2.0296620694138e-06, 0.000147497259292417,
0.000229023886289315, 0.000245084674285079, 0.000256308708221289,
0.000261527824152563, 0.000288694716678695, 0.000290173032394758,
0.000320594572326915, 0.000346135729902497, 0.000355400110852,
0.000365256352980237, 0.000409731023356175, 0.000434204786603609,
0.000439775242591978, 0.000489192731765176, 0.000496753250110893,
0.00049911036273298, 0.000570787086811797, 0.000817460863988795,
0.000909350865229142, 0.000939159281654778, 0.00101875263711804,
0.00104161722087825, 0.00104642519111031, 0.0011025121215934,
0.00110797190460954, 0.00115516532029414, 0.00119237737210043,
0.00122886113380205, 0.00123316670384388, 0.00126924175390097,
0.00133083135434398, 0.00135900612361495, 0.00139601886941515,
0.00140034988031684, 0.00144667154281775, 0.00152488013161856,
0.00163920217629621, 0.00165121328565765, 0.00174281606991877,
0.00177541992540164, 0.00190567015024483, 0.00197012178338563,
0.00201154365191081, 0.00217761616500045, 0.00218849598206619,
0.00219107805420338, 0.00219952638949095, 0.0022100400174857,
0.00224988976742913, 0.00227842036080439, 0.00231351589815465,
0.00233840710255306, 0.00239368490047076, 0.00240800589782486,
0.00243072813003242, 0.00244930354205075, 0.00250643393459327,
0.00251262640919065, 0.00251308387281417, 0.00263512458389692,
0.00278748971622167, 0.00285692531240396, 0.00294631292976411,
0.0029855292366705, 0.00300042887433971, 0.00303321747691876,
0.00303431537337207, 0.00303655747990805, 0.00305247991142066,
0.00305779719421262, 0.0030773769185013, 0.00309595279588104,
0.00320602521859303, 0.00332374190234568, 0.00335845666631385,
0.00343476781423846, 0.00352132856036713, 0.0035370791144882,
0.00361921945446442, 0.00362829729460107, 0.00362925899436917,
0.00371857751928739, 0.00379170913533391, 0.00381786051662956,
0.00384603142808415, 0.0040621114920355, 0.00409131954647834,
0.00421076475281379, 0.00426968726537658, 0.00434706101829539,
0.00440972006588558, 0.00441860470852284, 0.00442578968523244,
0.00442716922579578, 0.00452215526426547, 0.00455658711791962,
0.00456768818316559, 0.00459525378983388, 0.00470562811526665,
0.00479427416502232, 0.00480697291736709, 0.00487609777383424,
0.00487626066774249, 0.0048982035968409, 0.00495106368869058,
0.00495974901689888, 0.0051182254688722, 0.00511868853158659,
0.00517459699358158, 0.0051863728177568, 0.0052533748441207,
0.0053048513357663, 0.00535144603215779, 0.00536294574878726,
0.00551084451782391, 0.00554884846488313, 0.0057184975334863,
0.00579274777888456, 0.00589230566622367, 0.00598698264647979,
0.00611781183554826, 0.00620691435617104, 0.00623285869674561,
0.00627192651777919, 0.00631120768525961, 0.00638288332792991,
0.00640000445930411, 0.00640676243762089, 0.00651734394089964,
0.0065624463096069, 0.00663922011120555, 0.00664879787639161,
0.00670461778135323, 0.00687266504207529, 0.00695679654393111,
0.00703352727799, 0.0070826001238915, 0.00709135444023445, 0.007142701991454,
0.00715597471729579, 0.00717318609326256, 0.00717726401691021,
0.00723420182380741, 0.00734437099984853), CHR = c(16L, 4L, 4L,
1L, 14L, 16L, 5L, 6L, 20L, 9L, 9L, 7L, 22L, 3L, 14L, 3L, 8L,
8L, 21L, 16L, 4L, 16L, 12L, 14L, 4L, 1L, 12L, 15L, 5L, 4L, 21L,
22L, 1L, 1L, 14L, 6L, 15L, 9L, 20L, 20L, 17L, 7L, 15L, 6L, 20L,
7L, 8L, 9L, 1L, 13L, 11L, 12L, 4L, 7L, 20L, 12L, 7L, 5L, 12L,
21L, 5L, 8L, 14L, 9L, 10L, 17L, 21L, 19L, 4L, 21L, 18L, 21L,
7L, 12L, 21L, 2L, 15L, 7L, 14L, 15L, 4L, 12L, 5L, 14L, 21L, 8L,
21L, 15L, 18L, 12L, 11L, 20L, 2L, 22L, 14L, 17L, 3L, 4L, 14L,
15L, 9L, 7L, 20L, 15L, 18L, 15L, 19L, 13L, 15L, 6L, 7L, 8L, 3L,
4L, 21L, 7L, 18L, 4L, 13L, 16L, 14L, 22L, 2L, 2L, 6L, 16L, 15L,
8L, 7L, 19L, 13L, 6L, 21L, 8L, 18L, 22L, 19L, 21L, 16L, 2L, 4L,
5L, 15L, 6L, 3L, 21L, 15L, 4L, 11L), POS = c(40665L, 197088L,
107291L, 210681L, 43546L, 79324L, 84342L, 184478L, 153093L, 180926L,
186110L, 117933L, 40682L, 54752L, 42758L, 61354L, 60378L, 157811L,
154466L, 126398L, 31037L, 115113L, 151914L, 10177L, 149587L,
79681L, 199754L, 129963L, 127032L, 175940L, 213708L, 51165L,
2584L, 166487L, 56259L, 130923L, 89219L, 170034L, 178967L, 102826L,
16982L, 188528L, 185007L, 6373L, 23298L, 199514L, 10429L, 58720L,
124518L, 210323L, 52212L, 186662L, 166963L, 58802L, 97157L, 14448L,
205795L, 70401L, 41824L, 93825L, 107954L, 207638L, 58648L, 64942L,
184005L, 19239L, 326L, 167713L, 106774L, 9145L, 174348L, 116079L,
38916L, 561L, 140433L, 123765L, 92497L, 187902L, 32027L, 63696L,
141286L, 67825L, 131698L, 120443L, 72621L, 165143L, 188862L,
52376L, 16769L, 77430L, 38655L, 145317L, 188469L, 113143L, 198322L,
26732L, 165043L, 25287L, 72392L, 12505L, 134208L, 126649L, 86308L,
199525L, 204348L, 103538L, 78610L, 176290L, 175950L, 73590L,
148494L, 151769L, 135252L, 141200L, 73351L, 45244L, 136493L,
33343L, 11165L, 915L, 80714L, 164700L, 142935L, 137224L, 554L,
92823L, 143083L, 166581L, 121459L, 19037L, 325L, 59959L, 155468L,
20896L, 33721L, 4468L, 113639L, 17103L, 184481L, 164337L, 174760L,
96405L, 207423L, 46590L, 168811L, 205743L, 74180L, 178456L, 126892L
)), row.names = c(NA, -149L), class = c("data.table", "data.frame"
), .internal.selfref = <pointer: 0x55a80de817a0>)
In reality there are around 20,000 lines for each gene in the human genome.
Using qqman, one uses:
manhattan(gwas_data...)
To get the plot.
I would like the same plot but with the axis broken between 8-149 and then again from 149-300 so that the bottom part isn't all compressed. qqman is unable to do this.
I have tried modifying the script from this website: https://danielroelfs.com/blog/how-i-create-manhattan-plots-using-ggplot/
And my code looks like this:
table above: gwas_data
# Build a Manhattan plot of -log10(P) against a cumulative base-pair position.
# The code reads columns P, CHR and BP from gwas_data.

# Per-chromosome offset: the sum of max(BP) over all preceding chromosomes
# (lag of the running total, 0 for the first chromosome).
data_cum <- gwas_data %>%
group_by(CHR) %>%
summarise(max_bp = max(BP)) %>%
mutate(bp_add = lag(cumsum(max_bp), default = 0)) %>%
select(CHR, bp_add)
# Attach the offset to every row and compute the cumulative position.
# NOTE(review): the column is read as `BP` (upper-case) above but as `bp`
# (lower-case) here - confirm which name the data actually uses.
gwas_data <- gwas_data %>%
inner_join(data_cum, by = "CHR") %>%
mutate(bp_cum = bp + bp_add)
# Mean cumulative position per chromosome; used as the x-axis tick positions.
# The result has exactly two columns: CHR and center.
axis_set <- gwas_data %>%
group_by(CHR) %>%
summarize(center = mean(bp_cum))
# Upper y limit derived from the smallest P-value: |floor(log10(P))| + 2.
# If several rows tie for the minimum, this yields more than one value.
ylim <- gwas_data %>%
filter(P == min(P)) %>%
mutate(ylim = abs(floor(log10(P))) + 2) %>%
pull(ylim)
sig <- 0.05/length(gwas_data$P) # this is a Bonferroni correction
# NOTE(review): as_factor() and element_markdown() are not defined in this
# snippet - presumably forcats and ggtext are loaded; confirm.
manhplot <- ggplot(gwas_data, aes(x = bp_cum, y = -log10(P),
color = as_factor(CHR), size = -log10(P))) +
# Dashed horizontal line at the Bonferroni threshold.
geom_hline(yintercept = -log10(sig), color = "grey40", linetype = "dashed") +
geom_point(alpha = 0.75) +
# NOTE(review): axis_set was built with columns `CHR` and `center`;
# `axis_set$chr` (lower-case) is not one of them - confirm the intended column.
scale_x_continuous(label = axis_set$chr, breaks = axis_set$center) +
scale_y_continuous(expand = c(0,0), limits = c(0, ylim)) +
# Alternate two colours across chromosomes.
# NOTE(review): unique(length(x)) is just length(x); length(unique(x)) may
# have been intended - confirm. Same `chr` vs `CHR` question as above.
scale_color_manual(values = rep(c("#276FBF", "#183059"), unique(length(axis_set$chr)))) +
scale_size_continuous(range = c(0.5,3)) +
labs(x = NULL,
y = "-log<sub>10</sub>(p)") +
theme_minimal() +
theme(
legend.position = "none",
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
axis.title.y = element_markdown(),
axis.text.x = element_text(angle = 60, size = 8, vjust = 0.5)
)
This gives me:
Which is wrong. However, if I try and then cut the axis using the ggbreak package with:
# Cut the y-axis of the Manhattan plot with ggbreak.
# The argument is `breaks`; `break` is a reserved word in R and cannot be
# used as an argument name, so the original spelling does not parse.
t <- manhplot + scale_y_cut(breaks = c(10, 140))
t + scale_y_cut(breaks = c(140, 300))
Which gives me:
How would I sort the chromosome x-axis and the breaks out so it looks like the qqman plot but with the y-axis compressed?
Many thanks

R: stacked geom_area plot displays blank polygons

I need some help regarding transforming a geom_bar into a geom_area plot. This is my df:
dput(df)
df <- structure(list(new_day = c(-25L, 3L, 7L, -7L, 3L, 7L, -7L, 0L,
-25L, 3L, 7L, -7L, 0L, -25L, 3L, 7L, -7L, 0L, -25L, 3L, 7L, -7L,
0L, -25L, 3L, 7L, -7L, 0L, -25L, 3L, 7L, -7L, 0L, 3L, 7L, -7L,
0L, -25L, 3L, 7L, -7L, 0L, -25L, 3L, 7L, -7L, 0L, -25L, 3L, 7L,
-25L, 3L, 7L, -7L, 0L, 3L, -7L, 0L, -25L, 7L, 3L, 7L, -7L, 0L,
-25L, 3L, 7L, -7L, 0L, -25L, 3L, 7L, 3L, 7L, -7L, 0L, -25L, 3L,
7L, -7L, 0L, 7L, -25L, 3L, 7L, -7L, 0L, 3L, 7L, -25L, -25L, -25L,
-25L, -25L, -25L, -25L), order = structure(c(8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 11L, 11L, 11L, 11L, 11L, 13L, 13L, 13L, 13L,
13L, 10L, 10L, 10L, 10L, 10L, 7L, 7L, 7L, 7L, 7L, 2L, 2L, 2L,
2L, 2L, 7L, 7L, 7L, 7L, 9L, 9L, 9L, 9L, 9L, 1L, 1L, 1L, 1L, 1L,
9L, 9L, 9L, 2L, 2L, 2L, 2L, 2L, 4L, 4L, 4L, 13L, 13L, 14L, 14L,
14L, 14L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 13L, 13L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 2L, 2L, 2L, 2L, 2L, 6L, 6L, 1L, 7L, 5L, 2L,
12L, 2L, 2L), .Label = c("Alteromonadales", "Betaproteobacteriales",
"Caulobacterales", "Chitinophagales", "Flavobacteriales", "Parvibaculales",
"Pseudomonadales", "Rhizobiales", "Rhodobacterales", "Rhodospirillales",
"Sneathiellales", "Sphingobacteriales", "Sphingomonadales", "Thalassobaculales"
), class = "factor"), family = structure(c(13L, 13L, 13L, 13L,
12L, 12L, 12L, 12L, 15L, 15L, 15L, 15L, 15L, 16L, 16L, 16L, 16L,
16L, 7L, 7L, 7L, 7L, 7L, 11L, 11L, 11L, 11L, 11L, 1L, 1L, 1L,
1L, 1L, 11L, 11L, 11L, 11L, 14L, 14L, 14L, 14L, 14L, 4L, 4L,
4L, 4L, 4L, 14L, 14L, 14L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 16L,
16L, 17L, 17L, 17L, 17L, 8L, 8L, 8L, 8L, 8L, 5L, 5L, 5L, 16L,
16L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 8L, 8L, 8L, 8L,
8L, 10L, 10L, 6L, 11L, 3L, 1L, 9L, 1L, 1L), .Label = c("Burkholderiaceae",
"Chitinophagaceae", "Flavobacteriaceae", "Gallaecimonadaceae",
"Hyphomonadaceae", "Idiomarinaceae", "Magnetospiraceae", "Methylophilaceae",
"NS11-12_marine_group", "Parvibaculaceae", "Pseudomonadaceae",
"Rhizobiaceae", "Rhizobiales_unclassified", "Rhodobacteraceae",
"Sneathiellaceae", "Sphingomonadaceae", "Thalassobaculaceae"), class = "factor"),
genus = structure(c(16L, 16L, 16L, 16L, 7L, 7L, 7L, 7L, 3L,
3L, 3L, 3L, 3L, 19L, 19L, 19L, 19L, 19L, 24L, 24L, 24L, 24L,
24L, 14L, 14L, 14L, 14L, 14L, 17L, 17L, 17L, 17L, 17L, 14L,
14L, 14L, 14L, 15L, 15L, 15L, 15L, 15L, 5L, 5L, 5L, 5L, 5L,
10L, 10L, 10L, 2L, 2L, 2L, 2L, 2L, 22L, 22L, 22L, 20L, 20L,
23L, 23L, 23L, 23L, 11L, 11L, 11L, 11L, 11L, 8L, 8L, 8L,
21L, 21L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 11L, 11L,
11L, 11L, 11L, 13L, 13L, 9L, 14L, 4L, 6L, 12L, 1L, 18L), .Label = c("Burkholderiaceae_unclassified",
"Cupriavidus", "Ferrovibrio", "Flavobacteriaceae_unclassified",
"Gallaecimonas", "GKS98_freshwater_group", "Hoeflea", "Hyphomonas",
"Idiomarina", "Marivivens", "Methylotenera", "NS11-12_marine_group_ge",
"Parvibaculum", "Pseudomonas", "Pseudorhodobacter", "Rhizobiales_unclassified",
"Rhodoferax", "RS62_marine_group", "Sphingomonadaceae_unclassified",
"Sphingopyxis", "Sphingorhabdus", "Terrimonas", "Thalassobaculum",
"uncultured"), class = "factor"), Abundance = c(0.758296593899054,
0.728046713738242, 0.421798852637834, 0.185971692147469,
7.36584152568739, 11.0004160226707, 1.93134577450352, 19.7144376530921,
46.2350237547082, 25.8715062086956, 22.1549641486618, 34.4112477828867,
20.4937613394223, 3.73518219692229, 15.9295990367068, 13.8490383262387,
13.3481723220855, 20.3866145291388, 0.165618346100574, 8.86991024549668,
8.5330814375361, 6.86819004205197, 5.72129192186814, 1.04512973253723,
3.77880217461655, 6.47871112880127, 1.12084852451492, 0.903754246093232,
19.0854333497858, 15.7152146349298, 12.3768753373503, 15.8790763239117,
10.2875187327705, 2.82159106304821, 4.22393981370602, 8.82452898193968,
4.8507226701533, 6.19619716749583, 8.28477594908417, 8.05201189383953,
9.7404731686272, 9.84535225459449, 1.7940554465653, 2.62276259756813,
2.74008811315788, 0.543937440677315, 0.55325167765205, 0.910457573040239,
0.451385497886567, 0.655661306732001, 6.59400178917785, 1.92570846362683,
2.62192443054515, 2.10049053655497, 2.13139299576524, 0.20799245164738,
0.324291631088576, 0.369492771993701, 1.52162438803598, 0.151864202275619,
0.420953084533189, 0.391517677365401, 0.29116200940885, 0.232440441774702,
4.21428798609281, 0.859779996836882, 1.33107018783728, 1.013155122065,
0.447286602320585, 0.165001492967355, 0.285983094976304,
0.377758692391269, 0.21556919104275, 0.314057858254493, 0.354649793637887,
0.338799824269294, 0.218027624939685, 0.914324162324944,
1.22932824654674, 0.731649603629864, 0.566393265064962, 0.247942012186621,
1.73171328618728, 0.636597714441988, 0.505393049999761, 0.491318560043637,
0.990988961717433, 0.195417142399681, 0.210412739808352,
0.476107780140271, 0.936663899397428, 0.251540964619117,
0.963667386912928, 0.504905545701818, 0.296220086916766,
0.240809811677774)), class = "data.frame", row.names = c(52L,
68L, 72L, 93L, 165L, 169L, 190L, 194L, 246L, 262L, 266L, 287L,
291L, 343L, 359L, 363L, 384L, 388L, 440L, 456L, 460L, 481L, 485L,
634L, 650L, 654L, 675L, 679L, 731L, 747L, 751L, 772L, 776L, 844L,
848L, 869L, 873L, 925L, 941L, 945L, 966L, 970L, 1022L, 1038L,
1042L, 1063L, 1067L, 1216L, 1232L, 1236L, 1313L, 1329L, 1333L,
1354L, 1358L, 1426L, 1451L, 1455L, 1507L, 1527L, 1717L, 1721L,
1742L, 1746L, 2186L, 2202L, 2206L, 2227L, 2231L, 2380L, 2396L,
2400L, 3075L, 3079L, 3294L, 3298L, 3350L, 3366L, 3370L, 3391L,
3395L, 3467L, 4223L, 4239L, 4243L, 4264L, 4268L, 4433L, 4437L,
4708L, 4805L, 4902L, 5193L, 5969L, 7909L, 8006L))
and this is the structure:
> str(df)
'data.frame': 96 obs. of 5 variables:
$ new_day : int -25 3 7 -7 3 7 -7 0 -25 3 ...
$ order : Factor w/ 14 levels "Alteromonadales",..: 8 8 8 8 8 8 8 8 11 11 ...
$ family : Factor w/ 17 levels "Burkholderiaceae",..: 13 13 13 13 12 12 12 12 15 15 ...
$ genus : Factor w/ 24 levels "Burkholderiaceae_unclassified",..: 16 16 16 16 7 7 7 7 3 3 ...
$ Abundance: num 0.758 0.728 0.422 0.186 7.366 ...
my data is about relative abundances of species over time, I removed rare species so it doesn't add up to 100 % anymore,
but that is fine, it is about 98 % per date. However, I get these weird free polygons and triangles which I recognize from incorrect grouping etc., but the group parameter did not change anything here. I also tried several position and stat arguments, which did not help. Maybe it is about the order of factors or something?
What I'm looking for is a stacked plot of the abundances of cumulated orders without empty spaces in between etc. Create proportional geom_area plot directly in ggplot2
# Area plot of Abundance over new_day, filled by taxonomic order.
# df is passed in as-is (no aggregation), so an order can contribute several
# rows for the same new_day.
ggplot(df, aes(x = new_day, y = Abundance, fill = order)) +
geom_area(stat = "identity") +
# Dashed vertical reference line at new_day = 0.
geom_vline(aes(xintercept = 0), linetype = "dashed", size = 1.2)
I get fewer weird shapes when going to a more detailed hierarchical level (genus instead of order)
# Same area plot, filled by genus instead of order; the stacking position
# is spelled out explicitly here.
ggplot(df, aes(x = new_day, y = Abundance, fill = genus)) +
geom_area(stat = "identity", position = "stack") +
# Dashed vertical reference line at new_day = 0.
geom_vline(aes(xintercept = 0), linetype = "dashed", size = 1.2)
but these are still more blank areas than there should be by the sum of abundances for a given time
# Total abundance on a single day (new_day == -25), summed over all rows
sum(subset(df, new_day == -25)$Abundance)
[1] 98.03997
Any suggestions on how to fix this?
The problem is that you sometimes have several abundance values for one new_day, even with more detailed hierarchical levels.
This is what creates discontinuities in the area plot. You need to have only one unique value for each new_day. In my example below, I just take the first abundance value after grouping by new_day and order, but it is probably not relevant for what you want to show. (You may want to take the mean or attributes these values to other new_day points in between, whatever you need).
The remaining little gaps are caused by the missing abundance values, since as you said, it does not add up to 100%. This is not a big deal, but you can probably fix it by replacing the missing values by 0.
EDIT : Now doing the sum of abundance values as you mentioned, and removing the small remaining gaps by replacing missing values by 0.
library(tidyverse)

# Collapse to a single abundance value per (new_day, order). Several rows can
# share the same day and order, which is what breaks the stacked areas.
data <- df %>%
  group_by(new_day, order) %>%
  summarise(abundance = sum(Abundance)) %>%
  ungroup() %>%
  # Add a zero-abundance row for every (new_day, order) combination that is
  # absent, so that each order has a value at every day.
  complete(new_day, order, fill = list(abundance = 0))

# Stacked area plot of the aggregated data, with a dashed line at day 0.
ggplot(data, aes(x = new_day, y = abundance, fill = order)) +
  geom_area(stat = "identity") +
  geom_vline(aes(xintercept = 0), linetype = "dashed", size = 1.2)

support subtraction between two different itemsets when having same items

Using arules, I have got two itemsets, and I want to do subtraction between the two different itemsets when having same items.
> inspect(fsets_model_test)
items support count
[1] {SURFSKINTEMP=6,MODIS_LST=1} 0.01235235 663
[2] {TOTCO=13,MODIS_LST=1} 0.01373104 737
[3] {TOTCO=6,MODIS_LST=1} 0.01393598 748
[4] {TOTO3=15,MODIS_LST=1} 0.01265045 679
[5] {TOTH2OVAP=6,MODIS_LST=1} 0.01548236 831
[6] {TOTH2OVAP=1,MODIS_LST=1} 0.01565004 840
> inspect(fsets_nonsesmic_test)
items support count
[1] {TOTCO=6,MODIS_LST=1} 0.02192761 10013
[2] {TOTCO=13,MODIS_LST=1} 0.02261524 10327
[3] {TOTO3=15,MODIS_LST=1} 0.02432556 11108
[4] {SURFAIRTEMP=3,TOTH2OVAP=1,MODIS_LST=1} 0.01772735 8095
[5] {TOTH2OVAP=1,MODIS_LST=1} 0.02873605 13122
[6] {SURFAIRTEMP=3,TOTH2OVAP=1} 0.01856828 8479
you can see that itemsets fsets_model_test and itemsets fsets_nonsesmic_test have same items {TOTO3=15,MODIS_LST=1}
What I want to do is subtract support between two itemsets, in above case is
0.02432556 - 0.01265045 = 0.01167511, and then get a new itemsets.
How to implement this in arules, thanks
following are the example itemsets
one itemsets
fsets_model_test <- new("itemsets"
, items = new("itemMatrix"
, data = new("ngCMatrix"
, i = c(5L, 121L, 74L, 121L, 67L, 121L, 59L, 121L, 33L, 121L, 28L,
121L)
, p = c(0L, 2L, 4L, 6L, 8L, 10L, 12L)
, Dim = c(125L, 6L)
, Dimnames = list(NULL, NULL)
, factors = list()
)
, itemInfo = structure(list(labels = c("SURFSKINTEMP=1", "SURFSKINTEMP=2",
"SURFSKINTEMP=3", "SURFSKINTEMP=4", "SURFSKINTEMP=5", "SURFSKINTEMP=6",
"SURFSKINTEMP=7", "SURFSKINTEMP=8", "SURFSKINTEMP=9", "SURFSKINTEMP=10",
"SURFSKINTEMP=11", "SURFSKINTEMP=12", "SURFSKINTEMP=13", "SURFSKINTEMP=14",
"SURFSKINTEMP=15", "SURFSKINTEMP=16", "SURFAIRTEMP=1", "SURFAIRTEMP=2",
"SURFAIRTEMP=3", "SURFAIRTEMP=4", "SURFAIRTEMP=5", "SURFAIRTEMP=6",
"SURFAIRTEMP=7", "SURFAIRTEMP=8", "SURFAIRTEMP=9", "SURFAIRTEMP=10",
"SURFAIRTEMP=11", "SURFAIRTEMP=12", "TOTH2OVAP=1", "TOTH2OVAP=2",
"TOTH2OVAP=3", "TOTH2OVAP=4", "TOTH2OVAP=5", "TOTH2OVAP=6", "TOTH2OVAP=7",
"TOTH2OVAP=8", "TOTH2OVAP=9", "TOTH2OVAP=10", "TOTH2OVAP=11",
"TOTH2OVAP=12", "TOTH2OVAP=13", "TOTH2OVAP=14", "TOTH2OVAP=15",
"TOTH2OVAP=16", "TOTH2OVAP=17", "TOTO3=1", "TOTO3=2", "TOTO3=3",
"TOTO3=4", "TOTO3=5", "TOTO3=6", "TOTO3=7", "TOTO3=8", "TOTO3=9",
"TOTO3=10", "TOTO3=11", "TOTO3=12", "TOTO3=13", "TOTO3=14", "TOTO3=15",
"TOTO3=16", "TOTO3=17", "TOTCO=1", "TOTCO=2", "TOTCO=3", "TOTCO=4",
"TOTCO=5", "TOTCO=6", "TOTCO=7", "TOTCO=8", "TOTCO=9", "TOTCO=10",
"TOTCO=11", "TOTCO=12", "TOTCO=13", "TOTCO=14", "TOTCO=15", "TOTCH4=1",
"TOTCH4=2", "TOTCH4=3", "TOTCH4=4", "TOTCH4=5", "TOTCH4=6", "TOTCH4=7",
"TOTCH4=8", "TOTCH4=9", "TOTCH4=10", "TOTCH4=11", "TOTCH4=12",
"TOTCH4=13", "TOTCH4=14", "OLR_ARIS=1", "OLR_ARIS=2", "OLR_ARIS=3",
"OLR_ARIS=4", "OLR_ARIS=5", "OLR_ARIS=6", "OLR_ARIS=7", "OLR_ARIS=8",
"OLR_ARIS=9", "OLR_ARIS=10", "CLROLR_ARIS=1", "CLROLR_ARIS=2",
"CLROLR_ARIS=3", "CLROLR_ARIS=4", "CLROLR_ARIS=5", "CLROLR_ARIS=6",
"CLROLR_ARIS=7", "CLROLR_ARIS=8", "CLROLR_ARIS=9", "CLROLR_ARIS=10",
"OLR_NOAA=1", "OLR_NOAA=2", "OLR_NOAA=3", "OLR_NOAA=4", "OLR_NOAA=5",
"OLR_NOAA=6", "OLR_NOAA=7", "OLR_NOAA=8", "OLR_NOAA=9", "OLR_NOAA=10",
"MODIS_LST=1", "MODIS_LST=2", "MODIS_LST=3", "MODIS_LST=4"),
variables = structure(c(6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L,
9L, 9L, 9L, 9L, 9L, 9L, 10L, 10L, 10L, 10L, 10L, 10L, 10L,
10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 2L, 2L, 2L,
2L), .Label = c("CLROLR_ARIS", "MODIS_LST", "OLR_ARIS", "OLR_NOAA",
"SURFAIRTEMP", "SURFSKINTEMP", "TOTCH4", "TOTCO", "TOTH2OVAP",
"TOTO3"), class = "factor"), levels = structure(c(1L, 10L,
11L, 12L, 13L, 14L, 15L, 16L, 17L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 1L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 2L, 3L, 4L,
1L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 1L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 1L, 10L, 11L, 12L, 13L, 14L,
15L, 16L, 17L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 10L, 11L, 12L,
13L, 14L, 15L, 16L, 17L, 2L, 3L, 4L, 5L, 6L, 1L, 10L, 11L,
12L, 13L, 14L, 15L, 16L, 17L, 2L, 1L, 10L, 11L, 12L, 13L,
14L, 15L, 16L, 17L, 2L, 1L, 10L, 11L, 12L, 13L, 14L, 15L,
16L, 17L, 2L, 1L, 10L, 11L, 12L), .Label = c("1", "10", "11",
"12", "13", "14", "15", "16", "17", "2", "3", "4", "5", "6",
"7", "8", "9"), class = "factor")), .Names = c("labels",
"variables", "levels"), row.names = c(NA, -125L), class = "data.frame")
, itemsetInfo = structure(list(), .Names = character(0), row.names = integer(0), class = "data.frame")
)
, tidLists = NULL
, quality = structure(list(support = c(0.0123523493684093, 0.0137310429630734,
0.0139359839028207, 0.0126504452807691, 0.0154823564481872, 0.0156500353988896
), count = c(663, 737, 748, 679, 831, 840)), .Names = c("support",
"count"), row.names = c(NA, 6L), class = "data.frame")
, info = structure(list(data = model_data_tr, ntransactions = 53674L,
support = 0.01), .Names = c("data", "ntransactions", "support"
))
)
another itemsets is:
fsets_nonsesmic_test <- new("itemsets"
, items = new("itemMatrix"
, data = new("ngCMatrix"
, i = c(67L, 121L, 74L, 121L, 59L, 121L, 18L, 28L, 121L, 28L, 121L,
18L, 28L)
, p = c(0L, 2L, 4L, 6L, 9L, 11L, 13L)
, Dim = c(125L, 6L)
, Dimnames = list(NULL, NULL)
, factors = list()
)
, itemInfo = structure(list(labels = c("SURFSKINTEMP=1", "SURFSKINTEMP=2",
"SURFSKINTEMP=3", "SURFSKINTEMP=4", "SURFSKINTEMP=5", "SURFSKINTEMP=6",
"SURFSKINTEMP=7", "SURFSKINTEMP=8", "SURFSKINTEMP=9", "SURFSKINTEMP=10",
"SURFSKINTEMP=11", "SURFSKINTEMP=12", "SURFSKINTEMP=13", "SURFSKINTEMP=14",
"SURFSKINTEMP=15", "SURFSKINTEMP=16", "SURFAIRTEMP=1", "SURFAIRTEMP=2",
"SURFAIRTEMP=3", "SURFAIRTEMP=4", "SURFAIRTEMP=5", "SURFAIRTEMP=6",
"SURFAIRTEMP=7", "SURFAIRTEMP=8", "SURFAIRTEMP=9", "SURFAIRTEMP=10",
"SURFAIRTEMP=11", "SURFAIRTEMP=12", "TOTH2OVAP=1", "TOTH2OVAP=2",
"TOTH2OVAP=3", "TOTH2OVAP=4", "TOTH2OVAP=5", "TOTH2OVAP=6", "TOTH2OVAP=7",
"TOTH2OVAP=8", "TOTH2OVAP=9", "TOTH2OVAP=10", "TOTH2OVAP=11",
"TOTH2OVAP=12", "TOTH2OVAP=13", "TOTH2OVAP=14", "TOTH2OVAP=15",
"TOTH2OVAP=16", "TOTH2OVAP=17", "TOTO3=1", "TOTO3=2", "TOTO3=3",
"TOTO3=4", "TOTO3=5", "TOTO3=6", "TOTO3=7", "TOTO3=8", "TOTO3=9",
"TOTO3=10", "TOTO3=11", "TOTO3=12", "TOTO3=13", "TOTO3=14", "TOTO3=15",
"TOTO3=16", "TOTO3=17", "TOTCO=1", "TOTCO=2", "TOTCO=3", "TOTCO=4",
"TOTCO=5", "TOTCO=6", "TOTCO=7", "TOTCO=8", "TOTCO=9", "TOTCO=10",
"TOTCO=11", "TOTCO=12", "TOTCO=13", "TOTCO=14", "TOTCO=15", "TOTCH4=1",
"TOTCH4=2", "TOTCH4=3", "TOTCH4=4", "TOTCH4=5", "TOTCH4=6", "TOTCH4=7",
"TOTCH4=8", "TOTCH4=9", "TOTCH4=10", "TOTCH4=11", "TOTCH4=12",
"TOTCH4=13", "TOTCH4=14", "OLR_ARIS=1", "OLR_ARIS=2", "OLR_ARIS=3",
"OLR_ARIS=4", "OLR_ARIS=5", "OLR_ARIS=6", "OLR_ARIS=7", "OLR_ARIS=8",
"OLR_ARIS=9", "OLR_ARIS=10", "CLROLR_ARIS=1", "CLROLR_ARIS=2",
"CLROLR_ARIS=3", "CLROLR_ARIS=4", "CLROLR_ARIS=5", "CLROLR_ARIS=6",
"CLROLR_ARIS=7", "CLROLR_ARIS=8", "CLROLR_ARIS=9", "CLROLR_ARIS=10",
"OLR_NOAA=1", "OLR_NOAA=2", "OLR_NOAA=3", "OLR_NOAA=4", "OLR_NOAA=5",
"OLR_NOAA=6", "OLR_NOAA=7", "OLR_NOAA=8", "OLR_NOAA=9", "OLR_NOAA=10",
"MODIS_LST=1", "MODIS_LST=2", "MODIS_LST=3", "MODIS_LST=4"),
variables = structure(c(6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L,
9L, 9L, 9L, 9L, 9L, 9L, 10L, 10L, 10L, 10L, 10L, 10L, 10L,
10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 2L, 2L, 2L,
2L), .Label = c("CLROLR_ARIS", "MODIS_LST", "OLR_ARIS", "OLR_NOAA",
"SURFAIRTEMP", "SURFSKINTEMP", "TOTCH4", "TOTCO", "TOTH2OVAP",
"TOTO3"), class = "factor"), levels = structure(c(1L, 10L,
11L, 12L, 13L, 14L, 15L, 16L, 17L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 1L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 2L, 3L, 4L,
1L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 1L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 1L, 10L, 11L, 12L, 13L, 14L,
15L, 16L, 17L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 10L, 11L, 12L,
13L, 14L, 15L, 16L, 17L, 2L, 3L, 4L, 5L, 6L, 1L, 10L, 11L,
12L, 13L, 14L, 15L, 16L, 17L, 2L, 1L, 10L, 11L, 12L, 13L,
14L, 15L, 16L, 17L, 2L, 1L, 10L, 11L, 12L, 13L, 14L, 15L,
16L, 17L, 2L, 1L, 10L, 11L, 12L), .Label = c("1", "10", "11",
"12", "13", "14", "15", "16", "17", "2", "3", "4", "5", "6",
"7", "8", "9"), class = "factor")), .Names = c("labels",
"variables", "levels"), row.names = c(NA, -125L), class = "data.frame")
, itemsetInfo = structure(list(), .Names = character(0), row.names = integer(0), class = "data.frame")
)
, tidLists = NULL
, quality = structure(list(support = c(0.0219276058330541, 0.0226152387334415,
0.024325561329628, 0.0177273513650827, 0.0287360475123675, 0.0185682782241552
), count = c(10013, 10327, 11108, 8095, 13122, 8479)), .Names = c("support",
"count"), row.names = c(NA, 6L), class = "data.frame")
, info = structure(list(data = nonsesmic_data_tr, ntransactions = 456639L,
support = 0.01), .Names = c("data", "ntransactions", "support"
))
)
If the two sets come from compatible transaction data (see ?itemCoding), then you can use match to find the matching itemsets in the two sets. After that, it should be easy to subtract the support values.

R: Error when using knncat to do classification on categorical variable

I have a data set with 4 columns, 2 of which are numeric, 1 is categorical and 1 is the label. The label has 13 levels (A to M). I tried to use knncat package in R to do classification, but every time I ran the code, I got the following error message:
Error in `[<-.data.frame`(`*tmp*`, factor.vars, value = c("M", "J", "K", :
replacement has 45500 rows, data has 1
The following is the code I used:
# Read the full data set from CSV; the first row of the file holds the column names.
data <- read.csv('mosaic_data2.csv', header = T)
# Number of rows in the data set.
num <- dim(data)[1]
# NOTE(review): nothing visible in this snippet calls into `sampling`
# (sample() below is base R) -- confirm whether this library is needed.
library(sampling)
# Fix the RNG seed so the random train/test split is reproducible.
set.seed(1234)
# Draw 70% of the row indices (rounded down), without replacement, for training ...
train_index <- sample(seq(1,num,1), floor(num * 0.7), replace = F)
# ... and keep every remaining row index for testing.
test_index <- setdiff(seq(1,num,1), train_index)
train_data <- data[train_index,]
test_data <- data[test_index,]
library(knncat)
# Fit the classifier; classcol names the column that holds the class labels.
# NOTE(review): per the dput() output below, column 2 is `longitude`.
model <- knncat(train_data, classcol = 2)
Could anyone please take a look at the code and advise how I could eliminate this bug? Thank you very much!
The output of dput(head(data,100)) is as follows:
structure(list(latitude = c(52.7326028, 52.74287543, 52.82107841,
52.82025363, 52.81980596, 52.81721897, 52.81274172, 52.81274172,
52.8089586, 52.81424219, 52.8089586, 52.74007929, 52.77394023,
52.73659034, 52.73672518, 52.73764626, 52.73753744, 52.73659034,
52.73815233, 52.73679388, 52.73890319, 52.71697237, 52.63730282,
52.62720385, 52.63730282, 52.63543017, 52.63768035, 52.63510366,
52.6346578, 52.6346578, 52.6346578, 52.63447454, 52.63576418,
52.63447454, 52.6346578, 52.63447454, 52.69820719, 52.69603926,
52.68246919, 52.54600173, 52.54210198, 52.60628983, 52.61003275,
52.60278236, 52.60239604, 52.60348688, 52.60239604, 52.60382146,
52.60315644, 52.86047938, 52.86576353, 52.86954228, 52.81039471,
52.82094872, 52.82395073, 52.82444705, 52.88098384, 52.88469208,
52.88469208, 52.84979201, 52.84720159, 52.84831759, 52.82435938,
52.82319493, 52.82168337, 52.8230402, 52.8230402, 52.82513486,
52.82472379, 52.82756385, 52.82475438, 52.82434902, 52.82166611,
52.823712, 52.82401481, 52.82483489, 52.82103704, 52.82060763,
52.8208682, 52.82211317, 52.81868547, 52.8198332, 52.82023595,
52.81989134, 52.8196971, 52.82051066, 52.82463338, 52.82539131,
52.82580625, 52.82509199, 52.83759415, 52.83946254, 52.83946254,
52.83891871, 52.83821538, 52.84757879, 52.84663773, 52.8449371,
52.84592185, 52.84331619), longitude = c(-6.892397941, -6.915346343,
-6.922554014, -6.924997835, -6.926099967, -6.883340697, -6.897757597,
-6.897757597, -6.895500952, -6.883129556, -6.895500952, -6.703781864,
-6.680851783, -6.771845364, -6.773301282, -6.772958488, -6.77484647,
-6.771845364, -6.773422218, -6.772164896, -6.770622695, -6.784187251,
-6.901922588, -6.905109015, -6.901922588, -6.976679508, -6.973114498,
-6.974753462, -6.947990431, -6.947990431, -6.947990431, -6.976921427,
-6.958295227, -6.976921427, -6.947990431, -6.976921427, -6.902010609,
-6.915233457, -6.871160885, -6.832461149, -6.862126342, -6.943925285,
-6.93813643, -6.925128034, -6.932247524, -6.93461305, -6.932247524,
-6.934657053, -6.929283954, -6.845259603, -6.861188287, -6.866476268,
-6.940851164, -6.939203401, -6.930506188, -6.933317462, -6.929441954,
-6.922589037, -6.922589037, -6.926037258, -6.929423169, -6.917829279,
-6.938211918, -6.940658091, -6.940651748, -6.940107883, -6.940107883,
-6.938704642, -6.939084526, -6.933331264, -6.937496468, -6.937678962,
-6.940276221, -6.94018054, -6.939876475, -6.938983181, -6.934235666,
-6.93387209, -6.933134226, -6.934193569, -6.934383596, -6.933832641,
-6.937454656, -6.933818238, -6.93443811, -6.936913947, -6.920030341,
-6.920400963, -6.92215006, -6.910771124, -6.901500591, -6.899018998,
-6.899018998, -6.903007684, -6.90119821, -6.91063672, -6.909935672,
-6.90240965, -6.900066763, -6.901411136), mosaic_group = structure(c(10L,
10L, 8L, 8L, 8L, 7L, 7L, 7L, 7L, 7L, 7L, 10L, 10L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L,
12L, 12L, 12L, 12L, 12L, 12L, 10L, 10L, 10L, 13L, 13L, 13L, 13L,
9L, 6L, 6L, 6L, 6L, 6L, 10L, 8L, 8L, 9L, 9L, 9L, 9L, 7L, 7L,
7L, 9L, 9L, 9L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 8L, 8L, 8L, 8L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 8L,
6L, 6L, 6L, 6L, 6L, 8L, 8L, 10L, 10L, 10L), .Label = c("A", "B",
"C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M"), class = "factor"),
small_code = c(1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L,
4L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 8L, 8L, 8L, 9L,
9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 10L, 10L, 10L, 11L,
11L, 12L, 12L, 13L, 14L, 14L, 14L, 14L, 14L, 15L, 16L, 16L,
17L, 17L, 18L, 18L, 19L, 19L, 19L, 20L, 20L, 20L, 21L, 21L,
21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L,
22L, 22L, 22L, 22L, 23L, 23L, 23L, 23L, 23L, 23L, 24L, 24L,
24L, 25L, 26L, 26L, 26L, 26L, 26L, 27L, 27L, 28L, 28L, 28L
)), .Names = c("latitude", "longitude", "mosaic_group", "small_code"
), row.names = c(NA, 100L), class = "data.frame")
The function knncat::knncat accepts the argument classcol which is defined as:
Column with classification in it. Default: 1.
You have a data set with structure:
latitude longitude mosaic_group small_code
1 52.73260 -6.892398 J 1
2 52.74288 -6.915346 J 1
3 52.82108 -6.922554 H 2
4 52.82025 -6.924998 H 2
5 52.81981 -6.926100 H 2
6 52.81722 -6.883341 G 3
Therefore, I assume your argument should be classcol = 3 (or possibly 4). Either way, we can see that it certainly shouldn't be classcol = 2, which points at the longitude column.

R - Manual legend color in geom_line ggplot2

I am trying to manually change the color of only the first item of a legend in a ggplot2 line plot.
I have several observations of a variable that I am displaying in a line plot, just like this:
# One line-plus-points series per level of `variable`, coloured by `variable`.
ggplot(
  data = tmp1,
  mapping = aes(x = factor(month), y = value, group = variable, colour = variable)
) +
  geom_line(size = 1) +
  geom_point(size = 2.5) +
  theme_grey(base_size = 18) +
  # No x-axis title, "%" on the y axis, and the panel title.
  labs(x = NULL, y = "%", title = "a) Cloud fraction") +
  theme(
    legend.title = element_blank(),       # no heading above the legend
    axis.text.x = element_blank(),        # hide the x tick labels
    plot.title = element_text(hjust = 0)  # left-align the title
  )
However, the first variable (CRU) is my reference and I would like to show its line in black. I managed to do this by adding one extra geom_line with the condition variable=='CRU':
# Same plot as before, with the CRU series drawn a second time in black on top
# of the coloured series. Layer order is kept: all series first, CRU last.
ggplot(
  data = tmp1,
  mapping = aes(x = factor(month), y = value, group = variable, colour = variable)
) +
  geom_line(size = 1) +
  geom_point(size = 2.5) +
  theme_grey(base_size = 18) +
  # Black overlay for the reference series only.
  geom_line(
    data = subset(tmp1, variable == "CRU"),
    colour = "black",
    linetype = "solid",
    size = 1
  ) +
  geom_point(
    data = subset(tmp1, variable == "CRU"),
    colour = "black",
    size = 2.5
  ) +
  labs(x = NULL, y = "%", title = "a) Cloud fraction") +
  theme(
    legend.title = element_blank(),       # no heading above the legend
    axis.text.x = element_blank(),        # hide the x tick labels
    plot.title = element_text(hjust = 0)  # left-align the title
  )
which works for the line, but the legend keeps the old colour.
How can I change the color of just the first element of the legend, in order to match the new black line?
This is an example of my data:
library(ggplot2)
tmp1 <- structure(list(month = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
11L, 12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L,
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L, 6L,
7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L,
12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L,
9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L,
12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 10L, 11L, 12L), .Label = c("Jan", "Feb", "Mar",
"Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
), class = "factor"), variable = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L,
9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 10L, 10L, 10L, 10L, 10L, 10L,
10L, 10L, 10L, 10L, 10L, 10L, 11L, 11L, 11L, 11L, 11L, 11L, 11L,
11L, 11L, 11L, 11L, 11L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L,
12L, 12L, 12L, 12L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L,
13L, 13L, 13L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L,
14L, 14L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L,
15L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L
), .Label = c("CRU", "CanESM2", "GFDL-ESM2M", "GISS-E2-H", "GISS-E2-R-CC",
"GISS-E2-R", "HadGEM2-AO", "HadGEM2-CC", "IPSL-CM5A-MR", "IPSL-CM5B-LR",
"MIROC4h", "MRI-CGCM3", "NorESM1-M", "bcc-csm1-1-m", "bcc-csm1-1",
"inmcm4"), class = "factor"), value = c(68.9226631460789, 68.2418796877392,
68.3045372212868, 66.5727907036073, 64.278360290491, 60.6452267972856,
56.4079999829923, 57.4384828307567, 60.874295882443, 63.70427487797,
65.9934520468731, 68.9723871966257, 69.0959015590216, 68.6126351492122,
65.9106136896166, 65.790169283913, 64.6320994816801, 63.894111784301,
62.0459530253135, 60.0455773681386, 59.4195693791228, 59.8531302282566,
62.8877658601921, 66.4625078340445, 63.4659654507164, 64.5466810117518,
63.6412932878715, 61.5786848043378, 60.6491980933614, 63.5160886052168,
62.739218138279, 60.8826348052995, 60.1196738813257, 59.0451443027396,
58.9044684656519, 61.5033887899156, 62.442928703121, 61.9933297554931,
61.686560285787, 62.1675956585161, 63.0625380934021, 63.3192922622326,
62.6727899590586, 60.9706714311941, 59.4656895840826, 59.8689092461429,
60.7585523645951, 62.2374164636759, 62.2586495696979, 62.3005886556949,
62.0719314334763, 61.7786313583016, 62.1037020616999, 62.5919637033876,
60.7746642298107, 58.7307471416832, 57.6602849809809, 57.3379551651851,
59.8210398283061, 61.5997238276034, 62.1190176575675, 62.2214930174241,
61.607539296931, 61.836536870373, 61.8298589429815, 62.0478835210295,
60.8165122782774, 59.224498365607, 57.5387307267022, 56.8641846144649,
59.6779581588162, 61.5822371331742, 56.9625864272884, 55.0519081266715,
53.9161532646461, 52.0847886852487, 54.1855963059705, 54.1565901942167,
53.8164314129289, 53.3013959169719, 52.1283494730607, 49.9814907883562,
51.0053330490513, 54.1758812796363, 54.1947459143536, 53.2985061657513,
51.5351727215781, 51.2131541342776, 53.040182168441, 53.4657505459587,
52.8257974728027, 52.8523832284788, 51.2527233914323, 48.0999294191007,
48.3915726340961, 50.9305288780026, 65.3647375158419, 64.6894843930494,
62.2700707798592, 60.2848148985731, 59.0797813854392, 58.6641353922813,
60.36671822738, 61.0883458866571, 60.3963355506111, 60.989444946264,
62.1570976843054, 64.0549504714623, 63.043822206253, 61.5388900651697,
61.0125502971802, 60.4999006674972, 60.9554692113674, 61.2665703834057,
61.1470225339614, 61.4827838311531, 60.0397138517742, 61.6503963603034,
62.7421837830534, 63.9911949044232, 55.7117557057576, 55.0687784028633,
51.7447044604762, 50.5160095376821, 51.7744811245234, 52.6710116909617,
52.9126480516047, 51.6347065362984, 50.6773480024225, 48.8928054774924,
50.3505731163001, 53.7488684714513, 61.558109087334, 61.6673093977654,
61.008465555097, 58.5478578294864, 57.4119260976748, 57.9275733769477,
56.9129774651439, 55.6494927089111, 52.0222406797903, 51.9215916366208,
53.4679949695072, 58.2128251869788, 64.7955701998493, 62.8319013929061,
60.8391061131818, 56.1759467734789, 55.4331550199683, 55.8437923896573,
54.998540828777, 54.7840203124691, 54.3853750266133, 52.7590435522892,
56.1409799671355, 62.0047140533332, 57.5185465474672, 57.2532289998115,
55.9911913829976, 54.6479285609432, 53.1659722964534, 53.3609799276622,
51.321452599498, 49.6933914680193, 48.6718229103421, 49.5393207890844,
52.8096091918065, 56.1667672797739, 60.7380412023987, 60.1791897430251,
58.7798069796932, 58.061108119255, 59.7770862278418, 60.2070273632675,
59.074898814382, 55.5571990297011, 53.8564792650491, 54.0753885029223,
56.2369958393563, 58.9062125901571, 70.7538119957697, 69.4271857400385,
67.3954189057409, 66.9262104442679, 67.1558044757422, 65.8848885390536,
65.3092556552615, 64.3799468889004, 64.9999333535186, 65.6493831700943,
69.2646980549075, 70.6342115226731)), row.names = c(NA, -192L
), .Names = c("month", "variable", "value"), class = "data.frame")
Instead of splitting up the data and plotting two geom_lines, you can simply supply a custom colour palette in which CRU is mapped to black.
If you want to keep the default colours for the other variables, you first need to define a little helper function to retrieve them the way ggplot2 does it.
#' Build a palette of evenly spaced HCL hues
#'
#' Helper meant to retrieve colours the way ggplot2 picks its default discrete
#' colours: hues evenly spaced around the colour wheel starting at 15 degrees,
#' with luminance 65 and chroma 100.
#'
#' @param n Number of colours wanted (a non-negative whole number).
#' @return A character vector of `n` hex colour strings; `character(0)` when
#'   `n` is 0.
gg_color_hue <- function(n) {
  # n + 1 evenly spaced hues over [15, 375]; the last one (375) is the same
  # angle as the first (15 + 360), so it is dropped below.
  hues <- seq(15, 375, length.out = n + 1)
  # seq_len(n) rather than 1:n, because 1:0 is c(1, 0) and would return one
  # colour instead of an empty palette for n = 0.
  hcl(h = hues, l = 65, c = 100)[seq_len(n)]
}
Then create a custom colour palette vector, combining the standard palette and black. Since CRU is the first level of your factor variable (with 16 levels in total), this is simply
# Black for the first factor level (CRU), followed by one generated hue for
# every remaining level. The count is derived from the factor instead of being
# hard-coded, so the palette length keeps matching if levels are added or
# removed (for the data above this is 16 - 1 = 15).
custom_palette <- c("#000000", gg_color_hue(nlevels(tmp1$variable) - 1L))
The following then produces your desired plot:
# Single set of layers; the manual colour scale takes its colours from
# custom_palette, so lines, points and legend keys all share them.
ggplot(
  data = tmp1,
  mapping = aes(x = factor(month), y = value, group = variable, colour = variable)
) +
  geom_line(size = 1) +
  geom_point(size = 2.5) +
  scale_colour_manual(values = custom_palette) +
  theme_grey(base_size = 18) +
  # No x-axis title, "%" on the y axis, and the panel title.
  labs(x = NULL, y = "%", title = "a) Cloud fraction") +
  theme(
    legend.title = element_blank(),       # no heading above the legend
    axis.text.x = element_blank(),        # hide the x tick labels
    plot.title = element_text(hjust = 0)  # left-align the title
  )

Resources