R: stacked geom_area plot displays blank polygons - r

I need some help regarding transforming a geom_bar into a geom_area plot. This is my df:
dput(df)
df <- structure(list(new_day = c(-25L, 3L, 7L, -7L, 3L, 7L, -7L, 0L,
-25L, 3L, 7L, -7L, 0L, -25L, 3L, 7L, -7L, 0L, -25L, 3L, 7L, -7L,
0L, -25L, 3L, 7L, -7L, 0L, -25L, 3L, 7L, -7L, 0L, 3L, 7L, -7L,
0L, -25L, 3L, 7L, -7L, 0L, -25L, 3L, 7L, -7L, 0L, -25L, 3L, 7L,
-25L, 3L, 7L, -7L, 0L, 3L, -7L, 0L, -25L, 7L, 3L, 7L, -7L, 0L,
-25L, 3L, 7L, -7L, 0L, -25L, 3L, 7L, 3L, 7L, -7L, 0L, -25L, 3L,
7L, -7L, 0L, 7L, -25L, 3L, 7L, -7L, 0L, 3L, 7L, -25L, -25L, -25L,
-25L, -25L, -25L, -25L), order = structure(c(8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 11L, 11L, 11L, 11L, 11L, 13L, 13L, 13L, 13L,
13L, 10L, 10L, 10L, 10L, 10L, 7L, 7L, 7L, 7L, 7L, 2L, 2L, 2L,
2L, 2L, 7L, 7L, 7L, 7L, 9L, 9L, 9L, 9L, 9L, 1L, 1L, 1L, 1L, 1L,
9L, 9L, 9L, 2L, 2L, 2L, 2L, 2L, 4L, 4L, 4L, 13L, 13L, 14L, 14L,
14L, 14L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 13L, 13L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 2L, 2L, 2L, 2L, 2L, 6L, 6L, 1L, 7L, 5L, 2L,
12L, 2L, 2L), .Label = c("Alteromonadales", "Betaproteobacteriales",
"Caulobacterales", "Chitinophagales", "Flavobacteriales", "Parvibaculales",
"Pseudomonadales", "Rhizobiales", "Rhodobacterales", "Rhodospirillales",
"Sneathiellales", "Sphingobacteriales", "Sphingomonadales", "Thalassobaculales"
), class = "factor"), family = structure(c(13L, 13L, 13L, 13L,
12L, 12L, 12L, 12L, 15L, 15L, 15L, 15L, 15L, 16L, 16L, 16L, 16L,
16L, 7L, 7L, 7L, 7L, 7L, 11L, 11L, 11L, 11L, 11L, 1L, 1L, 1L,
1L, 1L, 11L, 11L, 11L, 11L, 14L, 14L, 14L, 14L, 14L, 4L, 4L,
4L, 4L, 4L, 14L, 14L, 14L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 16L,
16L, 17L, 17L, 17L, 17L, 8L, 8L, 8L, 8L, 8L, 5L, 5L, 5L, 16L,
16L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 8L, 8L, 8L, 8L,
8L, 10L, 10L, 6L, 11L, 3L, 1L, 9L, 1L, 1L), .Label = c("Burkholderiaceae",
"Chitinophagaceae", "Flavobacteriaceae", "Gallaecimonadaceae",
"Hyphomonadaceae", "Idiomarinaceae", "Magnetospiraceae", "Methylophilaceae",
"NS11-12_marine_group", "Parvibaculaceae", "Pseudomonadaceae",
"Rhizobiaceae", "Rhizobiales_unclassified", "Rhodobacteraceae",
"Sneathiellaceae", "Sphingomonadaceae", "Thalassobaculaceae"), class = "factor"),
genus = structure(c(16L, 16L, 16L, 16L, 7L, 7L, 7L, 7L, 3L,
3L, 3L, 3L, 3L, 19L, 19L, 19L, 19L, 19L, 24L, 24L, 24L, 24L,
24L, 14L, 14L, 14L, 14L, 14L, 17L, 17L, 17L, 17L, 17L, 14L,
14L, 14L, 14L, 15L, 15L, 15L, 15L, 15L, 5L, 5L, 5L, 5L, 5L,
10L, 10L, 10L, 2L, 2L, 2L, 2L, 2L, 22L, 22L, 22L, 20L, 20L,
23L, 23L, 23L, 23L, 11L, 11L, 11L, 11L, 11L, 8L, 8L, 8L,
21L, 21L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 11L, 11L,
11L, 11L, 11L, 13L, 13L, 9L, 14L, 4L, 6L, 12L, 1L, 18L), .Label = c("Burkholderiaceae_unclassified",
"Cupriavidus", "Ferrovibrio", "Flavobacteriaceae_unclassified",
"Gallaecimonas", "GKS98_freshwater_group", "Hoeflea", "Hyphomonas",
"Idiomarina", "Marivivens", "Methylotenera", "NS11-12_marine_group_ge",
"Parvibaculum", "Pseudomonas", "Pseudorhodobacter", "Rhizobiales_unclassified",
"Rhodoferax", "RS62_marine_group", "Sphingomonadaceae_unclassified",
"Sphingopyxis", "Sphingorhabdus", "Terrimonas", "Thalassobaculum",
"uncultured"), class = "factor"), Abundance = c(0.758296593899054,
0.728046713738242, 0.421798852637834, 0.185971692147469,
7.36584152568739, 11.0004160226707, 1.93134577450352, 19.7144376530921,
46.2350237547082, 25.8715062086956, 22.1549641486618, 34.4112477828867,
20.4937613394223, 3.73518219692229, 15.9295990367068, 13.8490383262387,
13.3481723220855, 20.3866145291388, 0.165618346100574, 8.86991024549668,
8.5330814375361, 6.86819004205197, 5.72129192186814, 1.04512973253723,
3.77880217461655, 6.47871112880127, 1.12084852451492, 0.903754246093232,
19.0854333497858, 15.7152146349298, 12.3768753373503, 15.8790763239117,
10.2875187327705, 2.82159106304821, 4.22393981370602, 8.82452898193968,
4.8507226701533, 6.19619716749583, 8.28477594908417, 8.05201189383953,
9.7404731686272, 9.84535225459449, 1.7940554465653, 2.62276259756813,
2.74008811315788, 0.543937440677315, 0.55325167765205, 0.910457573040239,
0.451385497886567, 0.655661306732001, 6.59400178917785, 1.92570846362683,
2.62192443054515, 2.10049053655497, 2.13139299576524, 0.20799245164738,
0.324291631088576, 0.369492771993701, 1.52162438803598, 0.151864202275619,
0.420953084533189, 0.391517677365401, 0.29116200940885, 0.232440441774702,
4.21428798609281, 0.859779996836882, 1.33107018783728, 1.013155122065,
0.447286602320585, 0.165001492967355, 0.285983094976304,
0.377758692391269, 0.21556919104275, 0.314057858254493, 0.354649793637887,
0.338799824269294, 0.218027624939685, 0.914324162324944,
1.22932824654674, 0.731649603629864, 0.566393265064962, 0.247942012186621,
1.73171328618728, 0.636597714441988, 0.505393049999761, 0.491318560043637,
0.990988961717433, 0.195417142399681, 0.210412739808352,
0.476107780140271, 0.936663899397428, 0.251540964619117,
0.963667386912928, 0.504905545701818, 0.296220086916766,
0.240809811677774)), class = "data.frame", row.names = c(52L,
68L, 72L, 93L, 165L, 169L, 190L, 194L, 246L, 262L, 266L, 287L,
291L, 343L, 359L, 363L, 384L, 388L, 440L, 456L, 460L, 481L, 485L,
634L, 650L, 654L, 675L, 679L, 731L, 747L, 751L, 772L, 776L, 844L,
848L, 869L, 873L, 925L, 941L, 945L, 966L, 970L, 1022L, 1038L,
1042L, 1063L, 1067L, 1216L, 1232L, 1236L, 1313L, 1329L, 1333L,
1354L, 1358L, 1426L, 1451L, 1455L, 1507L, 1527L, 1717L, 1721L,
1742L, 1746L, 2186L, 2202L, 2206L, 2227L, 2231L, 2380L, 2396L,
2400L, 3075L, 3079L, 3294L, 3298L, 3350L, 3366L, 3370L, 3391L,
3395L, 3467L, 4223L, 4239L, 4243L, 4264L, 4268L, 4433L, 4437L,
4708L, 4805L, 4902L, 5193L, 5969L, 7909L, 8006L))
and this is the structure:
> str(df)
'data.frame': 96 obs. of 5 variables:
$ new_day : int -25 3 7 -7 3 7 -7 0 -25 3 ...
$ order : Factor w/ 14 levels "Alteromonadales",..: 8 8 8 8 8 8 8 8 11 11 ...
$ family : Factor w/ 17 levels "Burkholderiaceae",..: 13 13 13 13 12 12 12 12 15 15 ...
$ genus : Factor w/ 24 levels "Burkholderiaceae_unclassified",..: 16 16 16 16 7 7 7 7 3 3 ...
$ Abundance: num 0.758 0.728 0.422 0.186 7.366 ...
my data is about relative abundances of species over time, I removed rare species so it doesn't add up to 100 % anymore,
but that is fine, it is about 98 % per date. However, I get these weird free polygons and triangles which I recognize from incorrect grouping etc., but the group parameter did not change anything here. I also tried several position and stat arguments, which did not help. Maybe it is about the order of factors or something?
What I'm looking for is a stacked plot of the abundances of cumulated orders without empty spaces in between etc. Create proportional geom_area plot directly in ggplot2
# Stacked area chart of Abundance against new_day, filled by taxonomic order
ggplot(
  data = df,
  mapping = aes(x = new_day, y = Abundance, fill = order)
) +
  geom_area(stat = "identity") +
  # dashed vertical reference line at new_day = 0
  geom_vline(
    mapping = aes(xintercept = 0),
    linetype = "dashed",
    size = 1.2
  )
I get fewer weird shapes when going to a more detailed hierarchical level (genus instead of order)
# Same stacked area chart, but filled by genus instead of order
ggplot(
  data = df,
  mapping = aes(x = new_day, y = Abundance, fill = genus)
) +
  geom_area(stat = "identity", position = "stack") +
  # dashed vertical reference line at new_day = 0
  geom_vline(
    mapping = aes(xintercept = 0),
    linetype = "dashed",
    size = 1.2
  )
but these are still more blank areas than there should be by the sum of abundances for a given time
# Sum of every abundance value recorded on day -25
sum(df$Abundance[which(df$new_day == -25)])
[1] 98.03997
Any suggestions on how to fix this?

The problem is that you sometimes have several abundance values for one new_day, even with more detailed hierarchical levels.
This is what creates discontinuities in the area plot. You need exactly one value per new_day within each group (here, each order). In my example below, I just take the first abundance value after grouping by new_day and order, but it is probably not relevant for what you want to show. (You may want to take the mean, or attribute these values to other new_day points in between, whatever you need.)
The remaining little gaps are caused by the missing abundance values, since as you said, it does not add up to 100%. This is not a big deal, but you can probably fix it by replacing the missing values by 0.
EDIT : Now doing the sum of abundance values as you mentioned, and removing the small remaining gaps by replacing missing values by 0.
library(tidyverse)

# Build one row per (new_day, order) pair:
#   1. sum the abundance values so that each pair has a single value;
#   2. complete() adds every pair that never occurs in df and gives it an
#      abundance of 0, so each order has a value at every time point and the
#      stacked areas have no holes.
data <- df %>%
  group_by(new_day, order) %>%
  summarise(abundance = sum(Abundance)) %>%
  ungroup() %>%
  complete(new_day, order, fill = list(abundance = 0))

ggplot(data, aes(x = new_day, y = abundance, fill = order)) +
  geom_area(stat = "identity") +
  # dashed vertical reference line at new_day = 0
  geom_vline(aes(xintercept = 0), linetype = "dashed", size = 1.2)

Related

Undirected network graph calculated by tidygraph shows more degree centrality than should be possible

I have a cleaned data set with 26 nodes. I am placing these 26 nodes in an undirected network graph using tidygraph, where I use the centrality_degree() function to calculate the centrality degree. However, when I graph the resulting network, my highest possible centrality degree is 40, which should not be possible. When I change the graph to directed, this is corrected.
I am somewhat confused, because with other methods I have used in the past, where I calculated the degree centrality manually, I never once came across this issue.
Is this regular behaviour, or am I doing something wrong?
Reproducible example:
library(tidygraph)
library(ggraph)
library(tidyverse)
nodes <- structure(list(id = 1:26, label = c("a", "b", "c", "d", "e",
"f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r",
"s", "t", "u", "v", "w", "x", "y", "z")), row.names = c(NA, -26L
), class = "data.frame")
edges <- structure(list(from = c(21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L,
21L, 21L, 21L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L,
11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 12L, 12L,
12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L,
12L, 12L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L,
13L, 13L, 13L, 13L, 13L, 13L, 13L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 18L,
18L, 18L, 18L, 18L, 18L, 18L, 16L, 16L, 16L, 16L, 16L, 16L, 16L,
16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 24L,
24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L,
24L, 24L, 24L, 24L, 24L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 14L, 14L, 14L, 14L,
14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L,
10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L,
25L, 25L, 25L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L,
9L, 9L, 9L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L,
22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 15L, 15L, 15L,
15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L,
15L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 20L, 20L, 20L, 20L, 20L, 20L, 20L,
20L, 20L, 20L, 20L, 20L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 17L, 17L, 17L, 17L, 17L),
to = c(1L, 12L, 3L, 16L, 24L, 4L, 10L, 6L, 22L, 2L, 8L, 1L,
12L, 13L, 3L, 18L, 16L, 24L, 5L, 7L, 14L, 4L, 10L, 6L, 9L,
22L, 15L, 2L, 20L, 8L, 21L, 12L, 13L, 3L, 16L, 24L, 5L, 7L,
14L, 4L, 10L, 6L, 22L, 15L, 2L, 8L, 17L, 21L, 1L, 13L, 3L,
16L, 5L, 7L, 14L, 10L, 6L, 9L, 22L, 15L, 2L, 20L, 8L, 17L,
21L, 1L, 3L, 18L, 16L, 5L, 7L, 14L, 4L, 10L, 6L, 25L, 9L,
22L, 15L, 20L, 8L, 17L, 21L, 11L, 1L, 12L, 13L, 18L, 16L,
24L, 5L, 7L, 14L, 4L, 10L, 6L, 25L, 9L, 22L, 15L, 20L, 8L,
17L, 1L, 3L, 10L, 6L, 22L, 20L, 8L, 21L, 11L, 1L, 13L, 3L,
18L, 24L, 7L, 4L, 10L, 6L, 25L, 9L, 22L, 15L, 2L, 20L, 8L,
17L, 21L, 11L, 1L, 12L, 13L, 18L, 16L, 5L, 7L, 14L, 10L,
6L, 25L, 9L, 22L, 15L, 20L, 8L, 17L, 1L, 3L, 18L, 16L, 7L,
14L, 4L, 10L, 6L, 9L, 22L, 15L, 2L, 20L, 8L, 17L, 21L, 11L,
1L, 12L, 13L, 3L, 18L, 16L, 24L, 14L, 4L, 10L, 6L, 25L, 9L,
22L, 15L, 2L, 20L, 8L, 11L, 1L, 3L, 18L, 16L, 7L, 10L, 6L,
9L, 22L, 15L, 2L, 20L, 8L, 17L, 21L, 11L, 1L, 12L, 13L, 3L,
18L, 16L, 24L, 5L, 7L, 14L, 10L, 6L, 25L, 9L, 22L, 15L, 2L,
20L, 8L, 17L, 21L, 11L, 1L, 12L, 13L, 3L, 18L, 16L, 24L,
5L, 7L, 14L, 4L, 6L, 25L, 9L, 22L, 15L, 2L, 20L, 8L, 17L,
21L, 11L, 1L, 12L, 13L, 3L, 18L, 24L, 5L, 7L, 14L, 4L, 10L,
25L, 9L, 22L, 15L, 2L, 20L, 8L, 21L, 1L, 13L, 3L, 18L, 5L,
10L, 6L, 22L, 2L, 20L, 8L, 21L, 1L, 13L, 3L, 18L, 16L, 24L,
4L, 10L, 6L, 22L, 15L, 2L, 20L, 8L, 11L, 1L, 12L, 13L, 3L,
16L, 24L, 5L, 7L, 14L, 4L, 10L, 6L, 25L, 9L, 15L, 2L, 20L,
8L, 17L, 21L, 1L, 12L, 3L, 18L, 16L, 24L, 7L, 10L, 6L, 25L,
9L, 22L, 2L, 20L, 8L, 17L, 21L, 11L, 1L, 12L, 13L, 3L, 18L,
16L, 24L, 5L, 7L, 14L, 4L, 6L, 25L, 9L, 22L, 15L, 20L, 8L,
17L, 21L, 11L, 1L, 3L, 16L, 24L, 7L, 10L, 6L, 22L, 2L, 8L,
21L, 11L, 1L, 12L, 13L, 3L, 18L, 16L, 24L, 14L, 4L, 10L,
6L, 25L, 9L, 22L, 2L, 20L, 7L, 6L, 25L, 22L, 8L), weight = c(3L,
1L, 3L, 2L, 1L, 1L, 5L, 1L, 8L, 2L, 1L, 2L, 3L, 2L, 5L, 1L,
4L, 1L, 4L, 4L, 4L, 1L, 5L, 13L, 3L, 7L, 3L, 2L, 3L, 8L,
1L, 1L, 1L, 15L, 10L, 7L, 2L, 4L, 2L, 5L, 19L, 23L, 6L, 2L,
11L, 7L, 1L, 1L, 2L, 3L, 3L, 5L, 4L, 5L, 4L, 4L, 21L, 2L,
9L, 8L, 1L, 1L, 12L, 1L, 2L, 1L, 3L, 1L, 6L, 6L, 5L, 6L,
1L, 6L, 22L, 2L, 2L, 9L, 8L, 3L, 13L, 1L, 5L, 6L, 4L, 10L,
13L, 3L, 41L, 46L, 11L, 39L, 9L, 55L, 2L, 108L, 2L, 8L, 31L,
30L, 13L, 39L, 2L, 2L, 1L, 3L, 4L, 8L, 5L, 1L, 8L, 1L, 6L,
1L, 8L, 2L, 3L, 23L, 2L, 12L, 96L, 1L, 3L, 21L, 1L, 6L, 12L,
38L, 4L, 5L, 4L, 4L, 8L, 8L, 3L, 29L, 3L, 11L, 3L, 3L, 63L,
2L, 5L, 18L, 19L, 4L, 25L, 1L, 2L, 3L, 1L, 7L, 6L, 7L, 1L,
3L, 17L, 1L, 3L, 6L, 1L, 4L, 11L, 1L, 5L, 1L, 5L, 1L, 1L,
15L, 4L, 7L, 3L, 1L, 4L, 12L, 8L, 1L, 9L, 32L, 3L, 7L, 5L,
35L, 1L, 1L, 3L, 1L, 6L, 4L, 4L, 12L, 2L, 5L, 4L, 2L, 2L,
9L, 1L, 2L, 3L, 4L, 9L, 13L, 2L, 1L, 25L, 25L, 10L, 14L,
10L, 4L, 59L, 4L, 5L, 21L, 19L, 1L, 8L, 27L, 3L, 5L, 8L,
8L, 11L, 12L, 111L, 5L, 50L, 45L, 15L, 32L, 10L, 49L, 109L,
1L, 8L, 28L, 39L, 53L, 13L, 48L, 5L, 13L, 2L, 20L, 3L, 3L,
27L, 10L, 8L, 1L, 58L, 1L, 7L, 32L, 13L, 21L, 110L, 1L, 17L,
27L, 124L, 1L, 1L, 1L, 2L, 3L, 1L, 1L, 2L, 7L, 1L, 1L, 1L,
2L, 2L, 1L, 5L, 2L, 2L, 2L, 1L, 3L, 3L, 14L, 2L, 2L, 4L,
1L, 3L, 14L, 5L, 8L, 44L, 16L, 14L, 4L, 12L, 4L, 19L, 41L,
47L, 2L, 1L, 11L, 24L, 2L, 18L, 1L, 7L, 5L, 1L, 7L, 3L, 27L,
3L, 15L, 7L, 54L, 1L, 4L, 17L, 5L, 6L, 27L, 1L, 1L, 2L, 3L,
4L, 10L, 56L, 3L, 25L, 25L, 7L, 16L, 5L, 29L, 59L, 3L, 3L,
20L, 17L, 5L, 31L, 3L, 6L, 1L, 4L, 7L, 1L, 3L, 1L, 6L, 5L,
13L, 1L, 2L, 9L, 1L, 15L, 2L, 1L, 16L, 4L, 4L, 3L, 1L, 6L,
17L, 10L, 1L, 13L, 63L, 11L, 12L, 1L, 5L, 1L, 2L, 3L)), row.names = c(NA,
-383L), class = c("tbl_df", "tbl", "data.frame"))
# Undirected graph; each node stores its degree centrality in `neighbors`
routes_tidy <- tbl_graph(nodes = nodes, edges = edges, directed = FALSE) %>%
  mutate(neighbors = centrality_degree())

# Nodes "z", "s" and "w" are dropped from the point and text layers: they have
# no connections and would zoom the figure way out
ggraph(routes_tidy, layout = "graphopt") +
  geom_node_point(
    aes(size = neighbors, filter = !(label %in% c("z", "s", "w")))
  ) +
  geom_edge_link(aes(width = weight, alpha = weight)) +
  scale_edge_width(range = c(0.2, 2)) +
  geom_node_text(
    aes(
      label = label,
      fontface = "bold",
      size = neighbors,
      filter = !(label %in% c("z", "s", "w"))
    ),
    repel = TRUE
  ) +
  labs(edge_width = "N") +
  theme_graph()
I'm new to the whole tidygraph thing, stumbled over this question, got confused, and figured it'd be a nice way to get to know stuff. So, I don't know if it's a bug or a feature, but the behaviour is triggered because you have doubled edges:
# Given your edges: list every edge that joins nodes 1 and 2, in either direction
edges %>%
  filter(pmin(from, to) == 1, pmax(from, to) == 2)
# A tibble: 2 x 3
from to weight
<int> <int> <int>
1 1 2 11
2 2 1 3
And those count as 2 connections in the calculation of the degree centrality. One way to remove those double edges is to convert the network to a simple network:
# Turn the network into a simple graph (duplicate edges merged), then
# recompute the degree centrality on the nodes.
# convert() is the shorthand for morph() + crystallise() + taking the first graph.
routes_simple <- routes_tidy %>%
  convert(to_simple) %>%
  activate(nodes) %>%
  mutate(neighbors = centrality_degree())
Now the maximum degree is 22 (and the highest possible, presumably, 25).

How do I sum a column based on another column?

Assuming that the dataframe is stored as fruit, and is in the following format:
State Fruit Category Fruit Type Gross Value
ACT CitrusFruit Mandarins $4,500,000
ACT CitrusFruit Oranges
NSW PomeFruit Apple $139,130,203.50
NSW Grapes Wine Production $50,000,000
NSW OrchardStoneFruit Avocados $10,031,123
QLD CitrusFruit Oranges
How would I sum the gross value, based on the State - while excluding blank values. But at the same time, the gross value of each state should be summed, rather than displayed separately for CitrusFruit, PomeFruit, etc.
I have tried to use the
library(plyr)
counts
method to no avail.
Any help would be greatly appreciated.
EDIT:
I have tried to use the following method:
library(dplyr)
fruit %>%
group_by(State) %>%
summarise(Gross = sum(Gross))
However, I am getting an error that says:
Evaluation Error: 'sum' not meaningful for factors.
EDIT:
Output from dput(fruit)
structure(list(State = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 3L,
3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L, 8L, 8L,
8L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L, 7L, 7L, 8L, 8L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 8L,
2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L,
6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 2L, 2L, 2L, 3L,
3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L, 8L, 8L,
8L), .Label = c("ACT", "NSW", "NT", "QLD", "SA", "TAS", "VIC",
"WA"), class = "factor"), Fruit.Category = structure(c(6L, 6L,
6L, 8L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L,
4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L,
9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L,
9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L,
9L, 9L, 9L, 9L, 9L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L,
10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L,
10L, 10L, 10L, 10L, 10L, 10L, 11L, 11L, 11L, 11L, 11L, 11L, 11L,
11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L,
11L), .Label = c(" Grapes ", " OrchardStoneFruit ", " OtherFruit ",
" PomeFruit ", " CitrusFruit ", " CitrusFruit ", " Grapes ",
" Grapes ", " OrchardStoneFruit ", " OtherFruit ", " PomeFruit "
), class = "factor"), Fruit.Type = structure(c(5L, 8L, 13L, 18L,
31L, 2L, 4L, 6L, 7L, 9L, 14L, 17L, 3L, 11L, 12L, 15L, 1L, 10L,
16L, 13L, 23L, 26L, 13L, 23L, 26L, 13L, 23L, 26L, 13L, 23L, 26L,
13L, 23L, 26L, 13L, 23L, 26L, 13L, 23L, 26L, 18L, 31L, 18L, 31L,
18L, 31L, 18L, 31L, 18L, 31L, 18L, 31L, 18L, 31L, 14L, 17L, 20L,
22L, 24L, 25L, 27L, 14L, 17L, 20L, 22L, 24L, 25L, 27L, 14L, 17L,
20L, 22L, 24L, 25L, 27L, 14L, 17L, 20L, 22L, 24L, 25L, 27L, 14L,
17L, 20L, 22L, 24L, 25L, 27L, 14L, 17L, 20L, 22L, 24L, 25L, 27L,
14L, 17L, 20L, 22L, 24L, 25L, 27L, 15L, 21L, 29L, 30L, 15L, 21L,
29L, 30L, 15L, 21L, 29L, 30L, 15L, 21L, 29L, 30L, 15L, 21L, 29L,
30L, 15L, 21L, 29L, 30L, 15L, 21L, 29L, 30L, 16L, 19L, 28L, 16L,
19L, 28L, 16L, 19L, 28L, 16L, 19L, 28L, 16L, 19L, 28L, 16L, 19L,
28L, 16L, 19L, 28L), .Label = c(" Apples ", " Avocados ",
" Bananas ", " Cherries ", " Mandarins ", " Mangoes ",
" Nectarines ", " Oranges ", " Peaches ", " Pears ",
" Pineapples ", " Strawberries ", " AllOtherCitrusFruit ",
" AllOtherOrchardFruit ", " AllOtherOtherFruit ", " AllOtherPomeFruit ",
" AllOtherStoneFruit ", " AllOtherUses ", " Apples ", " Avocados ",
" Bananas ", " Cherries ", " Mandarins ", " Mangoes ", " Nectarines ",
" Oranges ", " Peaches ", " Pears ", " Pineapples ", " Strawberries ",
" WineProduction "), class = "factor"), Gross.Value = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 60L, 97L, 23L, 104L, 1L, 1L, 56L, 98L, 36L, 101L, 68L,
11L, 1L, 1L, 1L, 91L, 96L, 57L, 99L, 92L, 21L, 71L, 29L, 48L,
1L, 76L, 51L, 46L, 58L, 1L, 34L, 37L, 14L, 22L, 70L, 18L, 59L,
28L, 32L, 41L, 83L, 61L, 69L, 30L, 1L, 1L, 26L, 1L, 1L, 25L,
35L, 19L, 2L, 80L, 9L, 8L, 7L, 102L, 47L, 31L, 1L, 85L, 75L,
1L, 88L, 93L, 52L, 1L, 66L, 50L, 100L, 43L, 89L, 95L, 2L, 82L,
65L, 5L, 24L, 94L, 33L, 64L, 10L, 90L, 78L, 84L, 62L, 3L, 86L,
20L, 73L, 1L, 38L, 67L, 72L, 15L, 63L, 1L, 1L, 39L, 17L, 1L,
1L, 16L, 40L, 1L, 1L, 103L, 79L, 49L, 1L, 44L, 6L, 105L, 53L,
1L, 1L, 1L, 1L, 81L, 54L, 27L, 87L, 13L, 1L, 55L, 106L, 4L, 42L,
12L, 45L, 77L, 74L), .Label = c("", "$0.00", "$1,025,861.63",
"$1,107,476.82", "$1,135,055.74", "$1,148,385.97", "$1,514,089.93",
"$1,539,762.85", "$1,565,234.83", "$10,469,580.98", "$100,622,922.20",
"$106,039,956.40", "$11,648,561.35", "$113,930,475.80", "$114,195,162.80",
"$12,169,338.44", "$12,492,792.64", "$12,843,528.01", "$120,877,197.60",
"$13,245.08", "$13,331,668.11", "$13,981,075.51", "$130,258,416.50",
"$14,203,578.43", "$14,697,408.09", "$15,085,825.24", "$15,196.71",
"$15,246,349.76", "$154,858,589.30", "$168,325.78", "$17,661,100.37",
"$18,278,371.16", "$188,414.59", "$19,896,312.15", "$2,370,402.03",
"$2,557,589.86", "$209,648,663.50", "$21,426,350.11", "$22,482,034.46",
"$23,929,331.35", "$238,668.61", "$249,675,376.10", "$26,669,599.23",
"$27,540,236.71", "$270,903.84", "$3,485,520.14", "$3,520,605.89",
"$3,659,706.68", "$3,829,198.67", "$301,644.66", "$301,976.25",
"$31,133,715.88", "$313,144.86", "$334,363.30", "$35,212,772.81",
"$37,927,507.70", "$38,989,343.33", "$385,858,491.60", "$4,447,813.26",
"$4,549,208.46", "$4,569,373.00", "$4,702.20", "$4,712,329.56",
"$4,995,833.14", "$40,133,037.39", "$40,481.05", "$435,712,531.70",
"$44,434,103.55", "$443,017.10", "$45,665,029.35", "$45,888,545.67",
"$46,638,011.92", "$47,589.51", "$5,793,841.42", "$5,854,982.37",
"$51,534,636.09", "$53,367,548.56", "$53,377,925.45", "$555,799.71",
"$57,522,144.94", "$57,930,562.37", "$58,316,912.75", "$6,170,170.78",
"$6,791,088.95", "$6,824,520.08", "$623,030.52", "$63,493,163.21",
"$664,237.23", "$7,066,407.60", "$7,168,380.92", "$7,364,245.36",
"$7,426,224.28", "$7,894.54", "$70,218,810.35", "$76,591,000.57",
"$8,596,626.45", "$8,713,417.54", "$85,876,834.41", "$873,748.40",
"$9,262,889.69", "$9,731,658.36", "$9,991,440.81", "$91,781,453.44",
"$92,299.72", "$95,677,012.68", "$983,780.33"), class = "factor")), class = "data.frame", row.names = c(NA,
-152L))
A couple of problems here:
You don't have Gross Value in your data, you have Gross.Value.
That column is factor, which is a more storage-efficient form of strings. Neither factor nor character can be summed. R knows nothing about accounting so the "$" means nothing to it in that context.
Try this:
library(dplyr)
fruit %>%
  # Drop every character that is not a digit or a decimal point ("$" and ","),
  # then parse the remaining text as a number; blank entries become NA
  mutate(Gross.Value = as.numeric(gsub("[^0-9.]", "", as.character(Gross.Value)))) %>%
  group_by(State) %>%
  # na.rm = TRUE leaves the NA values (blank entries) out of each state's total
  summarize(Gross.Value = sum(Gross.Value, na.rm = TRUE))
# # A tibble: 8 x 2
# State Gross.Value
# <fct> <dbl>
# 1 ACT 0
# 2 NSW 564400574.
# 3 NT 20133040.
# 4 QLD 1053007677.
# 5 SA 691850721.
# 6 TAS 112902970.
# 7 VIC 1069102796.
# 8 WA 281014929.
The only changes from my comment were (1) using the correct column name, and (2) adding na.rm=TRUE, since you have many blanks. This means you need to be careful how you use this data, as you now have biases and inaccuracies in your summary.
You should convert the factor to numeric and then sum. Here is the solution I came up with:
library(tidyverse)
## This line converts the factor into a numeric variable: it makes it a character, removes the commas and the dollar sign, and finally converts to a number.
## The column is called Gross.Value in the data frame (see the dput output), not `Gross Value`.
fruit$Gross.Value <- as.numeric(str_replace_all(as.character(fruit$Gross.Value), "\\$|\\,", ""))
## Then you can run your sum function; na.rm = TRUE skips the blank entries, which are NA after the conversion
fruit %>%
  group_by(State) %>%
  summarise(Gross = sum(Gross.Value, na.rm = TRUE))

"minimum count is not zero" error for zero inflated model

Here is the data of my regression :
y is the number of passengers on the platform of the train station in each 2-minute period, while A1 to A17 are the numbers of passengers in 17 study areas on the concourse. The time lag has already been accounted for by shifting the Xs.
Since there is sometimes no one waiting in the study areas on the concourse, excess zeros occur. I am planning to use a zero-inflated model. I have tried the code shown below, but it says "minimum count is not zero". What does that mean and how can I solve it? I have fitted a Poisson model and it works, but the zero-inflated model doesn't.
> setwd('C:/Users/zuzymelody/Desktop')
> try<-read.csv('0inflated_2mins27peak.csv',header=TRUE)
> attach(try)
> names(try)
[1] "y" "A1" "A2" "A3" "A4" "A5" "A6" "A7" "A8" "A9" "A10" "A11"
[13] "A12" "A13" "A14" "A15" "A16" "A17"
> model1<-glm(y~A1+A2+A3+A4+A5+A6+A7+A8+A9+A10+A11+A12+A13+A14+A15+A16+A17,family="poisson")
> summary(model1)
Call:
glm(formula = y ~ A1 + A2 + A3 + A4 + A5 + A6 + A7 + A8 + A9 +
A10 + A11 + A12 + A13 + A14 + A15 + A16 + A17, family = "poisson")
Deviance Residuals:
Min 1Q Median 3Q Max
-7.8598 -3.4571 -0.3663 2.1867 12.5183
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 6.102009 0.164497 37.095 < 2e-16 ***
A1 -0.017555 0.003665 -4.790 1.66e-06 ***
A2 -0.026101 0.017569 -1.486 0.137371
A3 -0.179988 0.014976 -12.018 < 2e-16 ***
A4 -0.032584 0.007735 -4.213 2.52e-05 ***
A5 -0.019908 0.007014 -2.839 0.004532 **
A6 -0.044144 0.010266 -4.300 1.71e-05 ***
A7 0.049829 0.006518 7.645 2.09e-14 ***
A8 -0.080712 0.009819 -8.220 < 2e-16 ***
A9 0.007390 0.007105 1.040 0.298273
A10 0.041116 0.004085 10.065 < 2e-16 ***
A11 -0.041420 0.008418 -4.921 8.62e-07 ***
A12 -0.008241 0.007304 -1.128 0.259171
A13 -0.033161 0.008966 -3.699 0.000217 ***
A14 0.020818 0.005250 3.965 7.34e-05 ***
A15 -0.002995 0.006125 -0.489 0.624887
A16 -0.061997 0.017122 -3.621 0.000294 ***
A17 -0.025025 0.008391 -2.982 0.002860 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for poisson family taken to be 1)
Null deviance: 1137.71 on 29 degrees of freedom
Residual deviance: 599.74 on 12 degrees of freedom
AIC: 840.1
Number of Fisher Scoring iterations: 5
>with(model1, cbind(res.deviance = deviance, df = df.residual,
p = pchisq(deviance, df.residual, lower.tail=FALSE)))
res.deviance df p
[1,] 599.7445 12 1.202013e-120
> require( pscl )
> Zip<-zeroinfl(model1,link="logit",dist="poisson")
**Error in zeroinfl(model1, link = "logit", dist = "poisson") :
invalid dependent variable, minimum count is not zero**
dput(try)
structure(list(y = c(156L, 74L, 221L, 207L, 168L, 36L, 128L,
208L, 99L, 117L, 228L, 211L, 341L, 173L, 196L, 310L, 112L, 203L,
104L, 183L, 325L, 143L, 218L, 166L, 218L, 127L, 136L, 38L, 102L,
34L), A1 = c(24L, 24L, 24L, 19L, 20L, 9L, 14L, 23L, 15L, 23L,
14L, 16L, 15L, 25L, 25L, 19L, 24L, 26L, 25L, 26L, 22L, 14L, 13L,
15L, 9L, 12L, 9L, 12L, 15L, 18L), A2 = c(2L, 4L, 0L, 3L, 0L,
1L, 1L, 2L, 1L, 2L, 0L, 2L, 2L, 0L, 1L, 1L, 3L, 3L, 2L, 2L, 3L,
2L, 3L, 5L, 4L, 3L, 4L, 1L, 2L, 1L), A3 = c(2L, 2L, 0L, 1L, 1L,
9L, 3L, 0L, 0L, 0L, 1L, 1L, 3L, 1L, 0L, 0L, 1L, 2L, 3L, 1L, 0L,
1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 2L), A4 = c(15L, 11L, 6L, 7L,
10L, 10L, 5L, 4L, 5L, 7L, 9L, 9L, 4L, 6L, 6L, 13L, 9L, 13L, 9L,
10L, 6L, 6L, 7L, 6L, 10L, 9L, 10L, 7L, 9L, 2L), A5 = c(13L, 10L,
6L, 6L, 11L, 19L, 13L, 14L, 7L, 7L, 6L, 8L, 10L, 5L, 7L, 9L,
9L, 11L, 3L, 13L, 8L, 8L, 8L, 6L, 8L, 9L, 9L, 14L, 9L, 6L), A6 = c(9L,
10L, 9L, 9L, 4L, 7L, 7L, 12L, 11L, 11L, 12L, 8L, 6L, 7L, 8L,
5L, 9L, 6L, 5L, 6L, 9L, 11L, 6L, 6L, 8L, 9L, 4L, 11L, 10L, 7L
), A7 = c(21L, 16L, 13L, 13L, 4L, 9L, 12L, 13L, 12L, 12L, 12L,
6L, 7L, 6L, 6L, 4L, 5L, 9L, 8L, 7L, 9L, 12L, 10L, 7L, 8L, 12L,
14L, 2L, 6L, 6L), A8 = c(1L, 5L, 10L, 10L, 1L, 9L, 6L, 6L, 7L,
7L, 5L, 6L, 3L, 2L, 4L, 0L, 4L, 2L, 5L, 5L, 5L, 3L, 2L, 4L, 3L,
8L, 10L, 8L, 2L, 5L), A9 = c(8L, 9L, 10L, 10L, 12L, 19L, 10L,
6L, 6L, 6L, 0L, 6L, 8L, 10L, 2L, 3L, 6L, 2L, 2L, 6L, 5L, 2L,
4L, 1L, 3L, 7L, 7L, 4L, 4L, 2L), A10 = c(7L, 10L, 12L, 20L, 24L,
21L, 24L, 18L, 20L, 18L, 26L, 21L, 12L, 11L, 18L, 18L, 19L, 16L,
25L, 21L, 22L, 14L, 12L, 17L, 21L, 14L, 14L, 10L, 8L, 7L), A11 = c(0L,
2L, 1L, 4L, 2L, 1L, 1L, 1L, 13L, 10L, 12L, 5L, 2L, 0L, 5L, 1L,
4L, 4L, 3L, 3L, 1L, 1L, 3L, 3L, 5L, 5L, 2L, 10L, 3L, 4L), A12 = c(12L,
14L, 14L, 17L, 10L, 14L, 13L, 19L, 7L, 5L, 6L, 6L, 8L, 7L, 13L,
11L, 10L, 8L, 6L, 6L, 9L, 14L, 9L, 10L, 8L, 9L, 8L, 9L, 5L, 7L
), A13 = c(6L, 2L, 1L, 5L, 9L, 6L, 7L, 4L, 12L, 5L, 9L, 10L,
3L, 7L, 4L, 2L, 2L, 6L, 4L, 6L, 7L, 4L, 9L, 6L, 11L, 4L, 5L,
4L, 6L, 6L), A14 = c(14L, 13L, 16L, 11L, 8L, 6L, 9L, 13L, 14L,
14L, 9L, 8L, 12L, 11L, 13L, 11L, 18L, 15L, 20L, 21L, 17L, 18L,
18L, 18L, 25L, 20L, 12L, 9L, 8L, 8L), A15 = c(7L, 6L, 7L, 5L,
4L, 9L, 12L, 12L, 11L, 12L, 9L, 8L, 7L, 8L, 10L, 16L, 8L, 8L,
13L, 10L, 5L, 5L, 8L, 10L, 10L, 4L, 6L, 6L, 6L, 7L), A16 = c(2L,
1L, 3L, 3L, 1L, 2L, 3L, 2L, 3L, 2L, 2L, 1L, 2L, 2L, 3L, 3L, 2L,
1L, 3L, 4L, 2L, 5L, 4L, 8L, 5L, 2L, 1L, 2L, 2L, 2L), A17 = c(10L,
13L, 13L, 2L, 5L, 1L, 3L, 3L, 5L, 4L, 4L, 6L, 4L, 6L, 3L, 2L,
2L, 2L, 7L, 8L, 3L, 7L, 5L, 6L, 7L, 6L, 6L, 3L, 4L, 3L)), .Names = c("y",
"A1", "A2", "A3", "A4", "A5", "A6", "A7", "A8", "A9", "A10",
"A11", "A12", "A13", "A14", "A15", "A16", "A17"), class = "data.frame", row.names = c(NA,
-30L))
Above is the reproducible example. Sorry, it's my first time posting here, so I don't know the rules well.
Your data frame does not contain a zero value in your dependent variable $y$:
min(mydata$y)
[1] 34
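You'll need to have at least one $y = 0$: zeroinfl() models excess zeros in the dependent variable, so it rejects a response whose minimum count is not zero. Zeros in the predictors A1 to A17 do not make the response zero-inflated.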

R: Error when using knncat to do classification on categorical variable

I have a data set with 4 columns, 2 of which are numeric, 1 is categorical and 1 is the label. The label has 13 levels (A to M). I tried to use knncat package in R to do classification, but every time I ran the code, I got the following error message:
Error in `[<-.data.frame`(`*tmp*`, factor.vars, value = c("M", "J", "K", :
replacement has 45500 rows, data has 1
The following is the code I used:
data <- read.csv('mosaic_data2.csv', header = TRUE)
num <- nrow(data)
library(sampling)
set.seed(1234)
# Random 70/30 split of the row indices; seq_len() also works when num is 0,
# where seq(1, num, 1) would raise an error
train_index <- sample(seq_len(num), floor(num * 0.7), replace = FALSE)
test_index <- setdiff(seq_len(num), train_index)
train_data <- data[train_index, ]
test_data <- data[test_index, ]
library(knncat)
# NOTE(review): classcol = 2 points at the second column; in the dput output
# column 2 is `longitude`, and the 13-level factor `mosaic_group` is column 3.
# Confirm which column is meant to be the class label.
model <- knncat(train_data, classcol = 2)
Could anyone please take a look at the code and advise how I could eliminate this bug? Thank you very much!
The output of dput(head(data,100)) is as follows:
structure(list(latitude = c(52.7326028, 52.74287543, 52.82107841,
52.82025363, 52.81980596, 52.81721897, 52.81274172, 52.81274172,
52.8089586, 52.81424219, 52.8089586, 52.74007929, 52.77394023,
52.73659034, 52.73672518, 52.73764626, 52.73753744, 52.73659034,
52.73815233, 52.73679388, 52.73890319, 52.71697237, 52.63730282,
52.62720385, 52.63730282, 52.63543017, 52.63768035, 52.63510366,
52.6346578, 52.6346578, 52.6346578, 52.63447454, 52.63576418,
52.63447454, 52.6346578, 52.63447454, 52.69820719, 52.69603926,
52.68246919, 52.54600173, 52.54210198, 52.60628983, 52.61003275,
52.60278236, 52.60239604, 52.60348688, 52.60239604, 52.60382146,
52.60315644, 52.86047938, 52.86576353, 52.86954228, 52.81039471,
52.82094872, 52.82395073, 52.82444705, 52.88098384, 52.88469208,
52.88469208, 52.84979201, 52.84720159, 52.84831759, 52.82435938,
52.82319493, 52.82168337, 52.8230402, 52.8230402, 52.82513486,
52.82472379, 52.82756385, 52.82475438, 52.82434902, 52.82166611,
52.823712, 52.82401481, 52.82483489, 52.82103704, 52.82060763,
52.8208682, 52.82211317, 52.81868547, 52.8198332, 52.82023595,
52.81989134, 52.8196971, 52.82051066, 52.82463338, 52.82539131,
52.82580625, 52.82509199, 52.83759415, 52.83946254, 52.83946254,
52.83891871, 52.83821538, 52.84757879, 52.84663773, 52.8449371,
52.84592185, 52.84331619), longitude = c(-6.892397941, -6.915346343,
-6.922554014, -6.924997835, -6.926099967, -6.883340697, -6.897757597,
-6.897757597, -6.895500952, -6.883129556, -6.895500952, -6.703781864,
-6.680851783, -6.771845364, -6.773301282, -6.772958488, -6.77484647,
-6.771845364, -6.773422218, -6.772164896, -6.770622695, -6.784187251,
-6.901922588, -6.905109015, -6.901922588, -6.976679508, -6.973114498,
-6.974753462, -6.947990431, -6.947990431, -6.947990431, -6.976921427,
-6.958295227, -6.976921427, -6.947990431, -6.976921427, -6.902010609,
-6.915233457, -6.871160885, -6.832461149, -6.862126342, -6.943925285,
-6.93813643, -6.925128034, -6.932247524, -6.93461305, -6.932247524,
-6.934657053, -6.929283954, -6.845259603, -6.861188287, -6.866476268,
-6.940851164, -6.939203401, -6.930506188, -6.933317462, -6.929441954,
-6.922589037, -6.922589037, -6.926037258, -6.929423169, -6.917829279,
-6.938211918, -6.940658091, -6.940651748, -6.940107883, -6.940107883,
-6.938704642, -6.939084526, -6.933331264, -6.937496468, -6.937678962,
-6.940276221, -6.94018054, -6.939876475, -6.938983181, -6.934235666,
-6.93387209, -6.933134226, -6.934193569, -6.934383596, -6.933832641,
-6.937454656, -6.933818238, -6.93443811, -6.936913947, -6.920030341,
-6.920400963, -6.92215006, -6.910771124, -6.901500591, -6.899018998,
-6.899018998, -6.903007684, -6.90119821, -6.91063672, -6.909935672,
-6.90240965, -6.900066763, -6.901411136), mosaic_group = structure(c(10L,
10L, 8L, 8L, 8L, 7L, 7L, 7L, 7L, 7L, 7L, 10L, 10L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L,
12L, 12L, 12L, 12L, 12L, 12L, 10L, 10L, 10L, 13L, 13L, 13L, 13L,
9L, 6L, 6L, 6L, 6L, 6L, 10L, 8L, 8L, 9L, 9L, 9L, 9L, 7L, 7L,
7L, 9L, 9L, 9L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 8L, 8L, 8L, 8L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 8L,
6L, 6L, 6L, 6L, 6L, 8L, 8L, 10L, 10L, 10L), .Label = c("A", "B",
"C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M"), class = "factor"),
small_code = c(1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L,
4L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 8L, 8L, 8L, 9L,
9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 10L, 10L, 10L, 11L,
11L, 12L, 12L, 13L, 14L, 14L, 14L, 14L, 14L, 15L, 16L, 16L,
17L, 17L, 18L, 18L, 19L, 19L, 19L, 20L, 20L, 20L, 21L, 21L,
21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L,
22L, 22L, 22L, 22L, 23L, 23L, 23L, 23L, 23L, 23L, 24L, 24L,
24L, 25L, 26L, 26L, 26L, 26L, 26L, 27L, 27L, 28L, 28L, 28L
)), .Names = c("latitude", "longitude", "mosaic_group", "small_code"
), row.names = c(NA, 100L), class = "data.frame")
The function knncat::knncat accepts the argument classcol which is defined as:
Column with classification in it. Default: 1.
You have a data set with structure:
latitude longitude mosaic_group small_code
1 52.73260 -6.892398 J 1
2 52.74288 -6.915346 J 1
3 52.82108 -6.922554 H 2
4 52.82025 -6.924998 H 2
5 52.81981 -6.926100 H 2
6 52.81722 -6.883341 G 3
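Therefore your argument should be classcol = 3, because the label with levels A to M is mosaic_group, the third column (column 4, small_code, is the other categorical variable). It certainly shouldn't be classcol = 2, which points at the numeric longitude column.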

Formatting x-axis with histogram in R

I want the x-axis to run from 1 to 20 and the y-axis from 1 to 6.
My data:
structure(list(HEI.ID = structure(c(12L, 9L, 14L, 19L, 20L, 1L,
7L, 5L, 11L, 3L, 10L, 18L, 2L, 8L, 6L, 15L, 13L, 17L, 4L, 16L
), .Label = c("BF", "CC", "DC", "ER", "IM", "MC", "ME ",
"MM", "MO", "OC", "OM", "OP", "SB", "SD", "SH", "SL", "SN", "TH",
"UN", "WS"), class = "factor"), X2007 = c(18L, 14L, 15L, 20L,
12L, 6L, 17L, 2L, 4L, 11L, 16L, 1L, 9L, 8L, 13L, 4L, 10L, 6L,
3L, 19L), X2008 = c(20L, 9L, 16L, 18L, 8L, 17L, 15L, 6L, 3L,
14L, 19L, 1L, 2L, 12L, 5L, 13L, 11L, 7L, 4L, 10L), X2009 = c(20L,
13L, 17L, 8L, 4L, 9L, 19L, 12L, 2L, 11L, 16L, 1L, 2L, 7L, 6L,
18L, 5L, 15L, 9L, 14L), X2010 = c(20L, 13L, 16L, 13L, 7L, 15L,
19L, 8L, 3L, 9L, 18L, 1L, 5L, 11L, 12L, 6L, 10L, 4L, 2L, 17L),
X2011 = c(20L, 2L, 16L, 14L, 6L, 10L, 17L, 8L, 3L, 15L, 19L,
1L, 4L, 18L, 13L, 11L, 8L, 12L, 4L, 7L), X2012 = c(20L, 12L,
19L, 13L, 8L, 14L, 15L, 10L, 11L, 9L, 17L, 2L, 7L, 18L, 5L,
16L, 3L, 4L, 6L, 1L)), .Names = c("HEI.ID", "X2007", "X2008",
"X2009", "X2010", "X2011", "X2012"), row.names = c(NA, -20L), class = "data.frame")
I use the following commands to draw histograms:
# Lay the plots out on a 3 x 4 grid.
par(mfrow = c(3, 4))
for (i in seq_len(20)) {
  print(i)
  # Histogram of row i (first column dropped), labelled from STOF.
  hist(
    as.numeric(HEIrank11[i, -1]),
    nclass = 12,
    main = 'students/faculty',
    xlab = STOF[i, 1],
    cex.lab = 1,
    cex.axis = 1,
    cex.main = 1,
    cex.sub = 1
  )
}
But after running the above commands, I get different numbers on the x-axis and y-axis.
I don't understand what your plot should look like. It's not clear from your question and the data provided.
I've tried to plot it. Please comment if you think it's the way to go.
Considering dt is your data.frame
# Load both packages before reshaping and plotting.
library(reshape)
library(ggplot2)
# Reshape dt with melt(); the plot below maps its `variable` and `value`
# columns.
dt <- melt(dt)
# Bar chart of `value` per HEI.ID, with bars filled by `variable`.
ggplot(data = dt, mapping = aes(x = HEI.ID, y = value, fill = variable)) +
  geom_bar(stat = 'identity')
or
# Same bar chart, but split into one facet row per level of `variable`.
# Plots `dt`, the melted data frame created above; the original call used
# `dt1`, which is never defined.
ggplot(data = dt, mapping = aes(x = HEI.ID, y = value, fill = variable)) +
  geom_bar(stat = 'identity') +
  facet_grid(variable ~ .)
You could use the xlim and ylim parameters of the hist function to fix the axis ranges, and draw the axes yourself using axis:
# Lay the histograms out on a 3 x 4 grid, keeping the previous graphics
# settings so they can be restored once the plots are drawn.
old_par <- par(mfrow = c(3, 4))
# Only 12 panels fit on the grid, so plot the first 12 rows.
for (i in seq_len(12)) {
  print(i)
  # Use the same axis limits in every panel and suppress the default axes
  # (xaxt/yaxt = 'n') so they can be drawn by hand below.
  hist(as.numeric(HEIrank11[i, -1]), nclass = 12, main = 'students/faculty',
       xlim = c(0, 21), ylim = c(0, 6), xaxt = 'n', yaxt = 'n')
  axis(1, at = c(0, 10, 20))
  axis(2, at = 0:6)
}
# Restore the graphics parameters that were in effect before this snippet.
par(old_par)
Do you really want your y-axis to go from 1 to 6? This will cut off parts of the bars.
Also, you iterate over all 20 rows for a grid with 12 plots. The code above gives the following plot:

Resources