Related
I have a cleaned data set with 26 nodes. I am placing these 26 nodes in an undirected network graph using tidygraph, where I use the centrality_degree() function to calculate the centrality degree. However, when I graph the resulting network, my highest possible centrality degree is 40, which should not be possible. When I change the graph to directed, this is corrected.
I am somewhat confused: with other methods I have used in the past, where I calculated the degree centrality manually, I have never once come across this issue.
Is this regular behaviour, or am I doing something wrong?
Reproducible example:
library(tidygraph)
library(ggraph)
library(tidyverse)
nodes <- structure(list(id = 1:26, label = c("a", "b", "c", "d", "e",
"f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r",
"s", "t", "u", "v", "w", "x", "y", "z")), row.names = c(NA, -26L
), class = "data.frame")
edges <- structure(list(from = c(21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L,
21L, 21L, 21L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L,
11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 12L, 12L,
12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L,
12L, 12L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L,
13L, 13L, 13L, 13L, 13L, 13L, 13L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 18L,
18L, 18L, 18L, 18L, 18L, 18L, 16L, 16L, 16L, 16L, 16L, 16L, 16L,
16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 24L,
24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L,
24L, 24L, 24L, 24L, 24L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 14L, 14L, 14L, 14L,
14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L,
10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L,
25L, 25L, 25L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L,
9L, 9L, 9L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L,
22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 15L, 15L, 15L,
15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L,
15L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 20L, 20L, 20L, 20L, 20L, 20L, 20L,
20L, 20L, 20L, 20L, 20L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 17L, 17L, 17L, 17L, 17L),
to = c(1L, 12L, 3L, 16L, 24L, 4L, 10L, 6L, 22L, 2L, 8L, 1L,
12L, 13L, 3L, 18L, 16L, 24L, 5L, 7L, 14L, 4L, 10L, 6L, 9L,
22L, 15L, 2L, 20L, 8L, 21L, 12L, 13L, 3L, 16L, 24L, 5L, 7L,
14L, 4L, 10L, 6L, 22L, 15L, 2L, 8L, 17L, 21L, 1L, 13L, 3L,
16L, 5L, 7L, 14L, 10L, 6L, 9L, 22L, 15L, 2L, 20L, 8L, 17L,
21L, 1L, 3L, 18L, 16L, 5L, 7L, 14L, 4L, 10L, 6L, 25L, 9L,
22L, 15L, 20L, 8L, 17L, 21L, 11L, 1L, 12L, 13L, 18L, 16L,
24L, 5L, 7L, 14L, 4L, 10L, 6L, 25L, 9L, 22L, 15L, 20L, 8L,
17L, 1L, 3L, 10L, 6L, 22L, 20L, 8L, 21L, 11L, 1L, 13L, 3L,
18L, 24L, 7L, 4L, 10L, 6L, 25L, 9L, 22L, 15L, 2L, 20L, 8L,
17L, 21L, 11L, 1L, 12L, 13L, 18L, 16L, 5L, 7L, 14L, 10L,
6L, 25L, 9L, 22L, 15L, 20L, 8L, 17L, 1L, 3L, 18L, 16L, 7L,
14L, 4L, 10L, 6L, 9L, 22L, 15L, 2L, 20L, 8L, 17L, 21L, 11L,
1L, 12L, 13L, 3L, 18L, 16L, 24L, 14L, 4L, 10L, 6L, 25L, 9L,
22L, 15L, 2L, 20L, 8L, 11L, 1L, 3L, 18L, 16L, 7L, 10L, 6L,
9L, 22L, 15L, 2L, 20L, 8L, 17L, 21L, 11L, 1L, 12L, 13L, 3L,
18L, 16L, 24L, 5L, 7L, 14L, 10L, 6L, 25L, 9L, 22L, 15L, 2L,
20L, 8L, 17L, 21L, 11L, 1L, 12L, 13L, 3L, 18L, 16L, 24L,
5L, 7L, 14L, 4L, 6L, 25L, 9L, 22L, 15L, 2L, 20L, 8L, 17L,
21L, 11L, 1L, 12L, 13L, 3L, 18L, 24L, 5L, 7L, 14L, 4L, 10L,
25L, 9L, 22L, 15L, 2L, 20L, 8L, 21L, 1L, 13L, 3L, 18L, 5L,
10L, 6L, 22L, 2L, 20L, 8L, 21L, 1L, 13L, 3L, 18L, 16L, 24L,
4L, 10L, 6L, 22L, 15L, 2L, 20L, 8L, 11L, 1L, 12L, 13L, 3L,
16L, 24L, 5L, 7L, 14L, 4L, 10L, 6L, 25L, 9L, 15L, 2L, 20L,
8L, 17L, 21L, 1L, 12L, 3L, 18L, 16L, 24L, 7L, 10L, 6L, 25L,
9L, 22L, 2L, 20L, 8L, 17L, 21L, 11L, 1L, 12L, 13L, 3L, 18L,
16L, 24L, 5L, 7L, 14L, 4L, 6L, 25L, 9L, 22L, 15L, 20L, 8L,
17L, 21L, 11L, 1L, 3L, 16L, 24L, 7L, 10L, 6L, 22L, 2L, 8L,
21L, 11L, 1L, 12L, 13L, 3L, 18L, 16L, 24L, 14L, 4L, 10L,
6L, 25L, 9L, 22L, 2L, 20L, 7L, 6L, 25L, 22L, 8L), weight = c(3L,
1L, 3L, 2L, 1L, 1L, 5L, 1L, 8L, 2L, 1L, 2L, 3L, 2L, 5L, 1L,
4L, 1L, 4L, 4L, 4L, 1L, 5L, 13L, 3L, 7L, 3L, 2L, 3L, 8L,
1L, 1L, 1L, 15L, 10L, 7L, 2L, 4L, 2L, 5L, 19L, 23L, 6L, 2L,
11L, 7L, 1L, 1L, 2L, 3L, 3L, 5L, 4L, 5L, 4L, 4L, 21L, 2L,
9L, 8L, 1L, 1L, 12L, 1L, 2L, 1L, 3L, 1L, 6L, 6L, 5L, 6L,
1L, 6L, 22L, 2L, 2L, 9L, 8L, 3L, 13L, 1L, 5L, 6L, 4L, 10L,
13L, 3L, 41L, 46L, 11L, 39L, 9L, 55L, 2L, 108L, 2L, 8L, 31L,
30L, 13L, 39L, 2L, 2L, 1L, 3L, 4L, 8L, 5L, 1L, 8L, 1L, 6L,
1L, 8L, 2L, 3L, 23L, 2L, 12L, 96L, 1L, 3L, 21L, 1L, 6L, 12L,
38L, 4L, 5L, 4L, 4L, 8L, 8L, 3L, 29L, 3L, 11L, 3L, 3L, 63L,
2L, 5L, 18L, 19L, 4L, 25L, 1L, 2L, 3L, 1L, 7L, 6L, 7L, 1L,
3L, 17L, 1L, 3L, 6L, 1L, 4L, 11L, 1L, 5L, 1L, 5L, 1L, 1L,
15L, 4L, 7L, 3L, 1L, 4L, 12L, 8L, 1L, 9L, 32L, 3L, 7L, 5L,
35L, 1L, 1L, 3L, 1L, 6L, 4L, 4L, 12L, 2L, 5L, 4L, 2L, 2L,
9L, 1L, 2L, 3L, 4L, 9L, 13L, 2L, 1L, 25L, 25L, 10L, 14L,
10L, 4L, 59L, 4L, 5L, 21L, 19L, 1L, 8L, 27L, 3L, 5L, 8L,
8L, 11L, 12L, 111L, 5L, 50L, 45L, 15L, 32L, 10L, 49L, 109L,
1L, 8L, 28L, 39L, 53L, 13L, 48L, 5L, 13L, 2L, 20L, 3L, 3L,
27L, 10L, 8L, 1L, 58L, 1L, 7L, 32L, 13L, 21L, 110L, 1L, 17L,
27L, 124L, 1L, 1L, 1L, 2L, 3L, 1L, 1L, 2L, 7L, 1L, 1L, 1L,
2L, 2L, 1L, 5L, 2L, 2L, 2L, 1L, 3L, 3L, 14L, 2L, 2L, 4L,
1L, 3L, 14L, 5L, 8L, 44L, 16L, 14L, 4L, 12L, 4L, 19L, 41L,
47L, 2L, 1L, 11L, 24L, 2L, 18L, 1L, 7L, 5L, 1L, 7L, 3L, 27L,
3L, 15L, 7L, 54L, 1L, 4L, 17L, 5L, 6L, 27L, 1L, 1L, 2L, 3L,
4L, 10L, 56L, 3L, 25L, 25L, 7L, 16L, 5L, 29L, 59L, 3L, 3L,
20L, 17L, 5L, 31L, 3L, 6L, 1L, 4L, 7L, 1L, 3L, 1L, 6L, 5L,
13L, 1L, 2L, 9L, 1L, 15L, 2L, 1L, 16L, 4L, 4L, 3L, 1L, 6L,
17L, 10L, 1L, 13L, 63L, 11L, 12L, 1L, 5L, 1L, 2L, 3L)), row.names = c(NA,
-383L), class = c("tbl_df", "tbl", "data.frame"))
# Build an undirected tbl_graph from the node and edge tables and store each
# node's degree centrality in the `neighbors` column.
routes_tidy <- tbl_graph(nodes = nodes, edges = edges, directed = FALSE) %>%
  mutate(neighbors = centrality_degree())

# Nodes "z", "s" and "w" are filtered out of the point and label layers: they
# have no connections and would zoom the figure way out.
ggraph(routes_tidy, layout = "graphopt") +
  geom_node_point(
    aes(size = neighbors, filter = !(label %in% c("z", "s", "w")))
  ) +
  geom_edge_link(aes(width = weight, alpha = weight)) +
  scale_edge_width(range = c(0.2, 2)) +
  geom_node_text(
    aes(
      label = label,
      fontface = "bold",
      size = neighbors,
      filter = !(label %in% c("z", "s", "w"))
    ),
    repel = TRUE
  ) +
  labs(edge_width = "N") +
  theme_graph()
I'm new to the whole tidygraph thing, stumbled over this question, got confused, and figured it'd be a nice way to get to know stuff. So, I don't know if it's a bug or a feature, but the behaviour is triggered because you have doubled edges:
# Given your edges: keep the rows that connect nodes 1 and 2, in either
# direction
filter(
  edges,
  (from == 1 & to == 2) | (from == 2 & to == 1)
)
# A tibble: 2 x 3
from to weight
<int> <int> <int>
1 1 2 11
2 2 1 3
And those count as 2 connections in the calculation of the degree centrality. One way to remove those double edges is to convert the network to a simple network:
# Collapse parallel edges (e.g. 1 -> 2 and 2 -> 1) into a single edge by
# converting to a simple graph, then recompute the degree on the node table.
# convert() performs the morph, crystallises it and extracts the first (only)
# resulting graph in one step.
routes_simple <- routes_tidy %>%
  convert(to_simple) %>%
  activate(nodes) %>%
  mutate(neighbors = centrality_degree())
Now the maximum degree is 22 (and the highest possible is, presumably, 25).
I need some help regarding transforming a geom_bar into a geom_area plot. This is my df:
dput(df)
df <- structure(list(new_day = c(-25L, 3L, 7L, -7L, 3L, 7L, -7L, 0L,
-25L, 3L, 7L, -7L, 0L, -25L, 3L, 7L, -7L, 0L, -25L, 3L, 7L, -7L,
0L, -25L, 3L, 7L, -7L, 0L, -25L, 3L, 7L, -7L, 0L, 3L, 7L, -7L,
0L, -25L, 3L, 7L, -7L, 0L, -25L, 3L, 7L, -7L, 0L, -25L, 3L, 7L,
-25L, 3L, 7L, -7L, 0L, 3L, -7L, 0L, -25L, 7L, 3L, 7L, -7L, 0L,
-25L, 3L, 7L, -7L, 0L, -25L, 3L, 7L, 3L, 7L, -7L, 0L, -25L, 3L,
7L, -7L, 0L, 7L, -25L, 3L, 7L, -7L, 0L, 3L, 7L, -25L, -25L, -25L,
-25L, -25L, -25L, -25L), order = structure(c(8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 11L, 11L, 11L, 11L, 11L, 13L, 13L, 13L, 13L,
13L, 10L, 10L, 10L, 10L, 10L, 7L, 7L, 7L, 7L, 7L, 2L, 2L, 2L,
2L, 2L, 7L, 7L, 7L, 7L, 9L, 9L, 9L, 9L, 9L, 1L, 1L, 1L, 1L, 1L,
9L, 9L, 9L, 2L, 2L, 2L, 2L, 2L, 4L, 4L, 4L, 13L, 13L, 14L, 14L,
14L, 14L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 13L, 13L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 2L, 2L, 2L, 2L, 2L, 6L, 6L, 1L, 7L, 5L, 2L,
12L, 2L, 2L), .Label = c("Alteromonadales", "Betaproteobacteriales",
"Caulobacterales", "Chitinophagales", "Flavobacteriales", "Parvibaculales",
"Pseudomonadales", "Rhizobiales", "Rhodobacterales", "Rhodospirillales",
"Sneathiellales", "Sphingobacteriales", "Sphingomonadales", "Thalassobaculales"
), class = "factor"), family = structure(c(13L, 13L, 13L, 13L,
12L, 12L, 12L, 12L, 15L, 15L, 15L, 15L, 15L, 16L, 16L, 16L, 16L,
16L, 7L, 7L, 7L, 7L, 7L, 11L, 11L, 11L, 11L, 11L, 1L, 1L, 1L,
1L, 1L, 11L, 11L, 11L, 11L, 14L, 14L, 14L, 14L, 14L, 4L, 4L,
4L, 4L, 4L, 14L, 14L, 14L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 16L,
16L, 17L, 17L, 17L, 17L, 8L, 8L, 8L, 8L, 8L, 5L, 5L, 5L, 16L,
16L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 8L, 8L, 8L, 8L,
8L, 10L, 10L, 6L, 11L, 3L, 1L, 9L, 1L, 1L), .Label = c("Burkholderiaceae",
"Chitinophagaceae", "Flavobacteriaceae", "Gallaecimonadaceae",
"Hyphomonadaceae", "Idiomarinaceae", "Magnetospiraceae", "Methylophilaceae",
"NS11-12_marine_group", "Parvibaculaceae", "Pseudomonadaceae",
"Rhizobiaceae", "Rhizobiales_unclassified", "Rhodobacteraceae",
"Sneathiellaceae", "Sphingomonadaceae", "Thalassobaculaceae"), class = "factor"),
genus = structure(c(16L, 16L, 16L, 16L, 7L, 7L, 7L, 7L, 3L,
3L, 3L, 3L, 3L, 19L, 19L, 19L, 19L, 19L, 24L, 24L, 24L, 24L,
24L, 14L, 14L, 14L, 14L, 14L, 17L, 17L, 17L, 17L, 17L, 14L,
14L, 14L, 14L, 15L, 15L, 15L, 15L, 15L, 5L, 5L, 5L, 5L, 5L,
10L, 10L, 10L, 2L, 2L, 2L, 2L, 2L, 22L, 22L, 22L, 20L, 20L,
23L, 23L, 23L, 23L, 11L, 11L, 11L, 11L, 11L, 8L, 8L, 8L,
21L, 21L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 11L, 11L,
11L, 11L, 11L, 13L, 13L, 9L, 14L, 4L, 6L, 12L, 1L, 18L), .Label = c("Burkholderiaceae_unclassified",
"Cupriavidus", "Ferrovibrio", "Flavobacteriaceae_unclassified",
"Gallaecimonas", "GKS98_freshwater_group", "Hoeflea", "Hyphomonas",
"Idiomarina", "Marivivens", "Methylotenera", "NS11-12_marine_group_ge",
"Parvibaculum", "Pseudomonas", "Pseudorhodobacter", "Rhizobiales_unclassified",
"Rhodoferax", "RS62_marine_group", "Sphingomonadaceae_unclassified",
"Sphingopyxis", "Sphingorhabdus", "Terrimonas", "Thalassobaculum",
"uncultured"), class = "factor"), Abundance = c(0.758296593899054,
0.728046713738242, 0.421798852637834, 0.185971692147469,
7.36584152568739, 11.0004160226707, 1.93134577450352, 19.7144376530921,
46.2350237547082, 25.8715062086956, 22.1549641486618, 34.4112477828867,
20.4937613394223, 3.73518219692229, 15.9295990367068, 13.8490383262387,
13.3481723220855, 20.3866145291388, 0.165618346100574, 8.86991024549668,
8.5330814375361, 6.86819004205197, 5.72129192186814, 1.04512973253723,
3.77880217461655, 6.47871112880127, 1.12084852451492, 0.903754246093232,
19.0854333497858, 15.7152146349298, 12.3768753373503, 15.8790763239117,
10.2875187327705, 2.82159106304821, 4.22393981370602, 8.82452898193968,
4.8507226701533, 6.19619716749583, 8.28477594908417, 8.05201189383953,
9.7404731686272, 9.84535225459449, 1.7940554465653, 2.62276259756813,
2.74008811315788, 0.543937440677315, 0.55325167765205, 0.910457573040239,
0.451385497886567, 0.655661306732001, 6.59400178917785, 1.92570846362683,
2.62192443054515, 2.10049053655497, 2.13139299576524, 0.20799245164738,
0.324291631088576, 0.369492771993701, 1.52162438803598, 0.151864202275619,
0.420953084533189, 0.391517677365401, 0.29116200940885, 0.232440441774702,
4.21428798609281, 0.859779996836882, 1.33107018783728, 1.013155122065,
0.447286602320585, 0.165001492967355, 0.285983094976304,
0.377758692391269, 0.21556919104275, 0.314057858254493, 0.354649793637887,
0.338799824269294, 0.218027624939685, 0.914324162324944,
1.22932824654674, 0.731649603629864, 0.566393265064962, 0.247942012186621,
1.73171328618728, 0.636597714441988, 0.505393049999761, 0.491318560043637,
0.990988961717433, 0.195417142399681, 0.210412739808352,
0.476107780140271, 0.936663899397428, 0.251540964619117,
0.963667386912928, 0.504905545701818, 0.296220086916766,
0.240809811677774)), class = "data.frame", row.names = c(52L,
68L, 72L, 93L, 165L, 169L, 190L, 194L, 246L, 262L, 266L, 287L,
291L, 343L, 359L, 363L, 384L, 388L, 440L, 456L, 460L, 481L, 485L,
634L, 650L, 654L, 675L, 679L, 731L, 747L, 751L, 772L, 776L, 844L,
848L, 869L, 873L, 925L, 941L, 945L, 966L, 970L, 1022L, 1038L,
1042L, 1063L, 1067L, 1216L, 1232L, 1236L, 1313L, 1329L, 1333L,
1354L, 1358L, 1426L, 1451L, 1455L, 1507L, 1527L, 1717L, 1721L,
1742L, 1746L, 2186L, 2202L, 2206L, 2227L, 2231L, 2380L, 2396L,
2400L, 3075L, 3079L, 3294L, 3298L, 3350L, 3366L, 3370L, 3391L,
3395L, 3467L, 4223L, 4239L, 4243L, 4264L, 4268L, 4433L, 4437L,
4708L, 4805L, 4902L, 5193L, 5969L, 7909L, 8006L))
and this is the structure:
> str(df)
'data.frame': 96 obs. of 5 variables:
$ new_day : int -25 3 7 -7 3 7 -7 0 -25 3 ...
$ order : Factor w/ 14 levels "Alteromonadales",..: 8 8 8 8 8 8 8 8 11 11 ...
$ family : Factor w/ 17 levels "Burkholderiaceae",..: 13 13 13 13 12 12 12 12 15 15 ...
$ genus : Factor w/ 24 levels "Burkholderiaceae_unclassified",..: 16 16 16 16 7 7 7 7 3 3 ...
$ Abundance: num 0.758 0.728 0.422 0.186 7.366 ...
my data is about relative abundances of species over time, I removed rare species so it doesn't add up to 100 % anymore,
but that is fine, it is about 98 % per date. However, I get these weird free polygons and triangles which I recognize from incorrect grouping etc., but the group parameter did not change anything here. I also tried several position and stat arguments, which did not help. Maybe it is about the order of factors or something?
What I'm looking for is a stacked plot of the abundances of cumulated orders without empty spaces in between etc. Create proportional geom_area plot directly in ggplot2
# Area plot with species combined at order level: abundance over new_day,
# one fill colour per order, plus a dashed vertical marker at day 0
ggplot(
  data = df,
  mapping = aes(x = new_day, y = Abundance, fill = order)
) +
  geom_area(stat = "identity") +
  geom_vline(
    mapping = aes(xintercept = 0),
    linetype = "dashed",
    size = 1.2
  )
I get fewer weird shapes when going to a more detailed hierarchical level (genus instead of order)
# Area plot at genus level: same layout as the order-level plot, but filled
# by genus and with the stacking position spelled out
ggplot(
  data = df,
  mapping = aes(x = new_day, y = Abundance, fill = genus)
) +
  geom_area(stat = "identity", position = "stack") +
  geom_vline(
    mapping = aes(xintercept = 0),
    linetype = "dashed",
    size = 1.2
  )
but these are still more blank areas than there should be by the sum of abundances for a given time
# Total abundance on day -25, summed over all rows recorded for that day
with(df, sum(Abundance[which(new_day == -25)]))
[1] 98.03997
Any suggestions on how to fix this?
The problem is that you sometimes have several abundance values for one new_day, even with more detailed hierarchical levels.
This is what creates discontinuities in the area plot. You need to have only one unique value for each combination of new_day and fill group. In the first version of my example, I just took the first abundance value after grouping by new_day and order, but that is probably not what you want to show. (You may want to take the sum or the mean, or attribute these values to other new_day points in between, whatever you need.)
The remaining little gaps are caused by the missing abundance values, since as you said, it does not add up to 100%. This is not a big deal, but you can probably fix it by replacing the missing values by 0.
EDIT : Now doing the sum of abundance values as you mentioned, and removing the small remaining gaps by replacing missing values by 0.
library(tidyverse)
# Collapse to a single abundance value per (new_day, order) pair by summing,
# so every fill group has exactly one y value at each x position.
data <- df %>%
  group_by(new_day, order) %>%
  summarise(abundance = sum(Abundance)) %>%
  ungroup() %>%
  # Add an explicit zero row for every (new_day, order) combination that has
  # no observation; without these rows the stacked areas show gaps.
  # complete() does this directly and keeps `order` as a factor.
  complete(new_day, order, fill = list(abundance = 0))

# Stacked area plot of the summed abundances, with a dashed marker at day 0
ggplot(data, aes(x = new_day, y = abundance, fill = order)) +
  geom_area(stat = "identity") +
  geom_vline(aes(xintercept = 0), linetype = "dashed", size = 1.2)
I have a long format dataframe of responses to a repeated question about puberty status vb_ asked approximately yearly at ages 9, 10, 11, 13, 14, 15, 16, and 17.
Each year participants were asked to rate their development from 1 to 5, with 1 being least developed and 5 being most developed.
I would like to use R's ifelse() to identify inconsistent responses i.e. those that report a stage at one year that is lower than any of the previous years.
Here is some fake example data for 20 people:
vb <- structure(list(id = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 10L,
10L, 10L, 10L, 10L, 10L, 10L, 10L, 11L, 11L, 11L, 11L, 11L, 11L,
11L, 11L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 13L, 13L, 13L,
13L, 13L, 13L, 13L, 13L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L,
15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 16L, 16L, 16L, 16L, 16L,
16L, 16L, 16L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 18L, 18L,
18L, 18L, 18L, 18L, 18L, 18L, 19L, 19L, 19L, 19L, 19L, 19L, 19L,
19L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L), age = c(9L, 10L,
11L, 13L, 14L, 15L, 16L, 17L, 9L, 10L, 11L, 13L, 14L, 15L, 16L,
17L, 9L, 10L, 11L, 13L, 14L, 15L, 16L, 17L, 9L, 10L, 11L, 13L,
14L, 15L, 16L, 17L, 9L, 10L, 11L, 13L, 14L, 15L, 16L, 17L, 9L,
10L, 11L, 13L, 14L, 15L, 16L, 17L, 9L, 10L, 11L, 13L, 14L, 15L,
16L, 17L, 9L, 10L, 11L, 13L, 14L, 15L, 16L, 17L, 9L, 10L, 11L,
13L, 14L, 15L, 16L, 17L, 9L, 10L, 11L, 13L, 14L, 15L, 16L, 17L,
9L, 10L, 11L, 13L, 14L, 15L, 16L, 17L, 9L, 10L, 11L, 13L, 14L,
15L, 16L, 17L, 9L, 10L, 11L, 13L, 14L, 15L, 16L, 17L, 9L, 10L,
11L, 13L, 14L, 15L, 16L, 17L, 9L, 10L, 11L, 13L, 14L, 15L, 16L,
17L, 9L, 10L, 11L, 13L, 14L, 15L, 16L, 17L, 9L, 10L, 11L, 13L,
14L, 15L, 16L, 17L, 9L, 10L, 11L, 13L, 14L, 15L, 16L, 17L, 9L,
10L, 11L, 13L, 14L, 15L, 16L, 17L, 9L, 10L, 11L, 13L, 14L, 15L,
16L, 17L), vb_ = c(1L, 1L, 1L, 3L, 4L, 4L, 4L, 5L, 2L, 2L, 3L,
4L, 5L, 5L, 5L, 5L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 2L, 1L, 3L,
4L, 4L, 5L, 5L, 5L, 2L, 2L, 1L, 3L, 4L, 3L, 4L, 4L, 1L, 1L, 1L,
3L, 4L, 4L, 5L, 5L, 1L, 1L, 2L, 4L, 4L, 4L, 5L, 5L, 2L, 2L, 2L,
4L, 5L, 4L, 4L, 5L, 2L, 2L, 1L, 4L, 5L, 5L, 5L, 5L, 1L, 2L, 3L,
4L, 5L, 5L, 4L, 5L, 1L, 1L, 1L, 4L, 4L, 5L, 5L, 5L, 1L, 1L, 1L,
1L, 4L, 4L, 4L, 4L, 1L, 1L, 3L, 4L, 4L, 4L, 5L, 5L, 1L, 1L, 1L,
4L, 4L, 5L, 5L, 5L, 2L, 2L, 2L, 2L, 4L, 4L, 5L, 5L, 2L, 3L, 3L,
4L, 4L, 5L, 5L, 5L, 1L, 1L, 2L, 2L, 4L, 5L, 5L, 5L, 1L, 1L, 1L,
3L, 3L, 4L, 5L, 5L, 1L, 1L, 1L, 2L, 4L, 4L, 4L, 4L, 1L, 1L, 1L,
2L, 4L, 4L, 4L, 4L)), class = "data.frame", row.names = c(NA,
-160L), .Names = c("id", "age", "vb_"))
If you insist on a ifelse approach you can do:
# Sort so that each participant's rows are adjacent and in age order.
vb <- vb[order(vb$id, vb$age), ]
# Shift id and score down by one row using base R only. A bare lag() shifts
# values only when dplyr is attached; stats::lag() leaves a plain vector's
# values in place, so every row would be compared with itself and nothing
# would ever be flagged.
row_count <- nrow(vb)
prev_id <- c(NA, vb$id)[seq_len(row_count)]
prev_vb <- c(NA, vb$vb_)[seq_len(row_count)]
# A row is inconsistent when the previous row belongs to the same participant
# and reports a higher stage. The first row has no predecessor (prev_id is NA)
# and is therefore never flagged.
same_person <- !is.na(prev_id) & vb$id == prev_id
vb$inconsistent <- ifelse(same_person & vb$vb_ < prev_vb, "inconsistent", "")
id age vb_ inconsistent
1 1 9 1
2 1 10 1
3 1 11 1
4 1 13 3
5 1 14 4
6 1 15 4
7 1 16 4
8 1 17 5
9 2 9 2
10 2 10 2
11 2 11 3
12 2 13 4
13 2 14 5
14 2 15 5
15 2 16 5
16 2 17 5
17 3 9 2
18 3 10 3
19 3 11 3
20 3 13 3
21 3 14 4
22 3 15 4
23 3 16 4
24 3 17 5
25 4 9 2
26 4 10 1 inconsistent
27 4 11 3
...
Or one approach with dplyr is:
library(dplyr)
# Order the rows by participant and age, then compute, within each
# participant, the change in stage relative to the previous visit and keep
# only the visits where the stage went down.
vb %>%
  arrange(id, age) %>%
  group_by(id) %>%
  mutate(vb_diff = vb_ - lag(vb_)) %>%
  filter(vb_diff < 0)
# A tibble: 6 x 4
# Groups: id [5]
id age vb_ vb_diff
<int> <int> <int> <int>
1 4 10 1 -1
2 5 11 1 -1
3 5 15 3 -1
4 8 15 4 -1
5 9 11 1 -1
6 10 16 4 -1
Here you go.
# Sort so each participant's answers are consecutive and in age order
row_order <- order(vb$id, vb$age)
vb <- vb[row_order, ]
# Row-to-row changes in score and in participant id
score_step <- diff(vb$vb_)
id_step <- diff(vb$id)
# TRUE where the score is lower than on the row directly above (never row 1)
vb$decreasingdevelopment <- c(FALSE, score_step < 0)
# TRUE where the row directly above belongs to the same participant
vb$sameperson <- c(TRUE, id_step == 0)
# Only a drop within one and the same participant counts as an inconsistency
vb$inconsistency <- vb$sameperson & vb$decreasingdevelopment
which(vb$inconsistency)
#[1] 26 35 38 62 67 79
Note that the use of ifelse() is possible but not necessary.
PS: for completeness of the answer, you should always use the following :
# A row is inconsistent when the same participant reported a strictly higher
# stage at any strictly younger age. The columns are indexed directly instead
# of using apply() over the data frame: apply() converts each row to a single
# common type, so one character column in vb would turn the numbers into
# strings and break the comparisons.
vb$inconsistency_robust <- vapply(
  seq_len(nrow(vb)),
  function(i) {
    any(
      vb$id == vb$id[i] & vb$age < vb$age[i] & vb$vb_ > vb$vb_[i],
      na.rm = TRUE
    )
  },
  logical(1)
)
# Compare the flagged row positions with those of the row-to-row check
all.equal(which(vb$inconsistency_robust), which(vb$inconsistency))
#> which(vb$inconsistency_robust)
#[1] 26 35 38 62 63 67 79
#> which(vb$inconsistency)
#[1] 26 35 38 62 67 79
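Note how the robust method spots all the occurrences of inconsistencies (including row 63, which is lower than an earlier value but not lower than the row directly before it), while my simpler diff()-based method above only compares each row with the one directly before it.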
I have a data set with 4 columns, 2 of which are numeric, 1 is categorical and 1 is the label. The label has 13 levels (A to M). I tried to use knncat package in R to do classification, but every time I ran the code, I got the following error message:
Error in `[<-.data.frame`(`*tmp*`, factor.vars, value = c("M", "J", "K", :
replacement has 45500 rows, data has 1
The following is the code I used:
# Read the data set; the first line of the CSV holds the column names.
data <- read.csv('mosaic_data2.csv', header = T)
# Number of rows in the data set.
num <- dim(data)[1]
library(sampling)
# Fixed seed so the random train/test split below is reproducible.
set.seed(1234)
# Draw 70% of the row indices (rounded down, without replacement) for training;
# all remaining row indices form the test set.
train_index <- sample(seq(1,num,1), floor(num * 0.7), replace = F)
test_index <- setdiff(seq(1,num,1), train_index)
train_data <- data[train_index,]
test_data <- data[test_index,]
library(knncat)
# Fit the classifier, telling knncat that the class label is in column 2.
# NOTE(review): in the dput() output shown for this data, column 2 is
# `longitude` (numeric) and the A-M label `mosaic_group` is column 3, so
# classcol = 2 looks like the cause of the reported error; confirm.
model <- knncat(train_data, classcol = 2)
Could anyone please take a look at the code and advise how I could eliminate this bug? Thank you very much!
The output of dput(head(data,100)) is as follows:
structure(list(latitude = c(52.7326028, 52.74287543, 52.82107841,
52.82025363, 52.81980596, 52.81721897, 52.81274172, 52.81274172,
52.8089586, 52.81424219, 52.8089586, 52.74007929, 52.77394023,
52.73659034, 52.73672518, 52.73764626, 52.73753744, 52.73659034,
52.73815233, 52.73679388, 52.73890319, 52.71697237, 52.63730282,
52.62720385, 52.63730282, 52.63543017, 52.63768035, 52.63510366,
52.6346578, 52.6346578, 52.6346578, 52.63447454, 52.63576418,
52.63447454, 52.6346578, 52.63447454, 52.69820719, 52.69603926,
52.68246919, 52.54600173, 52.54210198, 52.60628983, 52.61003275,
52.60278236, 52.60239604, 52.60348688, 52.60239604, 52.60382146,
52.60315644, 52.86047938, 52.86576353, 52.86954228, 52.81039471,
52.82094872, 52.82395073, 52.82444705, 52.88098384, 52.88469208,
52.88469208, 52.84979201, 52.84720159, 52.84831759, 52.82435938,
52.82319493, 52.82168337, 52.8230402, 52.8230402, 52.82513486,
52.82472379, 52.82756385, 52.82475438, 52.82434902, 52.82166611,
52.823712, 52.82401481, 52.82483489, 52.82103704, 52.82060763,
52.8208682, 52.82211317, 52.81868547, 52.8198332, 52.82023595,
52.81989134, 52.8196971, 52.82051066, 52.82463338, 52.82539131,
52.82580625, 52.82509199, 52.83759415, 52.83946254, 52.83946254,
52.83891871, 52.83821538, 52.84757879, 52.84663773, 52.8449371,
52.84592185, 52.84331619), longitude = c(-6.892397941, -6.915346343,
-6.922554014, -6.924997835, -6.926099967, -6.883340697, -6.897757597,
-6.897757597, -6.895500952, -6.883129556, -6.895500952, -6.703781864,
-6.680851783, -6.771845364, -6.773301282, -6.772958488, -6.77484647,
-6.771845364, -6.773422218, -6.772164896, -6.770622695, -6.784187251,
-6.901922588, -6.905109015, -6.901922588, -6.976679508, -6.973114498,
-6.974753462, -6.947990431, -6.947990431, -6.947990431, -6.976921427,
-6.958295227, -6.976921427, -6.947990431, -6.976921427, -6.902010609,
-6.915233457, -6.871160885, -6.832461149, -6.862126342, -6.943925285,
-6.93813643, -6.925128034, -6.932247524, -6.93461305, -6.932247524,
-6.934657053, -6.929283954, -6.845259603, -6.861188287, -6.866476268,
-6.940851164, -6.939203401, -6.930506188, -6.933317462, -6.929441954,
-6.922589037, -6.922589037, -6.926037258, -6.929423169, -6.917829279,
-6.938211918, -6.940658091, -6.940651748, -6.940107883, -6.940107883,
-6.938704642, -6.939084526, -6.933331264, -6.937496468, -6.937678962,
-6.940276221, -6.94018054, -6.939876475, -6.938983181, -6.934235666,
-6.93387209, -6.933134226, -6.934193569, -6.934383596, -6.933832641,
-6.937454656, -6.933818238, -6.93443811, -6.936913947, -6.920030341,
-6.920400963, -6.92215006, -6.910771124, -6.901500591, -6.899018998,
-6.899018998, -6.903007684, -6.90119821, -6.91063672, -6.909935672,
-6.90240965, -6.900066763, -6.901411136), mosaic_group = structure(c(10L,
10L, 8L, 8L, 8L, 7L, 7L, 7L, 7L, 7L, 7L, 10L, 10L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L,
12L, 12L, 12L, 12L, 12L, 12L, 10L, 10L, 10L, 13L, 13L, 13L, 13L,
9L, 6L, 6L, 6L, 6L, 6L, 10L, 8L, 8L, 9L, 9L, 9L, 9L, 7L, 7L,
7L, 9L, 9L, 9L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 8L, 8L, 8L, 8L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 8L,
6L, 6L, 6L, 6L, 6L, 8L, 8L, 10L, 10L, 10L), .Label = c("A", "B",
"C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M"), class = "factor"),
small_code = c(1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L,
4L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 8L, 8L, 8L, 9L,
9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 10L, 10L, 10L, 11L,
11L, 12L, 12L, 13L, 14L, 14L, 14L, 14L, 14L, 15L, 16L, 16L,
17L, 17L, 18L, 18L, 19L, 19L, 19L, 20L, 20L, 20L, 21L, 21L,
21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L,
22L, 22L, 22L, 22L, 23L, 23L, 23L, 23L, 23L, 23L, 24L, 24L,
24L, 25L, 26L, 26L, 26L, 26L, 26L, 27L, 27L, 28L, 28L, 28L
)), .Names = c("latitude", "longitude", "mosaic_group", "small_code"
), row.names = c(NA, 100L), class = "data.frame")
The function knncat::knncat accepts the argument classcol which is defined as:
Column with classification in it. Default: 1.
You have a data set with structure:
latitude longitude mosaic_group small_code
1 52.73260 -6.892398 J 1
2 52.74288 -6.915346 J 1
3 52.82108 -6.922554 H 2
4 52.82025 -6.924998 H 2
5 52.81981 -6.926100 H 2
6 52.81722 -6.883341 G 3
Therefore your argument should be classcol = 3 (or 4) I am assuming, but we can see that it certainly shouldn't be classcol = 2.
I want x- axis from 1 to 20 and y-axis from 1 to 6.
My data:
structure(list(HEI.ID = structure(c(12L, 9L, 14L, 19L, 20L, 1L,
7L, 5L, 11L, 3L, 10L, 18L, 2L, 8L, 6L, 15L, 13L, 17L, 4L, 16L
), .Label = c("BF", "CC", "DC", "ER", "IM", "MC", "ME ",
"MM", "MO", "OC", "OM", "OP", "SB", "SD", "SH", "SL", "SN", "TH",
"UN", "WS"), class = "factor"), X2007 = c(18L, 14L, 15L, 20L,
12L, 6L, 17L, 2L, 4L, 11L, 16L, 1L, 9L, 8L, 13L, 4L, 10L, 6L,
3L, 19L), X2008 = c(20L, 9L, 16L, 18L, 8L, 17L, 15L, 6L, 3L,
14L, 19L, 1L, 2L, 12L, 5L, 13L, 11L, 7L, 4L, 10L), X2009 = c(20L,
13L, 17L, 8L, 4L, 9L, 19L, 12L, 2L, 11L, 16L, 1L, 2L, 7L, 6L,
18L, 5L, 15L, 9L, 14L), X2010 = c(20L, 13L, 16L, 13L, 7L, 15L,
19L, 8L, 3L, 9L, 18L, 1L, 5L, 11L, 12L, 6L, 10L, 4L, 2L, 17L),
X2011 = c(20L, 2L, 16L, 14L, 6L, 10L, 17L, 8L, 3L, 15L, 19L,
1L, 4L, 18L, 13L, 11L, 8L, 12L, 4L, 7L), X2012 = c(20L, 12L,
19L, 13L, 8L, 14L, 15L, 10L, 11L, 9L, 17L, 2L, 7L, 18L, 5L,
16L, 3L, 4L, 6L, 1L)), .Names = c("HEI.ID", "X2007", "X2008",
"X2009", "X2010", "X2011", "X2012"), row.names = c(NA, -20L), class = "data.frame")
I use the following commands to draw histograms:
# Lay out plots on a 3 x 4 grid (12 panels per page).
par(mfrow = c(3,4))
# One histogram per row index 1..20; with 12 panels per page the loop draws
# more plots than one grid holds.
for(i in 1:20){
print(i)
# Histogram of row i (first column dropped), converted to numeric.
# NOTE(review): the doubled comma after nclass=12 passes an empty argument;
# confirm it is unintentional. HEIrank11 and STOF are not defined in the
# visible code; they are presumably data frames like the one shown above.
hist(as.numeric(HEIrank11[i,-1]),nclass=12,,main='students/faculty',
xlab = STOF[i,1],cex.lab=1, cex.axis=1, cex.main=1, cex.sub=1)
}
But after using the above commands, I get different ranges on the x-axis and y-axis in each plot.
I don't understand what your plot should look like. It's not clear from your question and the data provided.
I've tried to plot it. Please comment if you think it's the way to go.
Considering dt is your data.frame
library(reshape)
# Reshape to long format; the plot below uses the resulting `variable` and
# `value` columns
dt <- melt(dt)
library(ggplot2)
# Stacked bars: one bar per HEI.ID, one fill colour per `variable`
ggplot(
  data = dt,
  mapping = aes(x = HEI.ID, y = value, fill = variable)
) +
  geom_bar(stat = "identity")
or
# Same bars, split into one facet row per `variable`. Uses the melted data
# frame `dt`; no object named `dt1` is ever created.
ggplot(aes(x = HEI.ID, y = value, fill = variable), data = dt) +
  geom_bar(stat = "identity") +
  facet_grid(variable ~ .)
You could use xlim and ylim parameters in the hist function and control the axes using
axis:
# Draw the first 12 histograms on a 3 x 4 grid with identical, fixed axes.
# The previous graphics settings are saved and restored afterwards so that
# later plots are not forced into the same grid.
old_par <- par(mfrow = c(3, 4))
for (i in 1:12) {
  print(i)
  # Suppress the default axes (xaxt/yaxt = "n") so they can be drawn with
  # explicit tick positions below.
  hist(as.numeric(HEIrank11[i, -1]), nclass = 12, main = "students/faculty",
       xlim = c(0, 21), ylim = c(0, 6), xaxt = "n", yaxt = "n")
  axis(1, at = c(0, 10, 20))
  axis(2, at = 0:6)
}
par(old_par)
Do you really want your y-axis to go from 1 to 6? This will cut off parts of the bars.
Also, you iterate over all 20 rows for a grid with 12 plots. The code above gives the following plot: