Related
I have little experience in R and I need help tidying my plot, as it looks messy. Also, my project is to find the shortest route from Seoul to every city and back to Seoul. It is almost like the Traveling Salesman Problem (TSP), but some cities need to be visited more than once because they are the only way to reach certain other cities. I don't know how to do this or which packages to use.
This is my code for igraph plot
library(igraph)
# Undirected city network. Each consecutive pair of names below is one edge
# (146 edges in total); the edge order matters because edge attributes assigned
# afterwards are matched to edges by position.
# make_graph() is used instead of the deprecated graph() constructor, and FALSE
# is spelled out because the shorthand F is an ordinary, reassignable variable.
g1 <- make_graph(c("Seoul","Incheon","Seoul","Goyang","Seoul","Seongnam","Seoul",
"Bucheon","Seoul","Uijeongbu","Seoul","Gimpo",
"Seoul","Gwangmyeong", "Seoul", "Hanam","Seoul", "Guri",
"Seoul","Gwacheon","Busan","Changwon","Busan","Gimhae",
"Busan","Jeju","Busan","Yangsan","Busan","Geoje",
"Incheon","Goyang","Incheon","Bucheon","Incheon","Siheung",
"Incheon","Jeju","Incheon","Gimpo","Daegu","Gumi",
"Daegu","Gyeongsan","Daegu","Yeongcheon","Daejeon",
"Cheongju","Daejeon","Nonsan","Daejeon","Gongju",
"Daejeon","Gyeryong","Gwangju","Naju","Suwon","Yongin",
"Suwon","Seongnam","Suwon","Hwaseong","Suwon","Ansan",
"Suwon","Gunpo","Suwon","Osan","Suwon","Uiwang",
"Ulsan","Yangsan","Ulsan","Gyeongju","Ulsan","Miryang",
"Yongin","Seongnam","Yongin","Hwaseong","Yongin","Pyeongtaek",
"Yongin","Gwangju-si","Yongin","Icheon","Yongin","Anseong",
"Yongin","Uiwang","Goyang","Gimpo","Goyang","Paju","Goyang",
"Yangju","Changwon","Gimhae","Changwon","Jinju","Changwon",
"Miryang","Seongnam","Gwangju-si","Seongnam","Hanam","Seongnam",
"Uiwang","Seongnam","Gwacheon","Hwaseong","Ansan","Hwaseong",
"Pyeongtaek","Hwaseong","Gunpo","Hwaseong","Osan","Cheongju",
"Cheonan","Cheongju","Sejong","Bucheon","Siheung","Bucheon",
"Gwangmyeong","Ansan","Anyang","Ansan","Siheung","Ansan",
"Gunpo","Namyangju","Uijeongbu","Namyangju","Chuncheon",
"Namyangju","Hanam","Namyangju","Guri","Cheonan","Pyeongtaek",
"Cheonan","Sejong","Cheonan","Asan","Cheonan","Anseong",
"Jeonju","Gimje","Gimhae","Yangsan","Gimhae","Miryang",
"Pyeongtaek","Asan","Pyeongtaek","Osan","Pyeongtaek","Anseong",
"Pyeongtaek","Dangjin","Anyang","Siheung","Anyang","Gwangmyeong",
"Anyang","Gunpo","Anyang","Gwacheon","Siheung","Gwangmyeong",
"Siheung","Gunpo","Pohang","Yeongcheon","Pohang","Gyeongju",
"Jeju","Gimpo","Jeju","Mokpo","Jeju","Seogwipo","Uijeongbu",
"Yangju","Uijeongbu","Pocheon","Paju","Yangju","Gumi","Gimcheon",
"Gumi","Sangju","Gwangju-si","Hanam","Gwangju-si","Icheon",
"Gwangju-si","Yeoju","Sejong","Gongju","Wonju","Chungju",
"Wonju","Jecheon","Wonju","Yeoju","Jinju","Sacheon", "Yangsan",
"Miryang","Asan","Gongju","Iksan","Gunsan","Iksan","Nonsan",
"Iksan","Gimje","Chuncheon","Pocheon","Gyeongsan","Yeongcheon",
"Gunpo","Uiwang","Suncheon","Yeosu","Suncheon","Gwangyang",
"Gunsan","Gimje","Gyeongju","Yeongcheon","Geoje","Tongyeong",
"Osan","Anseong","Yangju","Pocheon","Yangju","Dongducheon",
"Icheon","Anseong","Icheon","Yeoju","Mokpo","Naju","Chungju",
"Jecheon","Chungju","Yeoju","Chungju","Mungyeong","Gangneung",
"Donghae","Gangneung","Sokcho","Seosan","Dangjin","Andong",
"Yeongju","Pocheon","Dongducheon","Gimcheon","Sangju","Tongyeong",
"Sacheon","Nonsan","Gongju","Nonsan","Boryeong","Nonsan",
"Gyeryong","Gongju","Boryeong","Gongju","Gyeryong","Jeongeup",
"Gimje","Yeongju","Mungyeong","Yeongju","Taebaek","Sangju",
"Mungyeong","Sokcho","Samcheok","Samcheok","Taebaek",
"Suncheon","Gwangju"), directed = FALSE)
# Edge lengths stored as the `distance` edge attribute: 146 values, one per
# edge of g1, given in the same order in which the edges were listed when the
# graph was built.
E(g1)$distance <- c(27, 16, 20, 19, 20, 24, 14, 20, 15, 15, 36, 18, 299, 18, 53,
25, 8, 12, 440, 18, 36, 13, 33, 33, 31, 26, 15, 20, 13, 20,
19, 18, 13, 16, 10, 33, 36, 51, 24, 31, 28, 21, 23, 27, 22,
11, 12, 24, 18, 52, 27, 11, 13, 19, 13, 14, 34, 20, 23, 38,
18, 12, 9, 12, 7, 10, 19, 53, 11, 8, 20, 27, 11, 26, 24, 18,
33, 25, 18, 15, 44, 14, 12, 4, 5, 12, 12, 37, 21, 458, 146,
27, 10, 23, 24, 21, 36, 14, 23, 36, 21, 39, 33, 26, 20, 32,
40, 20, 29, 18, 47, 24, 4, 27, 19, 22, 29, 17, 24, 18, 13,
32, 18, 37, 28, 43, 51, 33, 56, 20, 28, 12, 30, 38, 29, 47,
17, 47, 22, 26, 46, 51, 20, 10, 36,63)
# Draw the network with small vertex markers and labels, and print each edge's
# distance next to it.
plot(
  g1,
  vertex.size = 4,
  vertex.label.cex = 0.6,
  edge.label = E(g1)$distance
)
igraph plot
Using trick from https://or.stackexchange.com/questions/5555/tsp-with-repeated-city-visits
library(data.table)
library(purrr)
library(TSP)
library(igraph)
We need to create distance matrix based on shortest paths for each pair of vertices:
# Build a long table with one row per unordered vertex pair (from, to) holding
# the vertex names along the shortest path between the two.
vertex_names <- names(V(g1))
N <- length(vertex_names)
dt <- map(
  # Vertex i is paired only with vertices i + 1 .. N, so the last vertex never
  # appears as `from`.
  head(seq_along(vertex_names), -1),
  ~data.table(
    from = vertex_names[[.x]],
    to = vertex_names[(.x + 1):N],
    path = map(
      # The edge lengths live in the `distance` attribute, not in `weight`, so
      # they have to be passed explicitly. Without `weights` the search is
      # unweighted and returns the path with the fewest edges, which is not
      # necessarily the shortest one by distance.
      shortest_paths(
        g1,
        from = vertex_names[[.x]],
        to = vertex_names[(.x + 1):N],
        weights = E(g1)$distance
      )[["vpath"]],
      names
    )
  )
) %>%
  rbindlist()
then we calculate distances of shortest paths:
# Dense matrix of direct edge distances, looked up by vertex name.
m <- as_adjacency_matrix(
  g1,
  type = "both",
  attr = "distance",
  sparse = FALSE
)
# For every stored path, pair each vertex with its successor and add up the
# distances of those consecutive edges.
dt[, weight := map_dbl(
  path,
  function(p) {
    hops <- embed(p, 2)[, 2:1, drop = FALSE]
    sum(m[hops])
  }
)]
now we assemble new matrix:
# Add the mirror image of every (from, to) row so both directions are present;
# the stored path is reversed to match.
mirrored <- dt[, .(
  from = to,
  to = from,
  path = map(path, rev),
  weight = weight
)]
dt <- rbind(dt, mirrored)
# Square matrix of path weights keyed by vertex name; cells without a matching
# row in dt keep their initial 0.
new_m <- matrix(
  0,
  nrow = N,
  ncol = N,
  dimnames = list(vertex_names, vertex_names)
)
new_m[cbind(dt[["from"]], dt[["to"]])] <- dt[["weight"]]
on this new matrix we use some heuristic to solve TSP (for exact solution you should use method="concorde"):
# Wrap the path-weight matrix as a TSP instance and run the solver with 1000
# repetitions and two-opt switched on.
res <- solve_TSP(TSP(new_m), repetitions = 1000, two_opt = TRUE)
now we exchange each pair of consecutive cities with shortest path:
# Cut the tour at start_city and put start_city at both ends, then look up the
# stored path for every pair of consecutive tour stops.
start_city <- "Seoul"
tour_stops <- c(start_city, labels(cut_tour(res, start_city)), start_city)
legs <- embed(tour_stops, 2)[, 2:1, drop = FALSE]
colnames(legs) <- c("from", "to")
path_dt <- as.data.table(legs)
path_dt <- dt[path_dt, on = .(from, to)]
# Drop the last vertex of each leg (it is the first vertex of the next leg) and
# close the walk with start_city.
leg_prefixes <- map(path_dt[["path"]], head, -1)
my_path <- c(unlist(leg_prefixes), start_city)
my_path is a heuristic solution with total distance tour_length(res).
I created a graph G and I have a node view as following < 0, 1,2,... 100>
I randomly removed 20 nodes, and the node view of this new graph is missing the nodes I removed. To be precise, some nodes are missing in the new graph (since they were removed), for example:
node view <0,1,3,5,6,7,9 ...100>
however, I want this graph to be a new graph having node view such as the following:
<0,1,2....80>
Is there any solution? I tried relabeling and copying the same graph, but neither worked.
PS. my nodes have attribute label equal to either 0,1
and i want to preserve them
Here is one approach you can take. After removing your nodes from the graph you can relabel the remaining nodes using nx.relabel_nodes to get the node view you want. See example below:
import networkx as nx
import numpy as np
# Create a random graph
N_nodes = 50
G = nx.erdos_renyi_graph(N_nodes, p=0.25)

# Remove a random subset of nodes
N_del_nodes = 10
del_node_list = np.random.choice(N_nodes, size=N_del_nodes, replace=False)
G.remove_nodes_from(del_node_list)
print('Node view without relabelling:' +str(G.nodes))

# Relabel the surviving nodes to 0 .. len(G) - 1, keeping their current order.
# enumerate() walks the node view exactly once and adapts to however many nodes
# are left, so no list is rebuilt per index and no node count is hard-coded.
label_mapping = {old_label: new_label for new_label, old_label in enumerate(G.nodes)}
G_rel = nx.relabel_nodes(G, label_mapping)
print('Node view with relabelling:' +str(G_rel.nodes))
And the output gives:
Node view without relabelling:[0, 1, 2, 5, 6, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 30, 31, 32, 33, 34, 36, 37, 38, 40, 41, 44, 45, 46, 47, 48, 49]
Node view with relabelling:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39]
I've got a loop in my code that I would like to rewrite so that running the code takes a little less time to complete. I know you should always try to avoid loops in R code, but I can't think of another way to accomplish my goal.
So I've got a dataset "df_1531" containing a lot of data that I need to cut into pieces by using subset() (if anyone knows a better way, let me know ;) ). I've got a vector with 21 variable names, to each of which I'd like to assign a subset of df_1531. Furthermore, the script contains 22 variables with constraints (shift_XY_time).
So, this is my code now...
# Slot boundaries in order: startdate followed by the 21 shift_*_time values,
# combined with c(). Entry i and entry i + 1 serve further down as the lower
# and upper bound of slot i.
# NOTE(review): the boundary values are defined outside this snippet; they are
# presumably sorted date-times comparable with the 'PLS FFM' column - confirm.
shift_time_list<- c(startdate, shift_1m_time, shift_1a_time, shift_1n_time,
shift_2m_time, shift_2a_time, shift_2n_time,
shift_3m_time, shift_3a_time, shift_3n_time,
shift_4m_time, shift_4a_time, shift_4n_time,
shift_5m_time, shift_5a_time, shift_5n_time,
shift_6m_time, shift_6a_time, shift_6n_time,
shift_7m_time, shift_7a_time, shift_7n_time)
# Names of the 21 subset variables: "shift_<k><s>_sub" for k = 1..7 and, within
# each k, the suffix letters s = m, a, n in that order.
shift_sub_list <- paste0(
  "shift_",
  rep(1:7, each = 3),
  c("m", "a", "n"),
  "_sub"
)
# Create one data frame per slot: the rows of df_1531 whose 'PLS FFM' value
# lies in [shift_time_list[i], shift_time_list[i + 1]) are stored under the
# name shift_sub_list[i].
# The column is extracted once rather than twice per iteration, and the loop
# length follows shift_sub_list instead of a hard-coded count.
pls_ffm <- df_1531[["PLS FFM"]]
for (i in seq_along(shift_sub_list)) {
  in_slot <- pls_ffm >= shift_time_list[i] & pls_ffm < shift_time_list[i + 1]
  # which() skips NA comparisons, matching subset(), which treats them as FALSE.
  assign(shift_sub_list[i], df_1531[which(in_slot), , drop = FALSE])
}
Running the loop takes approximately 6 or 7 seconds. So, if anyone knows a better, cleaner or quicker way to write my code, I would very much like to hear your suggestion or opinion.
**Reproducible example**
# Use the built-in `cars` data set (columns speed and dist) as stand-in data.
mydata <- cars
# dput() prints an R expression that recreates the data; the structure(...)
# call below is that printed output (50 rows).
dput(cars)
structure(list(speed = c(4, 4, 7, 7, 8, 9, 10, 10, 10, 11, 11,
12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 16,
16, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 20, 20, 20, 20, 20,
22, 23, 24, 24, 24, 24, 25), dist = c(2, 10, 4, 22, 16, 10, 18,
26, 34, 17, 28, 14, 20, 24, 28, 26, 34, 34, 46, 26, 36, 60, 80,
20, 26, 54, 32, 40, 32, 40, 50, 42, 56, 76, 84, 36, 46, 68, 32,
48, 52, 56, 64, 66, 54, 70, 92, 93, 120, 85)), class = "data.frame", row.names = c(NA,
-50L))
# Interval boundaries for `dist`: 0, 5, 10, ..., 120 (25 break points).
dist_interval_list <- seq(0, 120, by = 5)
# Names of the 24 per-interval variables: "var_name_<k><s>" for k = 1..6 and,
# within each k, the suffix letters s = a, b, c, d in that order.
var_name_list <- paste0(
  "var_name_",
  rep(1:6, each = 4),
  c("a", "b", "c", "d")
)
# Create one data frame per interval: the rows of mydata whose dist lies in
# [dist_interval_list[i], dist_interval_list[i + 1]) are stored under the name
# var_name_list[i].
# The column is extracted once rather than twice per iteration, and the loop
# length follows var_name_list instead of a hard-coded count.
dist_values <- mydata[["dist"]]
for (i in seq_along(var_name_list)) {
  in_interval <- dist_values >= dist_interval_list[i] &
    dist_values < dist_interval_list[i + 1]
  # which() skips NA comparisons, matching subset(), which treats them as FALSE.
  assign(var_name_list[i], mydata[which(in_interval), , drop = FALSE])
}
Starting with the 'reproducible' part and the information that the final aim is to summarize another column, it is possible to exploit the fact that the intervals are non-overlapping and simply use the cut function.
library(tidyverse)
# Sum `speed` per distance interval in one pass, without materialising a
# separate data frame for every interval.
mydata %>%
  # right = FALSE makes the bins left-closed, [lower, upper), which is what a
  # `dist >= lower & dist < upper` filter selects; cut()'s default would be
  # (lower, upper]. Values outside every bin (here dist == 120) get an NA
  # interval; add include.lowest = TRUE to fold the top break into the last bin.
  mutate(interval = cut(dist, breaks = dist_interval_list, right = FALSE)) %>%
  group_by(interval) %>%
  summarise(sum = sum(speed))
This should be much faster and will also help you not to get lost in a messy environment full of variables (which are actually part of your data). You want to keep all your data in a single data frame as long as possible;) You probably want to follow with something like purrrlyr::invoke_rows at the final modeling step, if your function does not work with data frames.
I use the DiagrammeR library in R to create and render binary trees. I find it very simple to use and it creates high-quality renders. However, creating a tree that is not full (perfect) generates messy renders.
Here is what I get when my tree has 16 leaves (h = 4):
Fully binary tree
To be clear, every node label is the row name of nodes data.frame which indicates the sequence of nodes passed to the graph:
# Label every node with its row name in the nodes data frame.
nodes[["label"]] <- rownames(nodes)
And here is what I get if I add one node [32] from node [31] - either manually or by add_node() and add_edge() functions:
Non-perfect binary tree
As you can see, everything goes messy. I would like to have node [32] directly under node [31] with edge of straight vertical line. Is it even possible with this library? I can't figure out the proper order of nodes in nodes data.frame.
Here is how my full code looks like:
library(DiagrammeR)
# Binary tree of height h with n = 2^(h + 1) - 1 nodes, numbered 1..n.
h <- 4
n <- 2^(h + 1) - 1
# Parent -> child edges: every parent id appears twice, once per child.
from <- rep(
  c(1, 2, 3, 4, 7, 10, 11, 14, 17, 18, 19, 22, 25, 26, 29),
  each = 2
)
to <- c(
  2, 17, 3, 10, 4, 7, 5, 6, 8, 9, 11, 14, 12, 13, 15, 16,
  18, 25, 19, 22, 20, 21, 23, 24, 26, 29, 27, 28, 30, 31
)
edges <- data.frame(from, to)
nodes <- data.frame(id = 1:n, label = 1:n, shape = "circle")
g1 <- create_graph(nodes, edges)
render_graph(g1, layout = "tree", title = "g1")
# Extend the tree: append node 32 and connect it as a child of node 31.
nodes2 <- nodes
nodes2[32, c(1, 2)] <- 32
nodes2[32, 3] <- "circle"
edges2 <- rbind(edges, c(31, 32))
g2 <- create_graph(nodes2, edges2)
render_graph(g2, layout = "tree", title = "g2")
I have plotted a scatter plot with the point size scaled by frequency:
# Scatter plot of Seam.Cell.Number per Treatment; point colour encodes Strain
# and point size encodes Frequency (the size legend is hidden).
g <- ggplot(d, aes(x = Treatment, y = Seam.Cell.Number, size = Frequency)) +
  geom_point(aes(colour = Strain)) +
  # Size breaks: the integers 0..50 without 33.
  scale_size_continuous(range = c(3, 10), breaks = c(0:32, 34:50)) +
  # "none" is the supported way to drop a legend; passing FALSE to guides() is
  # deprecated in current ggplot2.
  guides(size = "none")
Now I am trying to plot means with standard error bars on top. I have the mean and standard error already calculated in columns in my csv file. So far I have attempted:
# First attempt: add mean points and standard-error bars on top of g.
g +
  geom_point(aes(x = Treatment, y = Mean)) +
  # width is a fixed setting, not a data mapping, so it is passed outside aes().
  geom_errorbar(
    aes(ymin = Mean - Standard.Error, ymax = Mean + Standard.Error),
    width = .4
  ) +
  theme(axis.text.x = element_blank()) +
  theme(legend.key = element_rect(colour = "black"))
And:
# Second attempt: add the mean points through an explicit layer() call.
g +
  # layer() has no defaults for stat and position in current ggplot2, so both
  # are given explicitly; "identity" leaves the data and positions untouched.
  layer(
    data = d,
    mapping = aes(x = Treatment, y = Mean),
    geom = "point",
    stat = "identity",
    position = "identity"
  ) +
  geom_errorbar(
    aes(ymin = Mean - Standard.Error, ymax = Mean + Standard.Error),
    width = .4
  ) +
  ylab("Seam Cell Number")
But they both give me very fat error bars/data points. It seems they are being affected by my size scaling in object g. I have tried to modify the size and width of the error bars, and I have tried to modify the size of the data points, both in these last bits of code, but to no avail. Is there a way to 'cancel' the size command for this layer?
If you reverse the order of your ggplot, you may be able to avoid the size distortion on the error bars.
Not having reproducible data, I made some up.
# Made-up stand-in data: 100 rows, with the four strains repeated in turn.
df <- data.frame(
  Treatment = seq_len(100),
  Seam.Cell.Number = 3:102,
  Frequency = 5:104,
  Strain = rep(c("A", "B", "C", "D"), times = 25)
)

# Standard error of the mean of x.
std <- function(x) {
  sd(x) / sqrt(length(x))
}

# Overall mean and standard error of Treatment; the standard error is stored
# as a constant column.
Mean <- mean(df[["Treatment"]])
df[["Standard.Error"]] <- std(df[["Treatment"]])
# Base plot with the mean points and error bars. Size is NOT a plot-wide
# aesthetic here, so these layers cannot inherit the frequency scaling.
g <- ggplot(df, aes(x = Treatment, y = Seam.Cell.Number)) +
  geom_point(aes(x = Treatment, y = Mean)) +
  # Columns are referenced by bare name inside aes() (no df$ prefix), and the
  # constant width is set outside aes().
  geom_errorbar(
    aes(ymin = Mean - Standard.Error, ymax = Mean + Standard.Error),
    width = .4
  ) +
  theme(axis.text.x = element_blank()) +
  theme(legend.key = element_rect(colour = "black"))

# Scatter layer: size is mapped to Frequency in this layer only; without that
# mapping the size scale and the size guide below would have nothing to act on.
g + geom_point(aes(colour = Strain, size = Frequency)) +
  # Size breaks: the integers 0..50 without 33.
  scale_size_continuous(range = c(3, 10), breaks = c(0:32, 34:50)) +
  # "none" is the supported way to drop a legend; passing FALSE to guides() is
  # deprecated in current ggplot2.
  guides(size = "none")