I have less experience in R and I need help tidying my plot as it looks messy. Also, my project is to find the best minimal route from Seoul to every city and back to Seoul. It is almost like Traveling Salesman Problem (TSP) but there are some cities needed to be visited more than once as it is the only way to reach certain cities. I don't know how to do and what packages to use.
This is my code for igraph plot
library(igraph)
g1 <- graph( c("Seoul","Incheon","Seoul","Goyang","Seoul","Seongnam","Seoul",
"Bucheon","Seoul","Uijeongbu","Seoul","Gimpo",
"Seoul","Gwangmyeong", "Seoul", "Hanam","Seoul", "Guri",
"Seoul","Gwacheon","Busan","Changwon","Busan","Gimhae",
"Busan","Jeju","Busan","Yangsan","Busan","Geoje",
"Incheon","Goyang","Incheon","Bucheon","Incheon","Siheung",
"Incheon","Jeju","Incheon","Gimpo","Daegu","Gumi",
"Daegu","Gyeongsan","Daegu","Yeongcheon","Daejeon",
"Cheongju","Daejeon","Nonsan","Daejeon","Gongju",
"Daejeon","Gyeryong","Gwangju","Naju","Suwon","Yongin",
"Suwon","Seongnam","Suwon","Hwaseong","Suwon","Ansan",
"Suwon","Gunpo","Suwon","Osan","Suwon","Uiwang",
"Ulsan","Yangsan","Ulsan","Gyeongju","Ulsan","Miryang",
"Yongin","Seongnam","Yongin","Hwaseong","Yongin","Pyeongtaek",
"Yongin","Gwangju-si","Yongin","Icheon","Yongin","Anseong",
"Yongin","Uiwang","Goyang","Gimpo","Goyang","Paju","Goyang",
"Yangju","Changwon","Gimhae","Changwon","Jinju","Changwon",
"Miryang","Seongnam","Gwangju-si","Seongnam","Hanam","Seongnam",
"Uiwang","Seongnam","Gwacheon","Hwaseong","Ansan","Hwaseong",
"Pyeongtaek","Hwaseong","Gunpo","Hwaseong","Osan","Cheongju",
"Cheonan","Cheongju","Sejong","Bucheon","Siheung","Bucheon",
"Gwangmyeong","Ansan","Anyang","Ansan","Siheung","Ansan",
"Gunpo","Namyangju","Uijeongbu","Namyangju","Chuncheon",
"Namyangju","Hanam","Namyangju","Guri","Cheonan","Pyeongtaek",
"Cheonan","Sejong","Cheonan","Asan","Cheonan","Anseong",
"Jeonju","Gimje","Gimhae","Yangsan","Gimhae","Miryang",
"Pyeongtaek","Asan","Pyeongtaek","Osan","Pyeongtaek","Anseong",
"Pyeongtaek","Dangjin","Anyang","Siheung","Anyang","Gwangmyeong",
"Anyang","Gunpo","Anyang","Gwacheon","Siheung","Gwangmyeong",
"Siheung","Gunpo","Pohang","Yeongcheon","Pohang","Gyeongju",
"Jeju","Gimpo","Jeju","Mokpo","Jeju","Seogwipo","Uijeongbu",
"Yangju","Uijeongbu","Pocheon","Paju","Yangju","Gumi","Gimcheon",
"Gumi","Sangju","Gwangju-si","Hanam","Gwangju-si","Icheon",
"Gwangju-si","Yeoju","Sejong","Gongju","Wonju","Chungju",
"Wonju","Jecheon","Wonju","Yeoju","Jinju","Sacheon", "Yangsan",
"Miryang","Asan","Gongju","Iksan","Gunsan","Iksan","Nonsan",
"Iksan","Gimje","Chuncheon","Pocheon","Gyeongsan","Yeongcheon",
"Gunpo","Uiwang","Suncheon","Yeosu","Suncheon","Gwangyang",
"Gunsan","Gimje","Gyeongju","Yeongcheon","Geoje","Tongyeong",
"Osan","Anseong","Yangju","Pocheon","Yangju","Dongducheon",
"Icheon","Anseong","Icheon","Yeoju","Mokpo","Naju","Chungju",
"Jecheon","Chungju","Yeoju","Chungju","Mungyeong","Gangneung",
"Donghae","Gangneung","Sokcho","Seosan","Dangjin","Andong",
"Yeongju","Pocheon","Dongducheon","Gimcheon","Sangju","Tongyeong",
"Sacheon","Nonsan","Gongju","Nonsan","Boryeong","Nonsan",
"Gyeryong","Gongju","Boryeong","Gongju","Gyeryong","Jeongeup",
"Gimje","Yeongju","Mungyeong","Yeongju","Taebaek","Sangju",
"Mungyeong","Sokcho","Samcheok","Samcheok","Taebaek",
"Suncheon","Gwangju"), directed=F)
E(g1)$distance <- c(27, 16, 20, 19, 20, 24, 14, 20, 15, 15, 36, 18, 299, 18, 53,
25, 8, 12, 440, 18, 36, 13, 33, 33, 31, 26, 15, 20, 13, 20,
19, 18, 13, 16, 10, 33, 36, 51, 24, 31, 28, 21, 23, 27, 22,
11, 12, 24, 18, 52, 27, 11, 13, 19, 13, 14, 34, 20, 23, 38,
18, 12, 9, 12, 7, 10, 19, 53, 11, 8, 20, 27, 11, 26, 24, 18,
33, 25, 18, 15, 44, 14, 12, 4, 5, 12, 12, 37, 21, 458, 146,
27, 10, 23, 24, 21, 36, 14, 23, 36, 21, 39, 33, 26, 20, 32,
40, 20, 29, 18, 47, 24, 4, 27, 19, 22, 29, 17, 24, 18, 13,
32, 18, 37, 28, 43, 51, 33, 56, 20, 28, 12, 30, 38, 29, 47,
17, 47, 22, 26, 46, 51, 20, 10, 36,63)
plot(g1, edge.label=E(g1)$distance,
vertex.label.cex=0.6, vertex.size=4)
igraph plot
Using trick from https://or.stackexchange.com/questions/5555/tsp-with-repeated-city-visits
library(data.table)
library(purrr)
library(TSP)
library(igraph)
We need to create distance matrix based on shortest paths for each pair of vertices:
vertex_names <- names(V(g1))
N <- length(vertex_names)
dt <- map(
head(seq_along(vertex_names), -1),
~data.table(
from = vertex_names[[.x]],
to = vertex_names[(.x+1):N],
path = map(
shortest_paths(g1, vertex_names[[.x]], vertex_names[(.x+1):N])[["vpath"]],
names
)
),
) %>%
rbindlist()
then we calculate distances of shortest paths:
m <- as_adjacency_matrix(g1, type = "both", attr = "distance", sparse = FALSE)
dt[, weight := map_dbl(path, ~sum(m[embed(.x, 2)[, 2:1, drop=FALSE]]))]
now we assemble new matrix:
dt <- rbind(
dt, dt[, .(from = to, to = from, path = map(path, rev), weight = weight)]
)
new_m <- matrix(0, N, N)
rownames(new_m) <- colnames(new_m) <- vertex_names
new_m[as.matrix(dt[, .(from,to)])] <- dt[["weight"]]
on this new matrix we use some heuristic to solve TSP (for exact solution you should use method="concorde"):
res <- new_m %>%
TSP() %>%
solve_TSP(repetitions = 1000, two_opt = TRUE)
now we exchange each pair of consecutive cities with shortest path:
start_city <- "Seoul"
path_dt <- c(start_city, labels(cut_tour(res, start_city)), start_city) %>%
embed(2) %>%
.[,2:1,drop = FALSE] %>%
"colnames<-"(c("from", "to")) %>%
as.data.table()
path_dt <- dt[path_dt, on = .(from ,to)]
my_path <- c(unlist(map(path_dt[["path"]], head, -1)), start_city)
my_path is heuristic solution with distance tour_length(res)
Related
I have the following data:
dput(example)
structure(list(q1 = c(5, 22, 16, 24, 9, 20, 21, 16, 28, 28, 24,
25, 34, 22, 29, NA, 24, 13, 10, 17, 24, 21, 22, 35, 20, 25, 25,
23, 22, 20, 27, 22, 20, 23, 5, 21, 19, 17, 27, 20, 35, 35, 10,
16, 22, 34, 34, 23, 25, 23, 25, 30, 18, 21, 15, 23, 5, 35, 5,
30), q2 = c(5, 5, 24, 15, 5, 5, 26, 23, 24, 9, 24, 5, 15, 26,
30, 14, 14, 19, 11, 25, 20, 5, 14, 13, 11, 10, 13, 16, 16, 21,
10, 12, 20, 9, 15, 5, 13, 5, 30, 18, 12, 27, 10, 9, 20, 5, 9,
10, 11, 26, 22, 8, 6, 5, 15, 6, 5, 35, 10, 18), q3 = c(11, 22,
NA, 22, 6, 18, 30, 6, 26, NA, 17, 22, 33, 19, 22, 25, 23, 13,
13, 15, 16, 16, 23, 24, 6, 25, 27, 12, 25, 17, 28, 15, 20, 31,
5, 17, 17, 20, 24, 7, 35, 35, 10, 10, 20, 10, 31, 21, 16, 32,
25, 30, 10, 24, 15, 24, 5, 35, 9, 26), q4 = c(14, 15, 23, 21,
NA, 25, 30, 23, 28, 20, 25, 5, 35, 30, 19, 23, 30, 5, 23, 18,
30, 15, 30, 22, 8, 29, 35, 23, 23, 24, 25, 25, 20, 25, 5, 15,
34, 8, 32, 35, 35, 35, 10, 6, 21, 10, 24, 27, 10, 30, 35, 15,
6, 21, 15, 15, 5, 35, 19, 26), q5 = c(5, 18, 21, 19, 5, 6, 5,
29, 20, 23, 22, 5, 16, 22, 12, 13, 18, 5, 17, 15, 18, 16, 20,
8, 12, 19, 12, 23, 9, 16, 5, 29, 20, 5, 5, 5, 5, 5, 30, 22, 32,
35, 10, 13, 20, 13, 12, 16, 5, 24, 22, 17, 5, 20, 14, 5, 5, 35,
15, 16), q6 = c(15, 9, 25, 26, 6, 17, 28, 32, 26, 28, 24, 25,
11, 24, 31, 18, 19, 6, 20, 26, 29, 17, 21, 24, 7, 29, 17, 17,
14, 25, 24, 35, 24, 6, 16, 6, 9, 6, 38, 19, 30, 42, 12, 20, 27,
26, 25, 13, 9, 36, 27, 27, 7, 24, 22, 6, 16, 42, 14, 11)), class = "data.frame", row.names = c(NA,
-60L))
I then use mice:
*edit: forgot the complete line
library(mice)
imp <- mice(example,m=5,maxit=50,meth='pmm',seed=500)
example_i <- complete(imp,1)
But when trying to get a densityplot I get the following error:
densityplot(imp)
Error in str2lang(x) : <text>:2:0: unexpected end of input
1: ~
^
My questions are:
Is there something fundamentally wrong about my approach to impute missing data? (this is just a small example)
Am I using properly the MICE arguments?
What am I doing wrong with the density plot, as I have gotten it for all of the other scales I am working with?
Answer
You need to supply a formula to densityplot, otherwise it will plot all variables with > 2 missing values. Since you don't have any variables with 2 > missing values, and since densityplot doesn't expect that, it produces this cryptic error.
Example that works
example$q4[1:10] <- NA
imp <- mice(example, m = 5, maxit = 50, meth = "pmm", seed = 500)
densityplot(imp)
# equivalent: densityplot(imp, ~ q4)
Rationale
imp is of class mids, so you are calling densityplot.mids. Normally, densityplot.mids requires you to provide a formula (data argument), so that it knows which variables to plot (see ?densityplot.mids). If you want to plot q4, then the code is densityplot(imp, ~ q4).
Inside densityplot.mids, we see:
if (missing(data)) {
vnames <- vnames[!allfactors & x$nmis > 2 & x$nmis <
nrow(x$data) - 1]
formula <- as.formula(paste("~", paste(vnames,
collapse = "+", sep = ""), sep = ""))
}
If we use traceback() right after getting your error, then you will see that the last line above is the line that throws the error.
In the first line, you can see the condition xnmis > 2, which means that it will grab all the columns with more than 2 missing values. When no columns satisfy the conditions, then vnames will evaluate to character(0), and so the subsequent line yields as output ~, i.e. the code that you see in your error.
So, why does it give an error when there are too few missings? That's because densityplot plots a distribution, and plotting a distribution of 1 or 2 points is just not doable.
Suggestion
The package maintainers could improve the error by simply checking whether vnames has any content, and if not, they can throw an error that is informative. You may want to add this as an issue on Github if you think it is useful.
I have created a simple minimum spanning tree and now have a data frame with columns 'from', 'to' and 'distance'.
Based on this, I found communities using the Louvain method, which I plotted. As far as I understand it, for clustering and plotting I need only the columns from and to, and the distance is not used.
How can I keep the communities I found, ideally each in a different color, but remove the box around the communities?
library(igraph)
from <- c(14, 25, 18, 19, 29, 23, 24, 36, 5, 22, 21, 29, 18, 26, 2, 45, 8, 7, 36, 42, 3, 23, 13, 13, 20, 15, 13, 7, 28, 9, 6, 37, 8, 4, 15, 27, 10, 2, 39, 1, 43, 21, 14, 4, 14, 8, 9, 40, 31, 1)
to <- c(16, 26, 27, 20, 32, 34, 35, 39, 6, 32, 35, 30, 22, 28, 45, 46, 48, 12, 38, 43, 42, 24, 27, 25, 30, 20, 50, 29, 34, 49, 40, 39, 11, 41, 46, 47, 50, 16, 46, 40, 44, 31, 17, 40, 44, 23, 33, 42, 33, 1)
distance <- c(0.3177487, 0.3908324, 0.4804059, 0.4914682, 0.5610357, 0.6061082, 0.6357532, 0.6638961, 0.7269725, 0.8136463, 0.8605391, 0.8665838, 0.8755252, 0.8908454, 0.9411793, 0.9850834, 1.0641603, 1.0721154, 1.0790506, 1.1410964, 1.1925349, 1.2115428, 1.2165045, 1.2359032, 1.2580204, 1.2725243, 1.2843610, 1.2906908, 1.3070725, 1.3397053, 1.3598817, 1.3690732, 1.3744088, 1.3972220, 1.4472312, 1.4574936, 1.4654772, 1.4689660, 1.5999424, 1.6014316, 1.6305410, 1.6450413, 1.6929959, 1.7597620, 1.8113320, 2.0380866, 3.0789517, 4.0105981, 5.1212614, 0.0000000)
mst <- cbind.data.frame(from, to, distance)
g <- graph.data.frame(mst[, 1:2], directed = FALSE)
lou <- cluster_louvain(g)
set.seed(1)
plot(lou, g, vertex.label = NA, vertex.size=5)
The blobs around the groups can be turned off like this:
plot(lou, g, vertex.label = NA, vertex.size=5, mark.groups = NULL)
Do you want this?
plot(lou, g, vertex.label = NA, vertex.size = 5, mark.border = NA)
While there are functions for saving data as a separate CSV file (write.table) or as an R-data file (save, saveRDS), I have not found a way to store or print a data frame as R code that recreates this data frame.
Background of my question is that I want to include data with a script (instead of storing it in a separate file), and am thus looking for a way to generate the specific code provided the data frame already exists. I could hack on with sed or other external tools, but I wonder whether someone knows of a built-in method in R.
Try with "dput" like so:
dput(cars)
# Returns:
structure(list(speed = c(4, 4, 7, 7, 8, 9, 10, 10, 10, 11, 11,
12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 16,
16, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 20, 20, 20, 20, 20,
22, 23, 24, 24, 24, 24, 25), dist = c(2, 10, 4, 22, 16, 10, 18,
26, 34, 17, 28, 14, 20, 24, 28, 26, 34, 34, 46, 26, 36, 60, 80,
20, 26, 54, 32, 40, 32, 40, 50, 42, 56, 76, 84, 36, 46, 68, 32,
48, 52, 56, 64, 66, 54, 70, 92, 93, 120, 85)), class = "data.frame",
row.names = c(NA, -50L))
I'm having difficulty using the Tidyr function "complete()" to fill in columns for absent weeks. While the complete() function does work, it loops through the entire year 35 times and fills in 4,375 entries rather than just 125.
In short, when I try to use the complete function, it does not just complete the dataframe but duplicates all columns 35 times.
I have tried several different approaches including with and w/o the full_seq function.
1st approach:
Df %>%
group_by(week = week(`Local Start Time`)) %>%
complete(week = 1:52)
Second approach:
Df %>%
group_by(week = week(`Local Start Time`)) %>%
complete(week = full_seq(week <- c(1:52), 1L))
I expected the dataframe to stop at row 125 but instead the complete() loops over the entire yearly data (35 times!) and continues until column 4375.
Any advice is appreciated, thanks!
The data I used is here...
structure(list(`Local Start Time` = structure(c(1483846399, 1483846519,
1483851979, 1484734742, 1485017522, 1485190862, 1486236902, 1486238462,
1486347422, 1486448822, 1487221742, 1487392502, 1487449502, 1487678750,
1487679111, 1487679411, 1487683370, 1488321651, 1488745130, 1489353950,
1489710710, 1491043550, 1492036467, 1492105535, 1492150284, 1492180823,
1492772358, 1493428578, 1493440398, 1493465717, 1493476518, 1493484558,
1493495837, 1493622558, 1493639598, 1493718078, 1493718858, 1493720778,
1495021772, 1495598357, 1495599017, 1496175677, 1496428517, 1496439678,
1496494637, 1496632757, 1496887457, 1496887757, 1496888117, 1497031577,
1497207557, 1497318797, 1497368657, 1497491178, 1497558017, 1497857478,
1498220117, 1498245977, 1498246577, 1498255277, 1498257797, 1499203517,
1499470577, 1500752057, 1500899837, 1502036477, 1502392277, 1502410817,
1502428157, 1502429957, 1503492618, 1503500417, 1503507318, 1503672677,
1503674057, 1503674370, 1503675077, 1503923478, 1503928037, 1503932777,
1503943037, 1503972019, 1503989537, 1504383497, 1504421837, 1504639337,
1504656977, 1504672937, 1504682418, 1504722677, 1506766878, 1507180518,
1507184597, 1507228877, 1507229657, 1507370717, 1508326217, 1508343977,
1508357297, 1508374397, 1508492838, 1508555177, 1508560158, 1508868737,
1509231244, 1509252184, 1509845644, 1510709818), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), week = c(2, 2, 2, 3, 3, 4, 5, 5, 6,
6, 7, 7, 7, 8, 8, 8, 8, 9, 10, 11, 11, 13, 15, 15, 15, 15, 16,
17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 20, 21, 21, 22, 22,
22, 22, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 25, 25, 25, 25,
25, 25, 27, 27, 29, 30, 32, 32, 32, 32, 32, 34, 34, 34, 34, 34,
34, 34, 35, 35, 35, 35, 35, 35, 35, 36, 36, 36, 36, 36, 36, 39,
40, 40, 40, 40, 40, 42, 42, 42, 42, 42, 42, 42, 43, 43, 44, 45,
46)), class = "data.frame", row.names = c(NA, -108L), .Names = c("Local Start Time",
"week"))
How can I include array or data frame output in a message, warning or error?
By default, the output is collapsed by deparseing each column, which isn't useful. Here's an example, using the cars dataset.
message(cars)
## c(4, 4, 7, 7, 8, 9, 10, 10, 10, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 16, 16, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 20, 20, 20, 20, 20, 22, 23, 24, 24, 24, 24, 25)c(2, 10, 4, 22, 16, 10, 18, 26, 34, 17, 28, 14, 20, 24, 28, 26, 34, 34, 46, 26, 36, 60, 80, 20, 26, 54, 32, 40, 32, 40, 50, 42, 56, 76, 84, 36, 46, 68, 32, 48, 52, 56, 64, 66, 54, 70, 92, 93, 120, 85)
Print the output, recapture it using capture.output(), and collapse into a single string separated by newlines.
print_and_capture <- function(x)
{
paste(capture.output(print(x)), collapse = "\n")
}
message(print_and_capture(cars))
## speed dist
## 1 4 2
## 2 4 10
## # etc.
stop("An error was found in the cars dataset:\n", print_and_capture(cars))
## Error: An error was found in the cars dataset:
## speed dist
## 1 4 2
## 2 4 10
## # etc.
print_and_capture() is now available in assertive.base.