I have a data like this
df<- structure(list(sname = structure(2:1, .Label = c("Carrot", "Melon"
), class = "factor"), sence = structure(1:2, .Label = c("RSNSNASSAVSTSCVSNRAMKGTTHYDTS",
"TGMRHGGMVSVCMCVVDDNRRRHYNGAYDDHHRGGVCTS"), class = "factor")), class = "data.frame", row.names = c(NA,
-2L))
Lets look at the first row
Melon RSNSNASSAVSTSCVSNRAMKGTTHYDTS
I want to be able to chop the strings into different windows as well as moving in different pattern. for example lets say moving 1 letter at the time and windows of 10. so The first output will be like this
RSNSNASSAV
So this one is letter 1 ,2,3,4,5,6,7,8,9,10
The second one will be moving 1 letter forward and then chop for 10 letters
SNSNASSAVS
so this is letter 2,3,4,5,6,7,8,9,10,11
it goes until the end.
a requested output is like the following
output<- structure(list(position = structure(c(33L, 1L, 12L, 23L, 26L,
27L, 28L, 29L, 30L, 31L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
11L, 13L, 32L, 1L, 12L, 23L, 26L, 27L, 28L, 29L, 30L, 31L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 13L, 14L, 15L, 16L, 17L,
18L, 19L, 20L, 21L, 22L, 24L, 25L), .Label = c("1,2,3,4,5,6,7,8,9,10",
"10,11,12,13,14,15,16,17,18,19", "11,12,13,14,15,16,17,18,20",
"12,13,14,15,16,17,18,19,20,21", "13,14,15,16,17,18,19,20,21,22",
"14,15,16,17,18,19,20,21,22,23", "15,16,17,18,19,20,21,22,23,24",
"16,17,18,19,20,21,22,23,24,25", "17,18,19,20,21,22,23,24,25,26",
"18,19,20,21,22,23,24,25,26,27", "19,20,21,22,23,24,25,26,27,28",
"2,3,4,5,6,7,8,9,10,11", "20,21,22,23,24,25,26,27,28,29", "21,22,23,24,25,26,27,28,29,30",
"22,23,24,25,26,27,28,29,30,31", "23,24,25,26,27,28,29,30,31,32",
"24,25,26,27,28,29,30,31,32,33", "25,26,27,28,29,30,31,32,33,34",
"26,27,28,29,30,31,32,33,34,35", "27,28,29,30,31,32,33,34,35,36",
"28,29,30,31,32,33,34,35,36,37", "29,30,31,32,33,34,35,36,37,38",
"3,4,5,6,7,8,9,10,11,12", "30,31,32,33,34,35,36,37,38,39", "31,32,33,34,35,36,37,38,39,40",
"4,5,6,7,8,9,10,11,12,13", "5,6,7,8,9,10,11,12,13,14", "6,7,8,9,10,11,12,14,15",
"7,8,9,10,11,12,13,14,15,16", "8,9,10,11,12,13,14,15,16,17",
"9,10,11,12,13,14,15,16,17,18", "Carrot", "Melon"), class = "factor"),
name = structure(c(20L, 32L, 37L, 26L, 35L, 35L, 2L, 38L,
33L, 3L, 46L, 39L, 42L, 34L, 7L, 45L, 36L, 24L, 27L, 1L,
21L, 5L, 41L, 15L, 22L, 28L, 17L, 14L, 16L, 23L, 47L, 40L,
43L, 6L, 19L, 8L, 19L, 8L, 48L, 44L, 10L, 12L, 25L, 31L,
30L, 29L, 18L, 50L, 13L, 4L, 49L, 9L, 11L), .Label = c("AMKGTTHYDT",
"ASSAVSTSCV", "AVSTSCVSNR", "AYDDHHRGGV", "Carrot", "CMCVVDDNRR",
"CVSNRAMKGT", "CVVDDNRRRH", "DDHHRGGVCT", "DDNRRRHYNG", "DHHRGGVCTS",
"DNRRRHYNGA", "GAYDDHHRGG", "GGMVSVCMCV", "GMRHGGMVSV", "GMVSVCMCVV",
"HGGMVSVCMC", "HYNGAYDDHH", "MCVVDDNRRR", "Melon", "MKGTTHYDTS",
"MRHGGMVSVC", "MVSVCMCVVD", "NRAMKGTTHY", "NRRRHYNGAY", "NSNASSAVST",
"RAMKGTTHYD", "RHGGMVSVCM", "RHYNGAYDDH", "RRHYNGAYDD", "RRRHYNGAYD",
"RSNSNASSAV", "SAVSTSCVSN", "SCVSNRAMKG", "SNASSAVSTS", "SNRAMKGTTH",
"SNSNASSAVS", "SSAVSTSCVS", "STSCVSNRAM", "SVCMCVVDDN", "TGMRHGGMVS",
"TSCVSNRAMK", "VCMCVVDDNR", "VDDNRRRHYN", "VSNRAMKGTT", "VSTSCVSNRA",
"VSVCMCVVDD", "VVDDNRRRHY", "YDDHHRGGVC", "YNGAYDDHHR"), class = "factor")), class = "data.frame", row.names = c(NA,
-53L))
Split with 2
RSNSNASSAV
NSNASSAVST
NASSAVSTSC
SSAVSTSCVS
AVSTSCVSNR
STSCVSNRAM
SCVSNRAMKG
VSNRAMKGTT
NRAMKGTTHY
AMKGTTHYDT
KGTTHYDTS
We convert the factor columns to character, then transmute to createa tibble of 'position', 'name' by looping over the rows with map, create substrings based on the the split width 'n' and the number of character (nchar) of 'sence', concatenate the 'sname' as the first element and unnest the list output to create a two column dataset
library(tidyverse)
f1 <- function(dat, n, mv = 1) {
dat %>%
mutate_all(as.character) %>%
transmute(out = map2(sence, sname, ~ {
i1 <- seq_len(nchar(.x) - (n -1))
i11 <- seq(i1[1], i1[length(i1)], by = mv)
i2 <- n:nchar(.x)
i22 <- seq(i2[1], i2[length(i2)], by = mv)
tibble(position = c(.y, map2_chr(i11, i22, ~
str_c(seq(.x, .y), collapse=","))),
name = c(.y, substring(.x, i11, i22)))
})) %>%
unnest
}
-testing
- moving window - 1
f1(df, n = 10, mv = 1)
# position name
#1 Melon Melon
#2 1,2,3,4,5,6,7,8,9,10 RSNSNASSAV
#3 2,3,4,5,6,7,8,9,10,11 SNSNASSAVS
#4 3,4,5,6,7,8,9,10,11,12 NSNASSAVST
#5 4,5,6,7,8,9,10,11,12,13 SNASSAVSTS
#6 5,6,7,8,9,10,11,12,13,14 NASSAVSTSC
#7 6,7,8,9,10,11,12,13,14,15 ASSAVSTSCV
#8 7,8,9,10,11,12,13,14,15,16 SSAVSTSCVS
#9 8,9,10,11,12,13,14,15,16,17 SAVSTSCVSN
#10 9,10,11,12,13,14,15,16,17,18 AVSTSCVSNR
#11 10,11,12,13,14,15,16,17,18,19 VSTSCVSNRA
#12 11,12,13,14,15,16,17,18,19,20 STSCVSNRAM
#13 12,13,14,15,16,17,18,19,20,21 TSCVSNRAMK
#14 13,14,15,16,17,18,19,20,21,22 SCVSNRAMKG
#15 14,15,16,17,18,19,20,21,22,23 CVSNRAMKGT
#16 15,16,17,18,19,20,21,22,23,24 VSNRAMKGTT
#17 16,17,18,19,20,21,22,23,24,25 SNRAMKGTTH
#18 17,18,19,20,21,22,23,24,25,26 NRAMKGTTHY
#19 18,19,20,21,22,23,24,25,26,27 RAMKGTTHYD
#20 19,20,21,22,23,24,25,26,27,28 AMKGTTHYDT
#21 20,21,22,23,24,25,26,27,28,29 MKGTTHYDTS
#22 Carrot Carrot
#23 1,2,3,4,5,6,7,8,9,10 TGMRHGGMVS
#24 2,3,4,5,6,7,8,9,10,11 GMRHGGMVSV
#25 3,4,5,6,7,8,9,10,11,12 MRHGGMVSVC
#26 4,5,6,7,8,9,10,11,12,13 RHGGMVSVCM
#27 5,6,7,8,9,10,11,12,13,14 HGGMVSVCMC
#28 6,7,8,9,10,11,12,13,14,15 GGMVSVCMCV
#29 7,8,9,10,11,12,13,14,15,16 GMVSVCMCVV
#30 8,9,10,11,12,13,14,15,16,17 MVSVCMCVVD
#31 9,10,11,12,13,14,15,16,17,18 VSVCMCVVDD
#32 10,11,12,13,14,15,16,17,18,19 SVCMCVVDDN
#33 11,12,13,14,15,16,17,18,19,20 VCMCVVDDNR
#34 12,13,14,15,16,17,18,19,20,21 CMCVVDDNRR
#35 13,14,15,16,17,18,19,20,21,22 MCVVDDNRRR
#36 14,15,16,17,18,19,20,21,22,23 CVVDDNRRRH
#37 15,16,17,18,19,20,21,22,23,24 VVDDNRRRHY
#38 16,17,18,19,20,21,22,23,24,25 VDDNRRRHYN
#39 17,18,19,20,21,22,23,24,25,26 DDNRRRHYNG
#40 18,19,20,21,22,23,24,25,26,27 DNRRRHYNGA
#41 19,20,21,22,23,24,25,26,27,28 NRRRHYNGAY
#42 20,21,22,23,24,25,26,27,28,29 RRRHYNGAYD
#43 21,22,23,24,25,26,27,28,29,30 RRHYNGAYDD
#44 22,23,24,25,26,27,28,29,30,31 RHYNGAYDDH
#45 23,24,25,26,27,28,29,30,31,32 HYNGAYDDHH
#46 24,25,26,27,28,29,30,31,32,33 YNGAYDDHHR
#47 25,26,27,28,29,30,31,32,33,34 NGAYDDHHRG
#48 26,27,28,29,30,31,32,33,34,35 GAYDDHHRGG
#49 27,28,29,30,31,32,33,34,35,36 AYDDHHRGGV
#50 28,29,30,31,32,33,34,35,36,37 YDDHHRGGVC
#51 29,30,31,32,33,34,35,36,37,38 DDHHRGGVCT
#52 30,31,32,33,34,35,36,37,38,39 DHHRGGVCTS
-moving window - 2
f1(df, n = 10, mv = 2) %>%
head
# position name
#1 Melon Melon
#2 1,2,3,4,5,6,7,8,9,10 RSNSNASSAV
#3 3,4,5,6,7,8,9,10,11,12 NSNASSAVST
#4 5,6,7,8,9,10,11,12,13,14 NASSAVSTSC
#5 7,8,9,10,11,12,13,14,15,16 SSAVSTSCVS
#6 9,10,11,12,13,14,15,16,17,18 AVSTSCVSNR
-moving window - 3
f1(df, n = 10, mv = 3) %>%
head
# position name
#1 Melon Melon
#2 1,2,3,4,5,6,7,8,9,10 RSNSNASSAV
#3 4,5,6,7,8,9,10,11,12,13 SNASSAVSTS
#4 7,8,9,10,11,12,13,14,15,16 SSAVSTSCVS
#5 10,11,12,13,14,15,16,17,18,19 VSTSCVSNRA
#6 13,14,15,16,17,18,19,20,21,22 SCVSNRAMKG
-moving window - 4
f1(df, n = 10, mv = 4) %>%
head
# position name
#1 Melon Melon
#2 1,2,3,4,5,6,7,8,9,10 RSNSNASSAV
#3 5,6,7,8,9,10,11,12,13,14 NASSAVSTSC
#4 9,10,11,12,13,14,15,16,17,18 AVSTSCVSNR
#5 13,14,15,16,17,18,19,20,21,22 SCVSNRAMKG
#6 17,18,19,20,21,22,23,24,25,26 NRAMKGTTHY
library('tidyverse')
# use this function to make the blocks:
make_substrings = function(string, len, label){
# set up the indices
str_len = nchar(string)
indices1 = 1:(str_len-len+1)
indices2 = (len:str_len)
# create the list of indices
position = map2_chr(indices1, indices2, .f = function(x, y){paste(x:y, collapse = ', ')})
# take substrings
name = map2_chr(indices1, indices2, .f = substr, x = string)
# add yoru food labels
header = tibble(position = label,
name = label)
header %>%
bind_rows(tibble(position,
name))
}
# your version had factors
df = df %>%
mutate_all(as.character)
# iterate over all the rows of df:
output = Map(f = make_substrings, string = df$sence, len = 10, label = df$sname) %>%
bind_rows
I got a list of nodes, and I need to randomly assign 'p' hubs to 'n' clients.
I got the following data, where the first row shows:
The total number of nodes.
The requested number of hubs.
The total supply capacity for each hub.
The following lines show:
The first column the node number.
The second column the "x" coordinate.
The third the "y" coordinate.
Below I will show the raw data, adding colnames() it would look something like this:
total_nodes hubs_required total_capacity
50 5 120
node number x_coordinate y_coordinate node_demand
1 2 62 3
2 80 25 14
3 36 88 1
4 57 23 14
. . . .
. . . .
. . . .
50 1 58 2
The x and y values are provided so we can calculate the Euclidean distance.
nodes:
50 5 120
1 2 62 3
2 80 25 14
3 36 88 1
4 57 23 14
5 33 17 19
6 76 43 2
7 77 85 14
8 94 6 6
9 89 11 7
10 59 72 6
11 39 82 10
12 87 24 18
13 44 76 3
14 2 83 6
15 19 43 20
16 5 27 4
17 58 72 14
18 14 50 11
19 43 18 19
20 87 7 15
21 11 56 15
22 31 16 4
23 51 94 13
24 55 13 13
25 84 57 5
26 12 2 16
27 53 33 3
28 53 10 7
29 33 32 14
30 69 67 17
31 43 5 3
32 10 75 3
33 8 26 12
34 3 1 14
35 96 22 20
36 6 48 13
37 59 22 10
38 66 69 9
39 22 50 6
40 75 21 18
41 4 81 7
42 41 97 20
43 92 34 9
44 12 64 1
45 60 84 8
46 35 100 5
47 38 2 1
48 9 9 7
49 54 59 9
50 1 58 2
I extracted the information from the first line.
nodes <- as.matrix(read.table(data))
header<-colnames(nodes)
clean_header <-gsub('X','',header)
requested_hubs <- as.numeric(clean_header[2])
max_supply_capacity <- as.numeric(clean_header[3])
I need to randomly select 5 nodes, that will act as hubs
set.seed(37)
node_to_hub <-nodes[sample(nrow(nodes),requested_hubs,replace = FALSE),]
Then randomly I need to assign nodes to each hub calculate the distances between the hub and each one of the nodes and when the max_supply_capacity(120) is exceeded select the following hub and repeat the process.
After the final iteration I need to return the cumulative sum of distances for all the hubs.
I need to repeat this process 100 times and return the min() value of the cumulative sum of distances.
This is where I'm completely stuck since I'm not sure how to loop through a matrix let alone when I have to select elements randomly.
I got the following elements:
capacity <- c(numeric()) # needs to be <= to 120
distance_sum <- c(numeric())
global_hub_distance <- c(numeric())
The formula for the euclidean distance (rounded) would be as below but I'm not sure how I can reflect the random selection when assigning nodes.
distance <-round(sqrt(((node_to_hub[i,2]-nodes[i,2]))^2+(node_to_hub[random,3]-nodes[random,3])^2))
The idea for the loop I think I need is below, but as I mentioned before I don't know how to deal with the sample client selection, and the distance calculation of the random clients.
for(i in 1:100){
node_to_hub
for(i in 1:nrow(node_to_hub){
#Should I randomly sample the clients here???
while(capacity < 120){
node_demand <- nodes[**random**,3]
distance <-round(sqrt(((node_to_hub[i,2]-nodes[i,2]))^2+(node_to_hub[**random**,3]-nodes[**random**,3])^2))
capacity <-c(capacity, node_demand)
distance_sum <- c(distance_sum,distance)
}
global_hub_distance <- c(global_hub_distance,distance_sum)
capacity <- 0
distance_sum <- 0
}
min(global_hub_distance)
}
Not EXACTLY sure what you are looking for but this code may be able to help you. It's not extremely fast, as instead of using a while to stop after hitting your total_capacity it just does a cumsum on the full node list and find the place where you exceed 120.
nodes <- structure(list(node_number = 1:50,
x = c(2L, 80L, 36L, 57L, 33L, 76L, 77L, 94L,
89L, 59L, 39L, 87L, 44L, 2L, 19L, 5L,
58L, 14L, 43L, 87L, 11L, 31L, 51L, 55L,
84L, 12L, 53L, 53L, 33L, 69L, 43L, 10L,
8L, 3L, 96L, 6L, 59L, 66L, 22L, 75L, 4L,
41L, 92L, 12L, 60L, 35L, 38L, 9L, 54L, 1L),
y = c(62L, 25L, 88L, 23L, 17L, 43L, 85L, 6L, 11L,
72L, 82L, 24L, 76L, 83L, 43L, 27L, 72L, 50L,
18L, 7L, 56L, 16L, 94L, 13L, 57L, 2L, 33L, 10L,
32L, 67L, 5L, 75L, 26L, 1L, 22L, 48L, 22L, 69L,
50L, 21L, 81L, 97L, 34L, 64L, 84L, 100L, 2L, 9L, 59L, 58L),
node_demand = c(3L, 14L, 1L, 14L, 19L, 2L, 14L, 6L,
7L, 6L, 10L, 18L, 3L, 6L, 20L, 4L,
14L, 11L, 19L, 15L, 15L, 4L, 13L,
13L, 5L, 16L, 3L, 7L, 14L, 17L,
3L, 3L, 12L, 14L, 20L, 13L, 10L,
9L, 6L, 18L, 7L, 20L, 9L, 1L, 8L,
5L, 1L, 7L, 9L, 2L)),
.Names = c("node_number", "x", "y", "node_demand"),
class = "data.frame", row.names = c(NA, -50L))
total_nodes = nrow(nodes)
hubs_required = 5
total_capacity = 120
iterations <- 100
track_sums <- matrix(NA, nrow = iterations, ncol = hubs_required)
colnames(track_sums) <- paste0("demand_at_hub",1:hubs_required)
And then I prefer using a function for distance, in this case A and B are 2 separate vectors with c(x,y) and c(x,y).
euc.dist <- function(A, B) round(sqrt(sum((A - B) ^ 2))) # distances
The Loop:
for(i in 1:iterations){
# random hub selection
hubs <- nodes[sample(1:total_nodes, hubs_required, replace = FALSE),]
for(h in 1:hubs_required){
# sample the nodes into a random order
random_nodes <- nodes[sample(1:nrow(nodes), size = nrow(nodes), replace = FALSE),]
# cumulative sum their demand, and get which number passes 120,
# and subtract 1 to get the node before that
last <- which(cumsum(random_nodes$node_demand) > total_capacity) [1] - 1
# get sum of all distances to those nodes (1 though the last)
all_distances <- apply(random_nodes[1:last,], 1, function(rn) {
euc.dist(A = hubs[h,c("x","y")],
B = rn[c("x","y")])
})
track_sums[i,h] <- sum(all_distances)
}
}
min(rowSums(track_sums))
EDIT
as a function:
hubnode <- function(nodes, hubs_required = 5, total_capacity = 120, iterations = 10){
# initialize results matrices
track_sums <- node_count <- matrix(NA, nrow = iterations, ncol = hubs_required)
colnames(track_sums) <- paste0("demand_at_hub",1:hubs_required)
colnames(node_count) <- paste0("nodes_at_hub",1:hubs_required)
# user defined distance function (only exists wihtin hubnode() function)
euc.dist <- function(A, B) round(sqrt(sum((A - B) ^ 2)))
for(i in 1:iterations){
# random hub selection
assigned_hubs <- sample(1:nrow(nodes), hubs_required, replace = FALSE)
hubs <- nodes[assigned_hubs,]
assigned_nodes <- NULL
for(h in 1:hubs_required){
# sample the nodes into a random order
assigned_nodes <- sample((1:nrow(nodes))[-assigned_hubs], replace = FALSE)
random_nodes <- nodes[assigned_nodes,]
# cumulative sum their demand, and get which number passes 120,
# and subtract 1 to get the node before that
last <- which(cumsum(random_nodes$node_demand) > total_capacity) [1] - 1
# if there are none
if(is.na(last)) last = nrow(random_nodes)
node_count[i,h] <- last
# get sum of all distances to those nodes (1 though the last)
all_distances <- apply(random_nodes[1:last,], 1, function(rn) {
euc.dist(A = hubs[h,c("x","y")],
B = rn[c("x","y")])
})
track_sums[i,h] <- sum(all_distances)
}
}
return(list(track_sums = track_sums, node_count = node_count))
}
output <- hubnode(nodes, iterations = 100)
node_count <- output$node_count
track_sums <- output$track_sums
plot(rowSums(node_count),
rowSums(track_sums), xlab = "Node Count", ylab = "Total Demand", main = paste("Result of", 100, "iterations"))
min(rowSums(track_sums))