Identify periods of high counts in time series data

Using the example dataframe:
count.bouts <-structure(list(time.stamp = structure(c(1L, 2L, 3L, 4L, 5L, 6L,
7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L,
20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 28L, 29L, 30L, 31L,
32L, 33L, 34L, 35L, 36L, 37L), .Label = c("13:00:00", "13:00:10",
"13:00:20", "13:00:30", "13:00:40", "13:00:50", "13:01:00", "13:01:10",
"13:01:20", "13:01:30", "13:01:40", "13:01:50", "13:02:00", "13:02:10",
"13:02:20", "13:02:30", "13:02:40", "13:02:50", "13:03:00", "13:03:10",
"13:03:20", "13:03:30", "13:03:40", "13:03:50", "13:04:00", "13:04:10",
"13:04:20", "13:04:30", "13:04:40", "13:04:50", "13:05:00", "13:05:10",
"13:05:20", "13:05:30", "13:05:40", "13:05:50", "13:06:00"), class = "factor"),
count = c(5L, 11L, 16L, 19L, 15L, 11L, 8L, 5L, 2L, 6L, 12L,
15L, 20L, 12L, 6L, 2L, 18L, 25L, 26L, 15L, 13L, 6L, 5L, 4L,
8L, 9L, 16L, 26L, 29L, 55L, 21L, 6L, 9L, 28L, 16L, 19L, 26L,
5L)), .Names = c("time.stamp", "count"), class = "data.frame", row.names = c(NA,
-38L))
I wish to create a function that identifies bouts of high-count activity that fulfil the following criteria:
Count data that is greater than or equal to 10 for 1 minute or more.
Within this period (or bout) of high counts, the count data may drop below 10 for a maximum of 20 seconds (within the bout).
Data that fulfil these criteria should be highlighted in the dataset by adding an extra column (called "1min+.bouts") to the data frame. Each bout would then be identified with a number starting from 1, i.e. the data frame described above would have a series of 1s for the first bout (13:01:40 to 13:03:20) and then 2s for the second bout (13:04:20 to 13:05:50). 0s would be added to rows with no bouts.
I hope that makes sense. If anyone could possibly point me in the right direction regarding packages or functions that would help, I should be most grateful.

This assumes that there are no NA values:
library(zoo)
# which counts are >= 10
tmp <- count.bouts$count >= 10
# substitute FALSE with NA, so we can use na.approx for interpolation
tmp[!tmp] <- NA
# fill gaps of up to two values (i.e. up to 20 seconds)
tmp <- na.approx(tmp, method = "constant", maxgap = 2, na.rm = FALSE)
# NA --> 0
tmp[is.na(tmp)] <- 0
# run lengths
tmp <- rle(tmp)
# we don't want runs shorter than one minute (6 ten-second samples)
tmp$values[tmp$lengths < 6] <- 0
# number the runs we are interested in
tmp$values <- cumsum(tmp$values) * tmp$values
# inverse run length encoding
count.bouts$bout <- inverse.rle(tmp)
# time.stamp count bout
#1 13:00:00 5 0
#2 13:00:10 11 0
#3 13:00:20 16 0
#4 13:00:30 19 0
#5 13:00:40 15 0
#6 13:00:50 11 0
#7 13:01:00 8 0
#8 13:01:10 5 0
#9 13:01:20 2 0
#10 13:01:30 6 0
#11 13:01:40 12 1
#12 13:01:50 15 1
#13 13:02:00 20 1
#14 13:02:10 12 1
#15 13:02:20 6 1
#16 13:02:30 2 1
#17 13:02:40 18 1
#18 13:02:50 25 1
#19 13:03:00 26 1
#20 13:03:10 15 1
#21 13:03:20 13 1
#22 13:03:30 6 0
#23 13:03:40 5 0
#24 13:03:50 4 0
#25 13:04:00 8 0
#26 13:04:10 9 0
#27 13:04:20 16 2
#28 13:04:30 26 2
#29 13:04:30 29 2
#30 13:04:40 55 2
#31 13:04:50 21 2
#32 13:05:00 6 2
#33 13:05:10 9 2
#34 13:05:20 28 2
#35 13:05:30 16 2
#36 13:05:40 19 2
#37 13:05:50 26 2
#38 13:06:00 5 0
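For reuse, the steps above can be wrapped into a small function. A minimal sketch (find.bouts is a hypothetical name; the threshold, the maximum gap, and the minimum run length in samples become arguments):
library(zoo)
# identify numbered bouts: runs of counts >= threshold lasting at least
# min.len samples, allowing dips below the threshold for up to max.gap samples
find.bouts <- function(counts, threshold = 10, max.gap = 2, min.len = 6) {
  tmp <- counts >= threshold
  tmp[!tmp] <- NA
  tmp <- na.approx(tmp, method = "constant", maxgap = max.gap, na.rm = FALSE)
  tmp[is.na(tmp)] <- 0
  tmp <- rle(tmp)
  tmp$values[tmp$lengths < min.len] <- 0
  tmp$values <- cumsum(tmp$values) * tmp$values
  inverse.rle(tmp)
}
count.bouts$bout <- find.bouts(count.bouts$count)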

tidyverse replace NA by other data frame values under condition

I have a first, incomplete dataset data_incom and a second one, data_to_com, containing the missing values of the first. Using mutate(UG = case_when(INSEE == "07185" ~ 6, etc.)) overwrites the "UG" column. How can I replace the NAs in the first dataset with the values from the second table using tidyverse tools?
Thank you!
data_incom <- structure(list(INSEE = c("07005", "07005", "07010", "07011",
"07011", "07012", "07019", "07025", "07026", "07032", "07033",
"07042", "07064", "07066", "07068", "07069", "07075", "07088",
"07096", "07099", "07101", "07101", "07105", "07105", "07107",
"07110", "07117", "07117", "07119", "07128", "07129", "07131",
"07144", "07153", "07154", "07159", "07161", "07161", "07168",
"07172", "07173", "07185", "07186", "07202", "07204", "07228",
"07232", "07240", "07261", "07265", "07273", "07279", "07284",
"07286", "07294", "07301", "07315", "07329", "07330", "07331",
"07338", "07338", "07347", "07187", "07265", "07334", "07262"
), UG = c(NA, NA, 2L, NA, NA, 10L, 13L, 28L, 26L, 15L, 21L, 19L,
11L, 16L, 8L, 6L, 26L, 25L, 11L, 18L, 21L, 21L, 26L, 26L, 24L,
25L, 25L, 25L, NA, 3L, 8L, 22L, 24L, NA, 28L, NA, 28L, 28L, 21L,
1L, 12L, NA, 15L, 24L, 7L, 1L, 24L, 9L, 9L, 2L, 18L, 19L, NA,
11L, 21L, 6L, NA, 24L, 18L, 28L, 8L, 8L, 3L, 24L, 2L, 20L, 24L
)), row.names = c(NA, -67L), class = "data.frame")
data_to_com <- structure(list(INSEE = c("07185", "07284", "07315", "07153", "07119", "07159", "070005"),
UG = c(6L, 20L, 24L, 28L, 26L, 15L, 17L)), row.names = c(NA, 7L), class = "data.frame")
You can use the following solution. There are some INSEE values in the first data set that weren't present in the second data set and I just left them as NA values.
library(dplyr)
library(tidyr)
data_incom %>%
  filter(is.na(UG)) %>%
  rowwise() %>%
  mutate(UG = list(data_to_com$UG[grepl(INSEE, data_to_com$INSEE)])) %>%
  unnest(cols = c(UG)) -> data_com

data_com %>%
  bind_rows(data_incom %>%
              filter(!INSEE %in% data_com$INSEE)) %>%
  arrange(INSEE)
# A tibble: 67 x 2
   INSEE    UG
   <chr> <int>
 1 07005    NA
 2 07005    NA
 3 07010     2
 4 07011    NA
 5 07011    NA
 6 07012    10
 7 07019    13
 8 07025    28
 9 07026    26
10 07032    15
# ... with 57 more rows
You can use coalesce in this kind of scenario:
Using left_join will include all rows from data_incom.
Use coalesce thereafter to fill in the NAs.
Further, use .keep = 'unused' in the mutate call to retain only the wanted columns.
library(dplyr)
data_incom %>%
  left_join(data_to_com, by = 'INSEE') %>%
  mutate(UG = coalesce(UG.x, UG.y), .keep = 'unused')
INSEE UG
1 07005 NA
2 07005 NA
3 07010 2
4 07011 NA
5 07011 NA
6 07012 10
7 07019 13
8 07025 28
9 07026 26
10 07032 15
11 07033 21
12 07042 19
13 07064 11
14 07066 16
15 07068 8
16 07069 6
17 07075 26
18 07088 25
19 07096 11
20 07099 18
21 07101 21
22 07101 21
23 07105 26
24 07105 26
25 07107 24
26 07110 25
27 07117 25
28 07117 25
29 07119 26
30 07128 3
31 07129 8
32 07131 22
33 07144 24
34 07153 28
35 07154 28
36 07159 15
37 07161 28
38 07161 28
39 07168 21
40 07172 1
41 07173 12
42 07185 6
43 07186 15
44 07202 24
45 07204 7
46 07228 1
47 07232 24
48 07240 9
49 07261 9
50 07265 2
51 07273 18
52 07279 19
53 07284 20
54 07286 11
55 07294 21
56 07301 6
57 07315 24
58 07329 24
59 07330 18
60 07331 28
61 07338 8
62 07338 8
63 07347 3
64 07187 24
65 07265 2
66 07334 20
67 07262 24
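If your dplyr is recent enough (the rows_* verbs arrived in 1.0.0; the unmatched argument in 1.1.0), rows_patch() expresses the same NA-filling directly. A sketch under those version assumptions:
library(dplyr)
# rows_patch() overwrites only the NA values of UG in data_incom with the
# matching values from data_to_com; unmatched = "ignore" skips keys such as
# "070005" that have no counterpart in data_incom
rows_patch(data_incom, data_to_com, by = "INSEE", unmatched = "ignore")
The "07005" rows stay NA either way, because the second table only carries the six-digit key "070005".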

How can I chop strings in a consecutive region with various moving windows

I have data like this:
df<- structure(list(sname = structure(2:1, .Label = c("Carrot", "Melon"
), class = "factor"), sence = structure(1:2, .Label = c("RSNSNASSAVSTSCVSNRAMKGTTHYDTS",
"TGMRHGGMVSVCMCVVDDNRRRHYNGAYDDHHRGGVCTS"), class = "factor")), class = "data.frame", row.names = c(NA,
-2L))
Let's look at the first row:
Melon RSNSNASSAVSTSCVSNRAMKGTTHYDTS
I want to be able to chop the strings into windows of different sizes, moving forward by different step sizes. For example, let's say we move 1 letter at a time with windows of 10. The first output will look like this:
RSNSNASSAV
So this one is letters 1,2,3,4,5,6,7,8,9,10.
The second one moves 1 letter forward and then chops out 10 letters:
SNSNASSAVS
So this is letters 2,3,4,5,6,7,8,9,10,11.
It goes on like this until the end.
The requested output looks like the following:
output<- structure(list(position = structure(c(33L, 1L, 12L, 23L, 26L,
27L, 28L, 29L, 30L, 31L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
11L, 13L, 32L, 1L, 12L, 23L, 26L, 27L, 28L, 29L, 30L, 31L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 13L, 14L, 15L, 16L, 17L,
18L, 19L, 20L, 21L, 22L, 24L, 25L), .Label = c("1,2,3,4,5,6,7,8,9,10",
"10,11,12,13,14,15,16,17,18,19", "11,12,13,14,15,16,17,18,20",
"12,13,14,15,16,17,18,19,20,21", "13,14,15,16,17,18,19,20,21,22",
"14,15,16,17,18,19,20,21,22,23", "15,16,17,18,19,20,21,22,23,24",
"16,17,18,19,20,21,22,23,24,25", "17,18,19,20,21,22,23,24,25,26",
"18,19,20,21,22,23,24,25,26,27", "19,20,21,22,23,24,25,26,27,28",
"2,3,4,5,6,7,8,9,10,11", "20,21,22,23,24,25,26,27,28,29", "21,22,23,24,25,26,27,28,29,30",
"22,23,24,25,26,27,28,29,30,31", "23,24,25,26,27,28,29,30,31,32",
"24,25,26,27,28,29,30,31,32,33", "25,26,27,28,29,30,31,32,33,34",
"26,27,28,29,30,31,32,33,34,35", "27,28,29,30,31,32,33,34,35,36",
"28,29,30,31,32,33,34,35,36,37", "29,30,31,32,33,34,35,36,37,38",
"3,4,5,6,7,8,9,10,11,12", "30,31,32,33,34,35,36,37,38,39", "31,32,33,34,35,36,37,38,39,40",
"4,5,6,7,8,9,10,11,12,13", "5,6,7,8,9,10,11,12,13,14", "6,7,8,9,10,11,12,14,15",
"7,8,9,10,11,12,13,14,15,16", "8,9,10,11,12,13,14,15,16,17",
"9,10,11,12,13,14,15,16,17,18", "Carrot", "Melon"), class = "factor"),
name = structure(c(20L, 32L, 37L, 26L, 35L, 35L, 2L, 38L,
33L, 3L, 46L, 39L, 42L, 34L, 7L, 45L, 36L, 24L, 27L, 1L,
21L, 5L, 41L, 15L, 22L, 28L, 17L, 14L, 16L, 23L, 47L, 40L,
43L, 6L, 19L, 8L, 19L, 8L, 48L, 44L, 10L, 12L, 25L, 31L,
30L, 29L, 18L, 50L, 13L, 4L, 49L, 9L, 11L), .Label = c("AMKGTTHYDT",
"ASSAVSTSCV", "AVSTSCVSNR", "AYDDHHRGGV", "Carrot", "CMCVVDDNRR",
"CVSNRAMKGT", "CVVDDNRRRH", "DDHHRGGVCT", "DDNRRRHYNG", "DHHRGGVCTS",
"DNRRRHYNGA", "GAYDDHHRGG", "GGMVSVCMCV", "GMRHGGMVSV", "GMVSVCMCVV",
"HGGMVSVCMC", "HYNGAYDDHH", "MCVVDDNRRR", "Melon", "MKGTTHYDTS",
"MRHGGMVSVC", "MVSVCMCVVD", "NRAMKGTTHY", "NRRRHYNGAY", "NSNASSAVST",
"RAMKGTTHYD", "RHGGMVSVCM", "RHYNGAYDDH", "RRHYNGAYDD", "RRRHYNGAYD",
"RSNSNASSAV", "SAVSTSCVSN", "SCVSNRAMKG", "SNASSAVSTS", "SNRAMKGTTH",
"SNSNASSAVS", "SSAVSTSCVS", "STSCVSNRAM", "SVCMCVVDDN", "TGMRHGGMVS",
"TSCVSNRAMK", "VCMCVVDDNR", "VDDNRRRHYN", "VSNRAMKGTT", "VSTSCVSNRA",
"VSVCMCVVDD", "VVDDNRRRHY", "YDDHHRGGVC", "YNGAYDDHHR"), class = "factor")), class = "data.frame", row.names = c(NA,
-53L))
With a step of 2, the output would be:
RSNSNASSAV
NSNASSAVST
NASSAVSTSC
SSAVSTSCVS
AVSTSCVSNR
STSCVSNRAM
SCVSNRAMKG
VSNRAMKGTT
NRAMKGTTHY
AMKGTTHYDT
KGTTHYDTS
We convert the factor columns to character, then use transmute to create a tibble of 'position' and 'name' by looping over the rows with map2, create substrings based on the split width 'n' and the number of characters (nchar) of 'sence', concatenate the 'sname' as the first element, and unnest the list output to create a two-column dataset:
library(tidyverse)
f1 <- function(dat, n, mv = 1) {
  dat %>%
    mutate_all(as.character) %>%
    transmute(out = map2(sence, sname, ~ {
      # start positions of each window, stepping by 'mv'
      i1 <- seq_len(nchar(.x) - (n - 1))
      i11 <- seq(i1[1], i1[length(i1)], by = mv)
      # corresponding end positions
      i2 <- n:nchar(.x)
      i22 <- seq(i2[1], i2[length(i2)], by = mv)
      # first row carries the label; then one row per window
      tibble(position = c(.y, map2_chr(i11, i22, ~
               str_c(seq(.x, .y), collapse = ","))),
             name = c(.y, substring(.x, i11, i22)))
    })) %>%
    unnest(out)
}
Testing:
Moving window = 1:
f1(df, n = 10, mv = 1)
# position name
#1 Melon Melon
#2 1,2,3,4,5,6,7,8,9,10 RSNSNASSAV
#3 2,3,4,5,6,7,8,9,10,11 SNSNASSAVS
#4 3,4,5,6,7,8,9,10,11,12 NSNASSAVST
#5 4,5,6,7,8,9,10,11,12,13 SNASSAVSTS
#6 5,6,7,8,9,10,11,12,13,14 NASSAVSTSC
#7 6,7,8,9,10,11,12,13,14,15 ASSAVSTSCV
#8 7,8,9,10,11,12,13,14,15,16 SSAVSTSCVS
#9 8,9,10,11,12,13,14,15,16,17 SAVSTSCVSN
#10 9,10,11,12,13,14,15,16,17,18 AVSTSCVSNR
#11 10,11,12,13,14,15,16,17,18,19 VSTSCVSNRA
#12 11,12,13,14,15,16,17,18,19,20 STSCVSNRAM
#13 12,13,14,15,16,17,18,19,20,21 TSCVSNRAMK
#14 13,14,15,16,17,18,19,20,21,22 SCVSNRAMKG
#15 14,15,16,17,18,19,20,21,22,23 CVSNRAMKGT
#16 15,16,17,18,19,20,21,22,23,24 VSNRAMKGTT
#17 16,17,18,19,20,21,22,23,24,25 SNRAMKGTTH
#18 17,18,19,20,21,22,23,24,25,26 NRAMKGTTHY
#19 18,19,20,21,22,23,24,25,26,27 RAMKGTTHYD
#20 19,20,21,22,23,24,25,26,27,28 AMKGTTHYDT
#21 20,21,22,23,24,25,26,27,28,29 MKGTTHYDTS
#22 Carrot Carrot
#23 1,2,3,4,5,6,7,8,9,10 TGMRHGGMVS
#24 2,3,4,5,6,7,8,9,10,11 GMRHGGMVSV
#25 3,4,5,6,7,8,9,10,11,12 MRHGGMVSVC
#26 4,5,6,7,8,9,10,11,12,13 RHGGMVSVCM
#27 5,6,7,8,9,10,11,12,13,14 HGGMVSVCMC
#28 6,7,8,9,10,11,12,13,14,15 GGMVSVCMCV
#29 7,8,9,10,11,12,13,14,15,16 GMVSVCMCVV
#30 8,9,10,11,12,13,14,15,16,17 MVSVCMCVVD
#31 9,10,11,12,13,14,15,16,17,18 VSVCMCVVDD
#32 10,11,12,13,14,15,16,17,18,19 SVCMCVVDDN
#33 11,12,13,14,15,16,17,18,19,20 VCMCVVDDNR
#34 12,13,14,15,16,17,18,19,20,21 CMCVVDDNRR
#35 13,14,15,16,17,18,19,20,21,22 MCVVDDNRRR
#36 14,15,16,17,18,19,20,21,22,23 CVVDDNRRRH
#37 15,16,17,18,19,20,21,22,23,24 VVDDNRRRHY
#38 16,17,18,19,20,21,22,23,24,25 VDDNRRRHYN
#39 17,18,19,20,21,22,23,24,25,26 DDNRRRHYNG
#40 18,19,20,21,22,23,24,25,26,27 DNRRRHYNGA
#41 19,20,21,22,23,24,25,26,27,28 NRRRHYNGAY
#42 20,21,22,23,24,25,26,27,28,29 RRRHYNGAYD
#43 21,22,23,24,25,26,27,28,29,30 RRHYNGAYDD
#44 22,23,24,25,26,27,28,29,30,31 RHYNGAYDDH
#45 23,24,25,26,27,28,29,30,31,32 HYNGAYDDHH
#46 24,25,26,27,28,29,30,31,32,33 YNGAYDDHHR
#47 25,26,27,28,29,30,31,32,33,34 NGAYDDHHRG
#48 26,27,28,29,30,31,32,33,34,35 GAYDDHHRGG
#49 27,28,29,30,31,32,33,34,35,36 AYDDHHRGGV
#50 28,29,30,31,32,33,34,35,36,37 YDDHHRGGVC
#51 29,30,31,32,33,34,35,36,37,38 DDHHRGGVCT
#52 30,31,32,33,34,35,36,37,38,39 DHHRGGVCTS
Moving window = 2:
f1(df, n = 10, mv = 2) %>%
head
# position name
#1 Melon Melon
#2 1,2,3,4,5,6,7,8,9,10 RSNSNASSAV
#3 3,4,5,6,7,8,9,10,11,12 NSNASSAVST
#4 5,6,7,8,9,10,11,12,13,14 NASSAVSTSC
#5 7,8,9,10,11,12,13,14,15,16 SSAVSTSCVS
#6 9,10,11,12,13,14,15,16,17,18 AVSTSCVSNR
Moving window = 3:
f1(df, n = 10, mv = 3) %>%
head
# position name
#1 Melon Melon
#2 1,2,3,4,5,6,7,8,9,10 RSNSNASSAV
#3 4,5,6,7,8,9,10,11,12,13 SNASSAVSTS
#4 7,8,9,10,11,12,13,14,15,16 SSAVSTSCVS
#5 10,11,12,13,14,15,16,17,18,19 VSTSCVSNRA
#6 13,14,15,16,17,18,19,20,21,22 SCVSNRAMKG
Moving window = 4:
f1(df, n = 10, mv = 4) %>%
head
# position name
#1 Melon Melon
#2 1,2,3,4,5,6,7,8,9,10 RSNSNASSAV
#3 5,6,7,8,9,10,11,12,13,14 NASSAVSTSC
#4 9,10,11,12,13,14,15,16,17,18 AVSTSCVSNR
#5 13,14,15,16,17,18,19,20,21,22 SCVSNRAMKG
#6 17,18,19,20,21,22,23,24,25,26 NRAMKGTTHY
library('tidyverse')
# use this function to make the blocks:
make_substrings <- function(string, len, label){
  # set up the indices
  str_len <- nchar(string)
  indices1 <- 1:(str_len - len + 1)
  indices2 <- len:str_len
  # create the list of indices
  position <- map2_chr(indices1, indices2,
                       .f = function(x, y) paste(x:y, collapse = ', '))
  # take substrings
  name <- map2_chr(indices1, indices2, .f = substr, x = string)
  # add your food labels
  header <- tibble(position = label, name = label)
  header %>%
    bind_rows(tibble(position, name))
}
# your version had factors
df <- df %>%
  mutate_all(as.character)
# iterate over all the rows of df:
output <- Map(f = make_substrings, string = df$sence, len = 10, label = df$sname) %>%
  bind_rows()
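For a lighter-weight alternative, note that base R's substring() is vectorised over start and stop positions, so the windows for a single string can be generated without any mapping. A small sketch with the same window width 10 and a step mv:
s <- "RSNSNASSAVSTSCVSNRAMKGTTHYDTS"
n <- 10   # window width
mv <- 1   # step size
starts <- seq(1, nchar(s) - n + 1, by = mv)
substring(s, starts, starts + n - 1)
# [1] "RSNSNASSAV" "SNSNASSAVS" "NSNASSAVST" ...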

Finding columns that contain values based on another column

I have the following data frame:
Step  1  2  3
   1  5 10  6
   2  5 11  5
   3  5 13  9
   4  5 15 10
   5 13 18 10
   6 15 20 10
   7 17 23 10
   8 19 25 10
   9 21 27 13
  10 23 30  7
I would like to retrieve the columns that satisfy one of the following conditions: the value at step 1 equals the value at step 4, or the value at step 4 equals the value at step 8. In this case, columns 1 and 3 should be retrieved: column 1 because the value at step 1 equals the value at step 4 (i.e., 5), and column 3 because the value at step 4 equals the value at step 8 (i.e., 10).
I don't know how to do that in R. Can someone help me please?
You can get a logical indicator of which columns qualify with the following code:
df[1, -1] == df[4, -1] | df[4, -1] == df[8, -1]
# X1 X2 X3
# 1 TRUE FALSE TRUE
# data
df <- structure(list(Step = 1:10, X1 = c(5L, 5L, 5L, 5L, 13L, 15L,
17L, 19L, 21L, 23L), X2 = c(10L, 11L, 13L, 15L, 18L, 20L, 23L,
25L, 27L, 30L), X3 = c(6L, 5L, 9L, 10L, 10L, 10L, 10L, 10L, 13L,
7L)), class = "data.frame", row.names = c(NA, -10L))
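To go from that logical indicator to the columns themselves, use it as a column index; drop = FALSE keeps the result a data frame even if only one column matches:
keep <- as.logical(df[1, -1] == df[4, -1] | df[4, -1] == df[8, -1])
df[, -1][, keep, drop = FALSE]
#   X1 X3
# 1  5  6
# 2  5  5
# ...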

Loops with random sampling from a matrix and distance calculation

I have a list of nodes, and I need to randomly assign 'p' hubs to 'n' clients.
I have the following data, where the first row shows:
The total number of nodes.
The requested number of hubs.
The total supply capacity for each hub.
The following lines show:
The first column the node number.
The second column the "x" coordinate.
The third the "y" coordinate.
Below I show the raw data; with colnames() added it would look something like this:
total_nodes hubs_required total_capacity
50 5 120
node number x_coordinate y_coordinate node_demand
1 2 62 3
2 80 25 14
3 36 88 1
4 57 23 14
. . . .
. . . .
. . . .
50 1 58 2
The x and y values are provided so we can calculate the Euclidean distance.
nodes:
50 5 120
1 2 62 3
2 80 25 14
3 36 88 1
4 57 23 14
5 33 17 19
6 76 43 2
7 77 85 14
8 94 6 6
9 89 11 7
10 59 72 6
11 39 82 10
12 87 24 18
13 44 76 3
14 2 83 6
15 19 43 20
16 5 27 4
17 58 72 14
18 14 50 11
19 43 18 19
20 87 7 15
21 11 56 15
22 31 16 4
23 51 94 13
24 55 13 13
25 84 57 5
26 12 2 16
27 53 33 3
28 53 10 7
29 33 32 14
30 69 67 17
31 43 5 3
32 10 75 3
33 8 26 12
34 3 1 14
35 96 22 20
36 6 48 13
37 59 22 10
38 66 69 9
39 22 50 6
40 75 21 18
41 4 81 7
42 41 97 20
43 92 34 9
44 12 64 1
45 60 84 8
46 35 100 5
47 38 2 1
48 9 9 7
49 54 59 9
50 1 58 2
I extracted the information from the first line.
nodes <- as.matrix(read.table(data))
header <- colnames(nodes)
clean_header <- gsub('X', '', header)
requested_hubs <- as.numeric(clean_header[2])
max_supply_capacity <- as.numeric(clean_header[3])
I need to randomly select 5 nodes that will act as hubs:
set.seed(37)
node_to_hub <- nodes[sample(nrow(nodes), requested_hubs, replace = FALSE),]
Then I need to randomly assign nodes to each hub, calculate the distances between the hub and each of its nodes, and, when the max_supply_capacity (120) is exceeded, move on to the next hub and repeat the process.
After the final iteration I need to return the cumulative sum of distances for all the hubs.
I need to repeat this process 100 times and return the min() value of the cumulative sums of distances.
This is where I'm completely stuck, since I'm not sure how to loop through a matrix, let alone select elements randomly while doing so.
I have the following elements:
capacity <- numeric() # needs to stay <= 120
distance_sum <- numeric()
global_hub_distance <- numeric()
The formula for the Euclidean distance (rounded) would be as below, but I'm not sure how I can reflect the random selection when assigning nodes.
distance <-round(sqrt(((node_to_hub[i,2]-nodes[i,2]))^2+(node_to_hub[random,3]-nodes[random,3])^2))
The idea for the loop I think I need is below, but as I mentioned before I don't know how to deal with the random client selection, or with the distance calculation for the random clients.
for(i in 1:100){
  node_to_hub
  for(i in 1:nrow(node_to_hub)){
    # Should I randomly sample the clients here???
    while(capacity < 120){
      node_demand <- nodes[**random**, 3]
      distance <- round(sqrt((node_to_hub[i,2]-nodes[i,2])^2+(node_to_hub[**random**,3]-nodes[**random**,3])^2))
      capacity <- c(capacity, node_demand)
      distance_sum <- c(distance_sum, distance)
    }
    global_hub_distance <- c(global_hub_distance, distance_sum)
    capacity <- 0
    distance_sum <- 0
  }
  min(global_hub_distance)
}
Not EXACTLY sure what you are looking for, but this code may help. It's not extremely fast: instead of using a while loop to stop after hitting your total_capacity, it does a cumsum on the full node list and finds the place where you exceed 120.
nodes <- structure(list(node_number = 1:50,
x = c(2L, 80L, 36L, 57L, 33L, 76L, 77L, 94L,
89L, 59L, 39L, 87L, 44L, 2L, 19L, 5L,
58L, 14L, 43L, 87L, 11L, 31L, 51L, 55L,
84L, 12L, 53L, 53L, 33L, 69L, 43L, 10L,
8L, 3L, 96L, 6L, 59L, 66L, 22L, 75L, 4L,
41L, 92L, 12L, 60L, 35L, 38L, 9L, 54L, 1L),
y = c(62L, 25L, 88L, 23L, 17L, 43L, 85L, 6L, 11L,
72L, 82L, 24L, 76L, 83L, 43L, 27L, 72L, 50L,
18L, 7L, 56L, 16L, 94L, 13L, 57L, 2L, 33L, 10L,
32L, 67L, 5L, 75L, 26L, 1L, 22L, 48L, 22L, 69L,
50L, 21L, 81L, 97L, 34L, 64L, 84L, 100L, 2L, 9L, 59L, 58L),
node_demand = c(3L, 14L, 1L, 14L, 19L, 2L, 14L, 6L,
7L, 6L, 10L, 18L, 3L, 6L, 20L, 4L,
14L, 11L, 19L, 15L, 15L, 4L, 13L,
13L, 5L, 16L, 3L, 7L, 14L, 17L,
3L, 3L, 12L, 14L, 20L, 13L, 10L,
9L, 6L, 18L, 7L, 20L, 9L, 1L, 8L,
5L, 1L, 7L, 9L, 2L)),
.Names = c("node_number", "x", "y", "node_demand"),
class = "data.frame", row.names = c(NA, -50L))
total_nodes = nrow(nodes)
hubs_required = 5
total_capacity = 120
iterations <- 100
track_sums <- matrix(NA, nrow = iterations, ncol = hubs_required)
colnames(track_sums) <- paste0("demand_at_hub",1:hubs_required)
And then I prefer using a function for the distance; in this case A and B are two separate vectors of the form c(x, y):
euc.dist <- function(A, B) round(sqrt(sum((A - B) ^ 2))) # distances
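For example, a quick sanity check on a 3-4-5 triangle:
euc.dist(c(0, 0), c(3, 4))
# [1] 5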
The Loop:
for(i in 1:iterations){
  # random hub selection
  hubs <- nodes[sample(1:total_nodes, hubs_required, replace = FALSE),]
  for(h in 1:hubs_required){
    # sample the nodes into a random order
    random_nodes <- nodes[sample(1:nrow(nodes), size = nrow(nodes), replace = FALSE),]
    # cumulative sum their demand, get which position passes 120,
    # and subtract 1 to get the node before that
    last <- which(cumsum(random_nodes$node_demand) > total_capacity)[1] - 1
    # get sum of all distances to those nodes (1 through the last)
    all_distances <- apply(random_nodes[1:last,], 1, function(rn) {
      euc.dist(A = hubs[h, c("x","y")],
               B = rn[c("x","y")])
    })
    track_sums[i,h] <- sum(all_distances)
  }
}
min(rowSums(track_sums))
EDIT
as a function:
hubnode <- function(nodes, hubs_required = 5, total_capacity = 120, iterations = 10){
  # initialize results matrices
  track_sums <- node_count <- matrix(NA, nrow = iterations, ncol = hubs_required)
  colnames(track_sums) <- paste0("demand_at_hub", 1:hubs_required)
  colnames(node_count) <- paste0("nodes_at_hub", 1:hubs_required)
  # user-defined distance function (only exists within the hubnode() function)
  euc.dist <- function(A, B) round(sqrt(sum((A - B) ^ 2)))
  for(i in 1:iterations){
    # random hub selection
    assigned_hubs <- sample(1:nrow(nodes), hubs_required, replace = FALSE)
    hubs <- nodes[assigned_hubs,]
    assigned_nodes <- NULL
    for(h in 1:hubs_required){
      # sample the non-hub nodes into a random order
      assigned_nodes <- sample((1:nrow(nodes))[-assigned_hubs], replace = FALSE)
      random_nodes <- nodes[assigned_nodes,]
      # cumulative sum their demand, get which position passes 120,
      # and subtract 1 to get the node before that
      last <- which(cumsum(random_nodes$node_demand) > total_capacity)[1] - 1
      # if capacity is never exceeded, use all remaining nodes
      if(is.na(last)) last <- nrow(random_nodes)
      node_count[i,h] <- last
      # get sum of all distances to those nodes (1 through the last)
      all_distances <- apply(random_nodes[1:last,], 1, function(rn) {
        euc.dist(A = hubs[h, c("x","y")],
                 B = rn[c("x","y")])
      })
      track_sums[i,h] <- sum(all_distances)
    }
  }
  return(list(track_sums = track_sums, node_count = node_count))
}
output <- hubnode(nodes, iterations = 100)
node_count <- output$node_count
track_sums <- output$track_sums
plot(rowSums(node_count), rowSums(track_sums),
     xlab = "Node Count", ylab = "Total Demand",
     main = paste("Result of", 100, "iterations"))
min(rowSums(track_sums))
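Since everything inside hubnode() is random sampling, set a seed before the call if you want reproducible results, e.g.:
set.seed(37)
output <- hubnode(nodes, iterations = 100)
min(rowSums(output$track_sums))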

Can corr.test be used for any dataframe?

I am trying to get correlations and p-values between the variables in a data frame (df1) using corr.test in the psych package. The variables in the data frame are all integers and there are no NAs. But when I run corr.test(df1), I always get an error message:
Error in data.frame(lower = lower, r = r[lower.tri(r)], upper = upper, :
arguments imply differing number of rows: 0, 28
I tried to run the example (corr.test(sat.act)) from the psych package and there is no error.
I am new to R; can someone tell me what is wrong with my data frame?
> head(df1)
S1.pre S2.pre S1.post S2.post V1.pre V2.pre V1.post V2.post
1 21 31 25 35 7 1 19 4
2 15 26 21 29 13 11 16 14
3 18 27 23 31 8 2 3 3
4 17 31 18 39 13 11 15 14
5 15 26 16 29 26 15 32 20
6 17 28 16 28 2 4 2 7
> dput(head(df1))
structure(list(S1.pre = c(21L, 15L, 18L, 17L, 15L, 17L), S2.pre = c(31L,
26L, 27L, 31L, 26L, 28L), S1.post = c(25L, 21L, 23L, 18L, 16L,
16L), S2.post = c(35L, 29L, 31L, 39L, 29L, 28L), V1.pre = c(7L,
13L, 8L, 13L, 26L, 2L), V2.pre = c(1L, 11L, 2L, 11L, 15L, 4L),
V1.post = c(19L, 16L, 3L, 15L, 32L, 2L), V2.post = c(4L,
14L, 3L, 14L, 20L, 7L)), .Names = c("S1.pre", "S2.pre", "S1.post",
"S2.post", "V1.pre", "V2.pre", "V1.post", "V2.post"), row.names = c(NA,
6L), class = "data.frame")
> sapply(df1, class)
S1.pre S2.pre S1.post S2.post V1.pre V2.pre V1.post V2.post
"integer" "integer" "integer" "integer" "integer" "integer" "integer" "integer"
I contacted William Revelle, the author of the psych package, and here is what he said:
Mark,
Unfortunately you found a bug introduced in 1.4.3.
1.4.4 will go out to CRAN this weekend.
In the meantime you can get the fix at http://personality-project.org/r (choose source from other repository if you are using a Mac) or
http://personality-project.org/r/src/contrib and get the zip file if you are using a PC.
Otherwise, wait until next week.
Sorry about the problem.
It will still work as long as you have an unequal number of subjects or some missing data.
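Until the fixed version reaches CRAN, the same numbers can be obtained with base R as a stopgap; a minimal sketch:
# correlation matrix for all pairs
round(cor(df1), 2)
# p-value for one pair of variables
cor.test(df1$S1.pre, df1$S2.pre)$p.value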
