Manhattan distance matrix from 2 matrices is non-symmetric but should be - r

I created two matrices with random integers as components (the dimensions of the matrices don't matter). I then want to calculate the distance matrix with the Manhattan method and frame it as a matrix. The result should be symmetric, but when I frame it as a matrix, the output is a non-symmetric distance matrix.
From that matrix (which should be the output) I then want to compute a clustering.
Where is my mistake?
Code:
a <- c(sample.int(30,6))
b <- c(sample.int(30,6))
c <- c(sample.int(30,6))
d <- c(sample.int(30,6))
e <- c(sample.int(30,6))
f <- c(sample.int(30,6))
V2 <- rbind(a,b,c,d,e,f)
V1 <- rbind(a,b,c,d,e,f)
d1MNR <- matrix(dist(V1, V2, method="manhattan")) #### Is non-symmetric
d1MR <- matrix(dist(V1,V2,upper=TRUE, diag=TRUE ,method="manhattan")) #### Should be symmetric, but is not
d1MR ### Generate output
hclust <- hclust(dist(d1MR), method = "single") ### Clustering

You can make a symmetric distance matrix from V1 alone or from V2 alone, but the only way to get a symmetric matrix from both of them together is to combine them with V12 <- rbind(V1, V2). The dist() function returns a dist object that hclust() can use directly, so you do not need to convert it to a matrix. In your example V1 and V2 are identical; we need them to be different:
set.seed(42)
V1 <- matrix(sample.int(30, 36, replace=TRUE), 6)
V2 <- matrix(sample.int(30, 36, replace=TRUE), 6)
V12 <- rbind(V1, V2)
rownames(V12) <- paste(rep(c("V1", "V2"), each=6), 1:6, sep=":")
colnames(V12) <- letters[1:6]
V12
# a b c d e f
# V1:1 17 18 4 18 4 28
# V1:2 5 26 25 15 5 8
# V1:3 1 17 5 3 13 3
# V1:4 25 15 14 9 5 26
# V1:5 10 24 20 25 20 1
# V1:6 4 7 26 27 2 10
# V2:1 24 8 28 3 18 22
# V2:2 30 4 5 24 6 21
# V2:3 11 4 4 23 6 2
# V2:4 15 22 2 17 2 23
# V2:5 22 18 24 21 20 6
# V2:6 26 13 18 26 3 26
d1MNR <- dist(V12, method="manhattan")
hclust <- hclust(d1MNR, method = "single")
plot(hclust)
If you want to look at the symmetric distance matrix:
print(d1MNR, upper=TRUE, diag=TRUE)
# V1:1 V1:2 V1:3 V1:4 V1:5 V1:6 V2:1 V2:2 V2:3 V2:4 V2:5 V2:6
# V1:1 0 65 67 33 79 75 76 43 53 16 66 39
# V1:2 65 0 58 66 44 38 79 90 64 57 49 72
# V1:3 67 58 0 72 62 76 79 88 52 67 69 98
# V1:4 33 66 72 0 86 78 45 46 74 43 63 26
# V1:5 79 44 62 86 0 58 83 90 54 73 31 72
# V1:6 75 38 76 78 58 0 75 68 48 73 59 54
# V2:1 76 79 79 45 83 75 0 67 93 80 52 59
# V2:2 43 90 88 46 90 68 67 0 40 49 73 36
# V2:3 53 64 52 74 54 48 93 40 0 55 65 68
# V2:4 16 57 67 43 73 73 80 49 55 0 72 49
# V2:5 66 49 69 63 31 59 52 73 65 72 0 57
# V2:6 39 72 98 26 72 54 59 36 68 49 57 0
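Incidentally, the reason matrix(dist(...)) in the question never looks symmetric is that matrix() just flattens the lower-triangle values of the dist object into a one-column matrix. If you really do need a square symmetric matrix, a small sketch using the d1MNR object from above:
D <- as.matrix(d1MNR)   # full 12 x 12 matrix with zeros on the diagonal
isSymmetric(D)
# [1] TRUE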

Related

Create new variables by dividing all pre-existing variables by all other variables

I would like to create new variables by dividing every pre-existing variable by every other variable, e.g.
X1/X1, X1/X2, X1/X3, X1/X4, X1/X5, X1/X6, X1/X7, X1/X8, X1/X9, X1/X10,
X2/X1, X2/X2, X2/X3, X2/X4, X2/X5, X2/X6, X2/X7, X2/X8, X2/X9, X2/X10,
X3/X1, X3/X2 ...
I started by trying to do each one individually, as below, but I need to replicate this across many variable names, so some automation (I assume a function/lapply) would be ideal.
ds$rom_3_5m <- (ds$roll_open_mean_3m/ds$roll_open_mean_5m)
ds$rom_3_10m <- (ds$roll_open_mean_3m/ds$roll_open_mean_10m)
ds$rom_3_15m <- (ds$roll_open_mean_3m/ds$roll_open_mean_15m)
ds$rom_3_30m <- (ds$roll_open_mean_3m/ds$roll_open_mean_30m)
ds$rom_3_60m <- (ds$roll_open_mean_3m/ds$roll_open_mean_60m)
ds$rom_3_120m <- (ds$roll_open_mean_3m/ds$roll_open_mean_120m)
ds$rom_3_240m <- (ds$roll_open_mean_3m/ds$roll_open_mean_240m)
ds$rom_3_480m <- (ds$roll_open_mean_3m/ds$roll_open_mean_480m)
ds$rom_3_960m <- (ds$roll_open_mean_3m/ds$roll_open_mean_960m)
ds$rom_3_1920m <- (ds$roll_open_mean_3m/ds$roll_open_mean_1920m)
ds$rom_3_3840m <- (ds$roll_open_mean_3m/ds$roll_open_mean_3840m)
ds$rom_3_7680m <- (ds$roll_open_mean_3m/ds$roll_open_mean_7680m)
ds$rom_3_15360m <- (ds$roll_open_mean_3m/ds$roll_open_mean_15360m)
ds$rom_3_30720m <- (ds$roll_open_mean_3m/ds$roll_open_mean_30720m)
ds$rom_3_61440m <- (ds$roll_open_mean_3m/ds$roll_open_mean_61440m)
ds$rom_3_122880m <- (ds$roll_open_mean_3m/ds$roll_open_mean_122880m)
ds$rom_3_245760m <- (ds$roll_open_mean_3m/ds$roll_open_mean_245760m)
ds$rom_3_491520m <- (ds$roll_open_mean_3m/ds$roll_open_mean_491520m)
#5m
ds$rom_5_3m <- (ds$roll_open_mean_5m/ds$roll_open_mean_3m)
ds$rom_5_10m <- (ds$roll_open_mean_5m/ds$roll_open_mean_10m)
ds$rom_5_15m <- (ds$roll_open_mean_5m/ds$roll_open_mean_15m)
ds$rom_5_30m <- (ds$roll_open_mean_5m/ds$roll_open_mean_30m)
ds$rom_5_60m <- (ds$roll_open_mean_5m/ds$roll_open_mean_60m)
ds$rom_5_120m <- (ds$roll_open_mean_5m/ds$roll_open_mean_120m)
ds$rom_5_240m <- (ds$roll_open_mean_5m/ds$roll_open_mean_240m)
ds$rom_5_480m <- (ds$roll_open_mean_5m/ds$roll_open_mean_480m)
ds$rom_5_960m <- (ds$roll_open_mean_5m/ds$roll_open_mean_960m)
ds$rom_5_1920m <- (ds$roll_open_mean_5m/ds$roll_open_mean_1920m)
ds$rom_5_3840m <- (ds$roll_open_mean_5m/ds$roll_open_mean_3840m)
ds$rom_5_7680m <- (ds$roll_open_mean_5m/ds$roll_open_mean_7680m)
ds$rom_5_15360m <- (ds$roll_open_mean_5m/ds$roll_open_mean_15360m)
ds$rom_5_30720m <- (ds$roll_open_mean_5m/ds$roll_open_mean_30720m)
ds$rom_5_61440m <- (ds$roll_open_mean_5m/ds$roll_open_mean_61440m)
ds$rom_5_122880m <- (ds$roll_open_mean_5m/ds$roll_open_mean_122880m)
ds$rom_5_245760m <- (ds$roll_open_mean_5m/ds$roll_open_mean_245760m)
ds$rom_5_491520m <- (ds$roll_open_mean_5m/ds$roll_open_mean_491520m)
#10m
ds$rom_10_3m <- (ds$roll_open_mean_10m/ds$roll_open_mean_3m)
ds$rom_10_5m <- (ds$roll_open_mean_10m/ds$roll_open_mean_5m)
ds$rom_10_15m <- (ds$roll_open_mean_10m/ds$roll_open_mean_15m)
I have a data frame with 40+ variables and 6 million rows; I have attached a smaller example data frame below.
Thanks in advance!
Charlie
X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
1 57 77 48 8 31 43 47 13 26 88
2 25 75 86 77 4 65 5 49 31 57
3 91 90 42 69 82 33 56 99 47 39
4 35 96 86 77 67 77 20 17 77 92
5 6 100 50 62 16 31 0 39 72 4
6 90 34 74 89 71 37 73 45 24 28
7 24 22 92 13 57 97 32 2 12 80
8 74 59 49 2 97 100 15 37 15 67
9 43 38 66 97 8 20 85 25 97 67
10 82 4 56 40 42 46 44 98 98 76
11 60 68 92 99 81 92 78 59 23 81
12 22 57 37 100 7 1 89 41 40 56
13 69 13 1 82 89 45 83 24 71 29
14 8 14 66 48 94 8 20 3 28 63
15 26 70 56 62 9 34 11 86 71 64
16 7 55 15 100 91 89 46 74 98 14
17 29 68 19 66 83 29 84 76 90 45
18 27 76 6 48 17 28 8 7 52 37
19 68 58 51 75 60 57 74 46 98 93
20 15 15 89 55 23 3 3 8 32 37
21 78 49 57 48 96 89 4 95 67 58
22 12 36 42 59 27 92 48 0 92 28
23 51 17 77 61 84 53 46 22 27 36
24 40 84 83 35 19 13 80 78 96 87
25 44 80 25 72 43 17 74 70 52 36
26 14 61 63 82 16 47 32 93 19 84
27 93 19 28 62 74 1 85 65 50 9
28 80 62 6 58 48 97 97 18 65 43
29 12 58 95 79 37 89 89 83 22 85
30 57 73 22 88 99 63 58 87 90 66
As @27 ϕ 9 suggested in the comments, you should use the lapply solution.
With this you also get a single data frame with the correct column names:
l <- lapply(df, `/`, df)
l <- unlist(l, recursive = FALSE)
data.frame(l)
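For example, with the 10-column sample data above (a small illustration; df is assumed to be that example frame):
ratios <- data.frame(unlist(lapply(df, `/`, df), recursive = FALSE))
dim(ratios)         # 30 rows, 100 ratio columns
names(ratios)[1:3]  # "X1.X1" "X1.X2" "X1.X3"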

How to cut the values at a regular interval and assign them to separate groups? [duplicate]

This question already has answers here:
Split a vector into chunks
(22 answers)
Closed 3 years ago.
How can I cut the values (1 to 100) at a regular interval (25) and place them into 4 groups as below?
sdr <- c(1:100)
Group1: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
Group2: 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
Group3: 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
Group4: 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
Any suggestions, please?
You could use split
sdr <- 1:100
split(sdr, rep(1:4, each = 25))
#$`1`
# [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
#
#$`2`
# [1] 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
#
#$`3`
# [1] 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
#
#$`4`
# [1] 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
This returns a list with 4 vector elements.
Also note that the c() around 1:100 is not necessary.
Or we can define the number of groups
ngroup <- 4
split(sdr, rep(1:ngroup, each = length(sdr) %/% ngroup))
giving the same result.
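If the vector length does not divide evenly into ngroup pieces, a slight variation works (a sketch, assuming you are happy with a shorter last group):
sdr2 <- 1:103
size <- ceiling(length(sdr2) / ngroup)
split(sdr2, ceiling(seq_along(sdr2) / size))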
You can also make a data frame from your groups and then transpose it with t() (note that t() returns a matrix, not a data frame):
df <- t(data.frame(Group1 = 1:25, Group2 = 26:50, Group3 = 51:75, Group4 = 76:100))

Subsetting Data frame or matrix based on criteria of values

Suppose I have a matrix or a data frame and I want only those values that are greater than 15 and no values between 85 and 90, both inclusive.
a<-matrix(1:100,nrow = 10, ncol = 10)
rownames(a) <- LETTERS[1:10]
colnames(a) <- LETTERS[1:10]
A B C D E F G H I J
A 1 11 21 31 41 51 61 71 81 91
B 2 12 22 32 42 52 62 72 82 92
C 3 13 23 33 43 53 63 73 83 93
D 4 14 24 34 44 54 64 74 84 94
E 5 15 25 35 45 55 65 75 85 95
F 6 16 26 36 46 56 66 76 86 96
G 7 17 27 37 47 57 67 77 87 97
H 8 18 28 38 48 58 68 78 88 98
I 9 19 29 39 49 59 69 79 89 99
J 10 20 30 40 50 60 70 80 90 100
Note: you can convert it to a data frame if this kind of operation is only possible on a data frame.
Now I want my result in a format where only the values greater than 5 and less than 85 are retained, and everything else is deleted and replaced with a blank space.
My desired out is like below
A B C D E F G H I J
A 11 21 31 41 51 61 71 81 91
B 12 22 32 42 52 62 72 82 92
C 13 23 33 43 53 63 73 83 93
D 14 24 34 44 54 64 74 84 94
E 5 15 25 35 45 55 65 75 85 95
F 6 16 26 36 46 56 66 76 96
G 7 17 27 37 47 57 67 77 97
H 8 18 28 38 48 58 68 78 98
I 9 19 29 39 49 59 69 79 99
J 10 20 30 40 50 60 70 80 100
Is there any kind of function in R that can take my condition and produce the desired result? I want to be able to change the code according to the problem. I searched Stack Overflow but didn't find anything like this. I don't want to format based on rows or columns.
I tried
a[a> 5 & a!=c(85:90)]
but this gives me the values and loses the structure.
Assuming that 'a' is a matrix, we can assign NA to the values of 'a' that are %in% 86:90 or less than 5 (a < 5). I am not assigning '' here, as that would change the class from numeric to character; assigning NA is also more useful for later processing.
a[a %in% 86:90 | a<5] <- NA
However, if we need it to be ''
a[a %in% 86:90 | a<5] <- ""
If we are using a data.frame
a1 <- as.data.frame(a)
a1[] <- lapply(a1, function(x) replace(x, x %in% 86:90| x <5, ""))
a1
# A B C D E F G H I J
#A 11 21 31 41 51 61 71 81 91
#B 12 22 32 42 52 62 72 82 92
#C 13 23 33 43 53 63 73 83 93
#D 14 24 34 44 54 64 74 84 94
#E 5 15 25 35 45 55 65 75 85 95
#F 6 16 26 36 46 56 66 76 96
#G 7 17 27 37 47 57 67 77 97
#H 8 18 28 38 48 58 68 78 98
#I 9 19 29 39 49 59 69 79 99
#J 10 20 30 40 50 60 70 80 100
NOTE: This changes the class of each column to character
In the OP's code, a != c(85:90) will not work as intended because 85:90 is recycled to the length of 'a' and the comparison is made between corresponding elements of the recycled vector and 'a'. Instead, we need %in% to test membership against a vector of length > 1.
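A quick illustration of the recycling issue, using a small made-up vector y (not from the question):
y <- c(3, 85, 88, 91)
y != 85:90       # y is recycled to length 6 and compared position by position (with a warning)
# [1] TRUE TRUE TRUE TRUE TRUE TRUE    -- 85 and 88 are not caught
!(y %in% 85:90)  # membership test, which is what was intended
# [1]  TRUE FALSE FALSE  TRUE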

subsetting between two data frames

I want to subset everything from df1 except the columns that are in df2.
df1<-
A B C D E F G H I J
80 16 55 74 89 39 4 67 36 87
69 49 91 83 50 1 77 19 73 43
85 45 97 9 47 65 79 81 86 66
37 58 17 38 76 14 54 78 62 98
12 25 56 20 31 82 34 23 33 11
df2<-
C D E F
55 74 89 39
91 83 50 1
97 9 47 65
17 38 76 14
56 20 31 82
I would like to utilise this kind of approach if possible:
mydata<-df1[,!colnames(df2)]
If you want the columns that are in df1, but not in df2, this can be done as such:
not_in_df2 <- setdiff(colnames(df1), colnames(df2))
subSet_df1 <- df1[,not_in_df2]
Or you could define not_in_df2 via
not_in_df2 <- !(colnames(df1) %in% colnames(df2))
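Applied to the example frames above (assuming df1 and df2 are data frames with the column names shown), both versions keep the columns of df1 that df2 lacks:
not_in_df2 <- setdiff(colnames(df1), colnames(df2))
mydata <- df1[, not_in_df2]
colnames(mydata)
# [1] "A" "B" "G" "H" "I" "J"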

In R, how do I locally shuffle a vector's elements

I have the following vector in R; think of it as a vector of numbers.
x = c(1,2,3,4,...100)
I want to randomize this vector "locally", based on an input number, the "locality factor". For example, if the locality factor is 3, then the first 3 elements are taken and shuffled, followed by the next 3 elements, and so on. Is there an efficient way to do this? I know that if I use sample, it will jumble up the whole vector.
Thanks in advance
Arun didn't like how inefficient my other answer was, so here's something very fast just for him ;)
It requires just one call each to runif() and order(), and doesn't use sample() at all.
x <- 1:100
k <- 3
n <- length(x)
x[order(rep(seq_len(ceiling(n/k)), each=k, length.out=n) + runif(n))]
# [1] 3 1 2 6 5 4 8 9 7 11 12 10 13 14 15 18 16 17
# [19] 20 19 21 23 22 24 27 25 26 29 28 30 33 31 32 36 34 35
# [37] 37 38 39 40 41 42 43 44 45 47 48 46 51 49 50 52 54 53
# [55] 55 57 56 58 60 59 62 63 61 66 64 65 68 67 69 71 70 72
# [73] 75 74 73 76 77 78 81 80 79 84 82 83 86 85 87 89 88 90
# [91] 93 92 91 94 96 95 97 98 99 100
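Wrapped up as a reusable helper, a minimal sketch of the same trick (the name local_shuffle is just for illustration):
local_shuffle <- function(x, k = 3) {
  n <- length(x)
  # block index 1,1,1,2,2,2,... plus uniform noise in [0,1): ordering by this
  # shuffles positions within each block but never across blocks
  blocks <- rep(seq_len(ceiling(n / k)), each = k, length.out = n)
  x[order(blocks + runif(n))]
}
local_shuffle(1:10, 3)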
General solution:
Edit: As @MatthewLundberg comments, the issue I pointed out with "repeating numbers in x" can be easily overcome by working on seq_along(x), which means the resulting values will be indices. So it'd be like so:
k <- 3
x <- c(2,2,1, 1,3,4, 4,6,5, 3)
x.s <- seq_along(x)
y <- sample(x.s)
x[unlist(split(y, (match(y, x.s)-1) %/% k), use.names = FALSE)]
# [1] 2 2 1 3 4 1 4 5 6 3
Old answer:
The bottleneck here is the number of calls to sample(). And as long as your numbers don't repeat, I think you can do this with just one call to sample in this manner:
k <- 3
x <- 1:20
y <- sample(x)
unlist(split(y, (match(y,x)-1) %/% k), use.names = FALSE)
# [1] 1 3 2 5 6 4 8 9 7 12 10 11 13 14 15 17 16 18 19 20
To put everything together in a function (I like the name scramble from #Roland's):
scramble <- function(x, k = 3) {
  x.s <- seq_along(x)
  y.s <- sample(x.s)
  idx <- unlist(split(y.s, (match(y.s, x.s) - 1) %/% k), use.names = FALSE)
  x[idx]
}
scramble(x, 3)
# [1] 2 1 2 3 4 1 5 4 6 3
scramble(x, 3)
# [1] 1 2 2 1 4 3 6 5 4 3
To reduce the answer (and make it faster) even more, following @flodel's comment:
scramble <- function(x, k = 3L) {
  x.s <- seq_along(x)
  y.s <- sample(x.s)
  x[unlist(split(x.s[y.s], (y.s - 1) %/% k), use.names = FALSE)]
}
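Because this works on indices (seq_along(x)) rather than the values themselves, it also handles vectors with repeated values, e.g. (illustrative call only):
scramble(c(2, 2, 1, 1, 3, 4, 4, 6, 5, 3), k = 3)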
For the record, the boot package (a recommended package shipped with R) includes a function permutation.array() that is used for just this purpose:
x <- 1:100
k <- 3
ii <- boot:::permutation.array(n = length(x),
                               R = 2,
                               strata = (seq_along(x) - 1) %/% k)[1, ]
x[ii]
# [1] 2 1 3 6 5 4 9 7 8 12 11 10 15 13 14 16 18 17
# [19] 21 19 20 23 22 24 26 27 25 28 29 30 33 31 32 36 35 34
# [37] 38 39 37 41 40 42 43 44 45 46 47 48 51 50 49 53 52 54
# [55] 57 55 56 59 60 58 63 61 62 65 66 64 67 69 68 72 71 70
# [73] 75 73 74 76 77 78 79 80 81 82 83 84 86 87 85 89 88 90
# [91] 93 91 92 94 95 96 97 98 99 100
This will drop elements at the end (with a warning):
locality <- 3
x <- 1:100
c(apply(matrix(x, nrow=locality, ncol=length(x) %/% locality), 2, sample))
## [1] 1 2 3 4 6 5 8 9 7 12 10 11 13 15 14 16 18 17 19 20 21 22 24 23 26 25 27 28 30 29 32 33 31 35 34 36 38 39 37
## [40] 42 40 41 43 44 45 47 48 46 51 49 50 54 52 53 55 57 56 58 59 60 62 61 63 64 65 66 67 69 68 71 72 70 74 75 73 78 77 76
## [79] 80 81 79 83 82 84 87 85 86 88 89 90 92 93 91 96 94 95 99 98 97
v <- 1:16
scramble <- function(vec, n) {
  res <- tapply(vec, (seq_along(vec) + n - 1) %/% n,
                FUN = function(x) x[sample.int(length(x), size = length(x))])
  unname(unlist(res))
}
set.seed(42)
scramble(v,3)
#[1] 3 2 1 6 5 4 9 7 8 12 10 11 15 13 14 16
scramble(v,4)
#[1] 2 3 1 4 5 8 6 7 10 12 9 11 14 15 16 13
I like Matthew's approach way better, but here is how I went about the problem:
x <- 1:100
fact <- 3
y <- ceiling(length(x)/fact)
unlist(lapply(split(x, rep(1:y, each = fact)[1:length(x)]), function(x) {
  if (length(x) == 1) return(x)
  sample(x)
}), use.names = FALSE)
## [1] 3 1 2 6 4 5 8 9 7 11 10 12 13 15 14 17 16 18
## [19] 20 21 19 24 23 22 26 27 25 29 30 28 31 32 33 35 34 36
## [37] 39 37 38 41 42 40 45 43 44 47 46 48 51 49 50 52 53 54
## [55] 57 56 55 59 60 58 63 62 61 64 66 65 67 68 69 70 71 72
## [73] 75 73 74 77 76 78 80 79 81 82 84 83 85 86 87 90 89 88
## [91] 92 91 93 96 94 95 98 99 97 100
