I would like to create new variables by dividing all pre-existing variables by each other
e.g.
X1/X1, X1/X2, X1/X3, X1/X4, X1/X5, X1/X6, X1/X7, X1/X8, X1/X9, X1/X10,
X2/X1, X2/X2, X2/X3, X2/X4, X2/X5, X2/X6, X2/X7, X2/X8, X2/X9, X2/X10,
X3/X1, X3/X2 ...
I started by trying to do each individually, as below, but I need to replicate this with multiple variable names so an automation (I assume a function/lapply) would be ideal.
ds$rom_3_5m <- (ds$roll_open_mean_3m/ds$roll_open_mean_5m)
ds$rom_3_10m <- (ds$roll_open_mean_3m/ds$roll_open_mean_10m)
ds$rom_3_15m <- (ds$roll_open_mean_3m/ds$roll_open_mean_15m)
ds$rom_3_30m <- (ds$roll_open_mean_3m/ds$roll_open_mean_30m)
ds$rom_3_60m <- (ds$roll_open_mean_3m/ds$roll_open_mean_60m)
ds$rom_3_120m <- (ds$roll_open_mean_3m/ds$roll_open_mean_120m)
ds$rom_3_240m <- (ds$roll_open_mean_3m/ds$roll_open_mean_240m)
ds$rom_3_480m <- (ds$roll_open_mean_3m/ds$roll_open_mean_480m)
ds$rom_3_960m <- (ds$roll_open_mean_3m/ds$roll_open_mean_960m)
ds$rom_3_1920m <- (ds$roll_open_mean_3m/ds$roll_open_mean_1920m)
ds$rom_3_3840m <- (ds$roll_open_mean_3m/ds$roll_open_mean_3840m)
ds$rom_3_7680m <- (ds$roll_open_mean_3m/ds$roll_open_mean_7680m)
ds$rom_3_15360m <- (ds$roll_open_mean_3m/ds$roll_open_mean_15360m)
ds$rom_3_30720m <- (ds$roll_open_mean_3m/ds$roll_open_mean_30720m)
ds$rom_3_61440m <- (ds$roll_open_mean_3m/ds$roll_open_mean_61440m)
ds$rom_3_122880m <- (ds$roll_open_mean_3m/ds$roll_open_mean_122880m)
ds$rom_3_245760m <- (ds$roll_open_mean_3m/ds$roll_open_mean_245760m)
ds$rom_3_491520m <- (ds$roll_open_mean_3m/ds$roll_open_mean_491520m)
#5m
ds$rom_5_3m <- (ds$roll_open_mean_5m/ds$roll_open_mean_3m)
ds$rom_5_10m <- (ds$roll_open_mean_5m/ds$roll_open_mean_10m)
ds$rom_5_15m <- (ds$roll_open_mean_5m/ds$roll_open_mean_15m)
ds$rom_5_30m <- (ds$roll_open_mean_5m/ds$roll_open_mean_30m)
ds$rom_5_60m <- (ds$roll_open_mean_5m/ds$roll_open_mean_60m)
ds$rom_5_120m <- (ds$roll_open_mean_5m/ds$roll_open_mean_120m)
ds$rom_5_240m <- (ds$roll_open_mean_5m/ds$roll_open_mean_240m)
ds$rom_5_480m <- (ds$roll_open_mean_5m/ds$roll_open_mean_480m)
ds$rom_5_960m <- (ds$roll_open_mean_5m/ds$roll_open_mean_960m)
ds$rom_5_1920m <- (ds$roll_open_mean_5m/ds$roll_open_mean_1920m)
ds$rom_5_3840m <- (ds$roll_open_mean_5m/ds$roll_open_mean_3840m)
ds$rom_5_7680m <- (ds$roll_open_mean_5m/ds$roll_open_mean_7680m)
ds$rom_5_15360m <- (ds$roll_open_mean_5m/ds$roll_open_mean_15360m)
ds$rom_5_30720m <- (ds$roll_open_mean_5m/ds$roll_open_mean_30720m)
ds$rom_5_61440m <- (ds$roll_open_mean_5m/ds$roll_open_mean_61440m)
ds$rom_5_122880m <- (ds$roll_open_mean_5m/ds$roll_open_mean_122880m)
ds$rom_5_245760m <- (ds$roll_open_mean_5m/ds$roll_open_mean_245760m)
ds$rom_5_491520m <- (ds$roll_open_mean_5m/ds$roll_open_mean_491520m)
#10m
ds$rom_10_3m <- (ds$roll_open_mean_10m/ds$roll_open_mean_3m)
ds$rom_10_5m <- (ds$roll_open_mean_10m/ds$roll_open_mean_5m)
ds$rom_10_15m <- (ds$roll_open_mean_10m/ds$roll_open_mean_15m)
I have a data frame with 40+ variables with 6 million rows, I have attached a smaller example data frame below.
Thanks in advance!
Charlie
X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
1 57 77 48 8 31 43 47 13 26 88
2 25 75 86 77 4 65 5 49 31 57
3 91 90 42 69 82 33 56 99 47 39
4 35 96 86 77 67 77 20 17 77 92
5 6 100 50 62 16 31 0 39 72 4
6 90 34 74 89 71 37 73 45 24 28
7 24 22 92 13 57 97 32 2 12 80
8 74 59 49 2 97 100 15 37 15 67
9 43 38 66 97 8 20 85 25 97 67
10 82 4 56 40 42 46 44 98 98 76
11 60 68 92 99 81 92 78 59 23 81
12 22 57 37 100 7 1 89 41 40 56
13 69 13 1 82 89 45 83 24 71 29
14 8 14 66 48 94 8 20 3 28 63
15 26 70 56 62 9 34 11 86 71 64
16 7 55 15 100 91 89 46 74 98 14
17 29 68 19 66 83 29 84 76 90 45
18 27 76 6 48 17 28 8 7 52 37
19 68 58 51 75 60 57 74 46 98 93
20 15 15 89 55 23 3 3 8 32 37
21 78 49 57 48 96 89 4 95 67 58
22 12 36 42 59 27 92 48 0 92 28
23 51 17 77 61 84 53 46 22 27 36
24 40 84 83 35 19 13 80 78 96 87
25 44 80 25 72 43 17 74 70 52 36
26 14 61 63 82 16 47 32 93 19 84
27 93 19 28 62 74 1 85 65 50 9
28 80 62 6 58 48 97 97 18 65 43
29 12 58 95 79 37 89 89 83 22 85
30 57 73 22 88 99 63 58 87 90 66
As #27 ϕ 9 suggested in the comments you should use that lapply solution.
With this, you also create a unique dataframe with correct names
l <- lapply(df, `/`, df)
l <- unlist(l, recursive = FALSE)
data.frame(l)
This question already has answers here:
Split a vector into chunks
(22 answers)
Closed 3 years ago.
How to cut the values (1 to 100) in a regular interval (25) and place them into 4 groups as below:
sdr <- c(1:100)
Group1: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
Group2: 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
Group3: 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
Group4: 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
Any suggestion, please.
You could use split
sdr <- 1:100
split(sdr, rep(1:4, each = 25))
#$`1`
# [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
#
#$`2`
# [1] 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
#
#$`3`
# [1] 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
#
#$`4`
# [1] 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94
#[20] 95 96 97 98 99 100
This returns a list with 4 vector elements.
Also note that the c() around 1:100 is not necessary.
Or we can define the number of groups
ngroup <- 4
split(sdr, rep(1:ngroup, each = length(sdr) %/% ngroup))
giving the same result.
You can make a dataframe for your groups and then transpose using t:
df <- t(data.frame(Group1 = c(1:25), Group2 = c(26:50), Group3 = c(51:75), Group4 = c(76:100)))
Suppose I have a matrix or a data frame and I want only those values that are greater than 15 and no values between 85 and 90 both inclusive
a<-matrix(1:100,nrow = 10, ncol = 10)
rownames(a) <- LETTERS[1:10]
colnames(a) <- LETTERS[1:10]
A B C D E F G H I J
A 1 11 21 31 41 51 61 71 81 91
B 2 12 22 32 42 52 62 72 82 92
C 3 13 23 33 43 53 63 73 83 93
D 4 14 24 34 44 54 64 74 84 94
E 5 15 25 35 45 55 65 75 85 95
F 6 16 26 36 46 56 66 76 86 96
G 7 17 27 37 47 57 67 77 87 97
H 8 18 28 38 48 58 68 78 88 98
I 9 19 29 39 49 59 69 79 89 99
J 10 20 30 40 50 60 70 80 90 100
Note: You can convert it into dataframe if you know this kind of operation is possible in dataframe
Now I want My result in such a format that only those values that are greater than 5 and less than 85 retain and all else got deleted and replaced with blank space.
My desired out is like below
A B C D E F G H I J
A 11 21 31 41 51 61 71 81 91
B 12 22 32 42 52 62 72 82 92
C 13 23 33 43 53 63 73 83 93
D 14 24 34 44 54 64 74 84 94
E 5 15 25 35 45 55 65 75 85 95
F 6 16 26 36 46 56 66 76 96
G 7 17 27 37 47 57 67 77 97
H 8 18 28 38 48 58 68 78 98
I 9 19 29 39 49 59 69 79 99
J 10 20 30 40 50 60 70 80 100
Is there any kind of function in R which can take my condition and produce the desired result. I want to change code according to problem . I searched it over stack flow but didn't find something like this. I don't want to format based on rows or column.
I tried
a[a> 5 & a!=c(85:90)]
but this give me values and looses the structure.
Assuming that the 'a' is matrix, we can assign the values of 'a' %in% 86:90 or | less than 5 (a < 5) to NA. Here, I am not assigning it to '' as it will change the class from numeric to character. Also, assigning to NA would be useful for later processing.
a[a %in% 86:90 | a<5] <- NA
However, if we need it to be ''
a[a %in% 86:90 | a<5] <- ""
If we are using a data.frame
a1 <- as.data.frame(a)
a1[] <- lapply(a1, function(x) replace(x, x %in% 86:90| x <5, ""))
a1
# A B C D E F G H I J
#A 11 21 31 41 51 61 71 81 91
#B 12 22 32 42 52 62 72 82 92
#C 13 23 33 43 53 63 73 83 93
#D 14 24 34 44 54 64 74 84 94
#E 5 15 25 35 45 55 65 75 85 95
#F 6 16 26 36 46 56 66 76 96
#G 7 17 27 37 47 57 67 77 97
#H 8 18 28 38 48 58 68 78 98
#I 9 19 29 39 49 59 69 79 99
#J 10 20 30 40 50 60 70 80 100
NOTE: This changes the class of each column to character
In the OP's code, a!=c(85:90) will not work as intended as the 85:90 will recycle to the length of the 'a' and the comparison will be between the corresponding values in the recycled value and 'a'. Instead, we need to use %in% for a vector with length > 1.
I want to subset everything from df1 except df2.
df1<-
A B C D E F G H I J
80 16 55 74 89 39 4 67 36 87
69 49 91 83 50 1 77 19 73 43
85 45 97 9 47 65 79 81 86 66
37 58 17 38 76 14 54 78 62 98
12 25 56 20 31 82 34 23 33 11
df2<-
C D E F
55 74 89 39
91 83 50 1
97 9 47 65
17 38 76 14
56 20 31 82
I would like to utilise this kind of approach if possible:
mydata<-df1[,!colnames(df2)]
If you want the columns that are in df1, but not in df2, this can be done as such:
not_in_df2 <- setdiff(colnames(df1), colnames(df2))
subSet_df1 <- df1[,not_in_df2]
Or you could define not_in_df2 via
not_in_df2 <- !(colnames(df1) %in% colnames(df2))
I have the following vector in R. Think of them as a vector of numbers.
x = c(1,2,3,4,...100)
I want to randomize this vector "locally" based on some input number the "locality factor". For example if the locality factor is 3, then the first 3 elements are taken and randomized followed by the next 3 elements and so on. Is there an efficient way to do this? I know if I use sample, it would jumble up the whole array.
Thanks in advance
Arun didn't like how inefficient my other answer was, so here's something very fast just for him ;)
It requires just one call each to runif() and order(), and doesn't use sample() at all.
x <- 1:100
k <- 3
n <- length(x)
x[order(rep(seq_len(ceiling(n/k)), each=k, length.out=n) + runif(n))]
# [1] 3 1 2 6 5 4 8 9 7 11 12 10 13 14 15 18 16 17
# [19] 20 19 21 23 22 24 27 25 26 29 28 30 33 31 32 36 34 35
# [37] 37 38 39 40 41 42 43 44 45 47 48 46 51 49 50 52 54 53
# [55] 55 57 56 58 60 59 62 63 61 66 64 65 68 67 69 71 70 72
# [73] 75 74 73 76 77 78 81 80 79 84 82 83 86 85 87 89 88 90
# [91] 93 92 91 94 96 95 97 98 99 100
General solution:
Edit: As #MatthewLundberg comments, the issue I pointed out with "repeating numbers in x" can be easily overcome by working on seq_along(x), which would mean the resulting values will be indices. So, it'd be like so:
k <- 3
x <- c(2,2,1, 1,3,4, 4,6,5, 3)
x.s <- seq_along(x)
y <- sample(x.s)
x[unlist(split(y, (match(y, x.s)-1) %/% k), use.names = FALSE)]
# [1] 2 2 1 3 4 1 4 5 6 3
Old answer:
The bottleneck here is the amount of calls to function sample. And as long as your numbers don't repeat, I think you can do this with just one call to sample in this manner:
k <- 3
x <- 1:20
y <- sample(x)
unlist(split(y, (match(y,x)-1) %/% k), use.names = FALSE)
# [1] 1 3 2 5 6 4 8 9 7 12 10 11 13 14 15 17 16 18 19 20
To put everything together in a function (I like the name scramble from #Roland's):
scramble <- function(x, k=3) {
x.s <- seq_along(x)
y.s <- sample(x.s)
idx <- unlist(split(y.s, (match(y.s, x.s)-1) %/% k), use.names = FALSE)
x[idx]
}
scramble(x, 3)
# [1] 2 1 2 3 4 1 5 4 6 3
scramble(x, 3)
# [1] 1 2 2 1 4 3 6 5 4 3
To reduce the answer (and get it faster) even more, following #flodel's comment:
scramble <- function(x, k=3L) {
x.s <- seq_along(x)
y.s <- sample(x.s)
x[unlist(split(x.s[y.s], (y.s-1) %/% k), use.names = FALSE)]
}
For the record, the boot package (shipped with base R) includes a function permutation.array() that is used for just this purpose:
x <- 1:100
k <- 3
ii <- boot:::permutation.array(n = length(x),
R = 2,
strata = (seq_along(x) - 1) %/% k)[1,]
x[ii]
# [1] 2 1 3 6 5 4 9 7 8 12 11 10 15 13 14 16 18 17
# [19] 21 19 20 23 22 24 26 27 25 28 29 30 33 31 32 36 35 34
# [37] 38 39 37 41 40 42 43 44 45 46 47 48 51 50 49 53 52 54
# [55] 57 55 56 59 60 58 63 61 62 65 66 64 67 69 68 72 71 70
# [73] 75 73 74 76 77 78 79 80 81 82 83 84 86 87 85 89 88 90
# [91] 93 91 92 94 95 96 97 98 99 100
This will drop elements at the end (with a warning):
locality <- 3
x <- 1:100
c(apply(matrix(x, nrow=locality, ncol=length(x) %/% locality), 2, sample))
## [1] 1 2 3 4 6 5 8 9 7 12 10 11 13 15 14 16 18 17 19 20 21 22 24 23 26 25 27 28 30 29 32 33 31 35 34 36 38 39 37
## [40] 42 40 41 43 44 45 47 48 46 51 49 50 54 52 53 55 57 56 58 59 60 62 61 63 64 65 66 67 69 68 71 72 70 74 75 73 78 77 76
## [79] 80 81 79 83 82 84 87 85 86 88 89 90 92 93 91 96 94 95 99 98 97
v <- 1:16
scramble <- function(vec,n) {
res <- tapply(vec,(seq_along(vec)+n-1)%/%n,
FUN=function(x) x[sample.int(length(x), size=length(x))])
unname(unlist(res))
}
set.seed(42)
scramble(v,3)
#[1] 3 2 1 6 5 4 9 7 8 12 10 11 15 13 14 16
scramble(v,4)
#[1] 2 3 1 4 5 8 6 7 10 12 9 11 14 15 16 13
I like Matthew's approach way better but here was the way I did the problem:
x <- 1:100
fact <- 3
y <- ceiling(length(x)/fact)
unlist(lapply(split(x, rep(1:y, each =fact)[1:length(x)]), function(x){
if (length(x)==1) return(x)
sample(x)
}), use.names = FALSE)
## [1] 3 1 2 6 4 5 8 9 7 11 10 12 13 15 14 17 16 18
## [19] 20 21 19 24 23 22 26 27 25 29 30 28 31 32 33 35 34 36
## [37] 39 37 38 41 42 40 45 43 44 47 46 48 51 49 50 52 53 54
## [55] 57 56 55 59 60 58 63 62 61 64 66 65 67 68 69 70 71 72
## [73] 75 73 74 77 76 78 80 79 81 82 84 83 85 86 87 90 89 88
## [91] 92 91 93 96 94 95 98 99 97 100