Substring each element of the data frame in R - r

I would like to use this code, which I found here: Generate N random integers that sum to M in R. I have a example data frame and I would like to use this function for each Value in data frame.
rand_vect <- function(N, M, sd = 1, pos.only = TRUE) {
vec <- rnorm(N, M/N, sd)
if (abs(sum(vec)) < 0.01) vec <- vec + 1
vec <- round(vec / sum(vec) * M)
deviation <- M - sum(vec)
for (. in seq_len(abs(deviation))) {
vec[i] <- vec[i <- sample(N, 1)] + sign(deviation)
}
if (pos.only) while (any(vec < 0)) {
negs <- vec < 0
pos <- vec > 0
vec[negs][i] <- vec[negs][i <- sample(sum(negs), 1)] + 1
vec[pos][i] <- vec[pos ][i <- sample(sum(pos ), 1)] - 1
}
vec
}
abc <- data.frame(Product = c("A", "B", "C"),
Value =c(33, 23, 12))

You can vectorize your function rand_vect for receiving a array of values
vrand_vect <- Vectorize(rand_vect,"M",SIMPLIFY = F)
which gives
res <- vrand_vect(N= 7, M = abc$Value)
> res
[[1]]
[1] 3 5 6 6 4 3 6
[[2]]
[1] 1 3 4 5 3 4 3
[[3]]
[1] 1 2 3 3 1 2 0

You can use sapply to run rand_vect with N = 7
sapply(abc$Value, rand_vect, N = 7)
# [,1] [,2] [,3]
#[1,] 4 4 0
#[2,] 5 2 2
#[3,] 5 4 2
#[4,] 4 3 3
#[5,] 5 5 3
#[6,] 6 2 1
#[7,] 4 3 1
This will give 7 random numbers which will sum upto Value.
We can verify it by
colSums(sapply(abc$Value, rand_vect, N = 7))
#[1] 33 23 12

Related

How to calculate the mean for every n vectors from a df

How to calculate the mean for every n vectors from a df creating a new data frame with the results.
I expect to get:
column 1: mean (V1,V2),
column 2: mean (V3,V4),
column 3: mean (V5,V6)
,and so forth
data
df <- data.frame(v1=1:6,V2=7:12,V3=13:18,v4=19:24,v5=25:30,v6=31:36)
Here is base R option
n <- 2 # Mean across every n = 2 columns
do.call(cbind, lapply(seq(1, ncol(df), by = n), function(idx) rowMeans(df[c(idx, idx + 1)])))
# [,1] [,2] [,3]
#[1,] 4 16 28
#[2,] 5 17 29
#[3,] 6 18 30
#[4,] 7 19 31
#[5,] 8 20 32
#[6,] 9 21 33
This returns a matrix rather than a data.frame (which makes more sense here since you're dealing with "all-numeric" data).
Explanation: The idea is a non-overlapping sliding window approach. seq(1, ncol(df), by = n) creates the start indices of the columns (here: 1, 3, 5). We then loop over those indices idx and calculate the row means of df[c(idx, idx + 1)]. This returns a list which we then cbind into a matrix.
As a minor modifcation, you can also predefine a data.frame with the right dimensions and then skip the do.call(cbind, ...) step by having R do an implicit list to data.frame typecast.
out <- data.frame(matrix(NA, ncol = ncol(df) / 2, nrow = nrow(df)))
out[] <- lapply(seq(1, ncol(df), by = n), function(idx) rowMeans(df[c(idx, idx + 1)]))
# X1 X2 X3
#1 4 16 28
#2 5 17 29
#3 6 18 30
#4 7 19 31
#5 8 20 32
#6 9 21 33
You may try,
dummy <- data.frame(
v1 = c(1:10),
v2 = c(1:10),
v3 = c(1:10),
v4 = c(1:10),
v5 = c(1:10),
v6 = c(1:10)
)
nvec_mean <- function(df, n){
res <- c()
m <- matrix(1:ncol(df), ncol = n, byrow = T)
if (ncol(df) %% n != 0){
stop()
}
for (i in 1:nrow(m)){
v <- rowMeans(df[,m[i,]])
res <- cbind(res, v)
}
colnames(res) <- c(1:nrow(m))
res
}
nvec_mean(dummy,3)
1 2
[1,] 1 1
[2,] 2 2
[3,] 3 3
[4,] 4 4
[5,] 5 5
[6,] 6 6
[7,] 7 7
[8,] 8 8
[9,] 9 9
[10,] 10 10
If you didn't want rowMeans or result is not what you wanted, please let me know.
Simple(?) version
df <- data.frame(v1=1:6,V2=7:12,V3=13:18,v4=19:24,v5=25:30,v6=31:36)
n = 2
res <- c()
m <- matrix(1:ncol(df), ncol = 2, byrow = T)
for (i in 1:nrow(m)){
v <- rowMeans(df[,m[i,]])
res <- cbind(res, v)
}
res
v v v
[1,] 4 16 28
[2,] 5 17 29
[3,] 6 18 30
[4,] 7 19 31
[5,] 8 20 32
[6,] 9 21 33

Combinations with restrictions in R

How to get all combinations(9,2) with the following restrictions:
n1 > 2,
n2 < 7,
n1 + n2 < 11.
There are 5 combinations in result: 3 4, 3 5, 3 6, 4 5, 4 6?
Or without any apply function
comb <- t(combn(9,2))
comb[comb[, 1] > 2 & comb[, 2] < 7 & comb[, 1] + comb[, 2] < 11, ]
I did this:
library(gtools)
a= combinations(9,2)
a =apply(a, 1, function(i){
if(i[1]>2&i[2]<7&(i[1]+i[2])<11){return(i)}
})
a= matrix(unlist(a), ncol = 2,byrow = T)
the output:
>a
[,1] [,2]
[1,] 3 4
[2,] 3 5
[3,] 3 6
[4,] 4 5
[5,] 4 6
Let me know if it was what you wanted
To find all those combinations I use combn which comes as standard with R and does not need any additional packages.
For filtering I use apply with an anonymous filtering function and which to find which combinations fullfill the filter criterion:
comb <- combn(9,2)
n <- apply(comb, 2, \(x) x[1] > 2 & x[2] < 7 & sum(x) < 11) |>
which()
comb[, n]
will yield
> comb[, n]
[,1] [,2] [,3] [,4] [,5]
[1,] 3 3 3 4 4
[2,] 4 5 6 5 6
Try the code below
Filter(
length,
combn(9,
2,
FUN = function(x) if (x[1] > 2 & x[2] < 7 & sum(x) < 11) list(x) else list(NULL)
)
)
which gives
[[1]]
[1] 3 4
[[2]]
[1] 3 5
[[3]]
[1] 3 6
[[4]]
[1] 4 5
[[5]]
[1] 4 6

How to understand a specific function in R

Here I have a simple function in R below it is:
no.dimnames <- function(a) {
## Remove all dimension names from an array for compact printing.
d <- list()
l <- 0
for(i in dim(a)) {
d[[l <- l + 1]] <- rep("", i)
}
dimnames(a) <- d
a
}
The goal for this function is to drop all array names. However, I dont know what does the following indexing do.
d[[l <- l + 1]]
In this case, d is a null list initially, and l<- 0 so then d[[0<- 1]] implies what?
> x <- matrix(sample(1:5,20,replace=TRUE),nrow = 5)
> x
[,1] [,2] [,3] [,4]
[1,] 5 4 5 3
[2,] 2 1 5 1
[3,] 1 3 4 4
[4,] 3 1 4 3
[5,] 5 3 5 5
> no.dimnames(x)
5 4 5 3
2 1 5 1
1 3 4 4
3 1 4 3
5 3 5 5
It looks like you understand the increment code d[[l <- l + 1]] but are still asking about the empty spaces rep("", i). They are replacing the dimension names with blanks. The i is used to indicate the amount of spaces that are needed.
If we had a 4x5 matrix. We would have four row names and five column names. To make them all blank, we would need four spaces in rows rep("", 4) and five in columns rep("", 5). The code aims to accomplish that:
mat <- matrix(1:20, 4,5)
rownames(mat) <- month.abb[1:4]
colnames(mat) <- letters[1:5]
mat
# a b c d e
# Jan 1 5 9 13 17
# Feb 2 6 10 14 18
# Mar 3 7 11 15 19
# Apr 4 8 12 16 20
dimnames(mat)
# [[1]]
# [1] "Jan" "Feb" "Mar" "Apr"
#
# [[2]]
# [1] "a" "b" "c" "d" "e"
#What we need
list(rep("", 4), rep("", 5))
# [[1]]
# [1] "" "" "" ""
#
# [[2]]
# [1] "" "" "" "" ""
dimnames(mat) <- list(rep("", 4), rep("", 5))
mat
#
# 1 5 9 13 17
# 2 6 10 14 18
# 3 7 11 15 19
# 4 8 12 16 20
d[[0<- 1]] isn't valid... you are saying set 0 to 1 which can't be done. In this case l is being set to l + 1 where it is initially 0 so it is l <- 0 + 1
Forget about what a is or what it could be.
Just type this into Rstudio or w/e you are using you will see what happens if you check each variable.
> d <- list()
> l <- 0
> d[[l <- l + 1]] <- rep("", 1)
> d
> l
The one part I should explain is in this case when you type the d[[l <- l + 1]] it is assigning l + 1 to l and then using l as the parameter for the [[]].
So the d[[l <- l + 1]] breaks down to this...
l <- l + 1
l <- 0 + 1
l <- 1
[l]
[[l]]
d[[l]]
d[[1]]

Averaging column values of a matrix in R

I have a matrix of 7000 rows x 160 columns, i want to take the average of 20 values in each row to make it 1 value i.e. column avg(1:20) = 1st new value, avg(21:40) = 2nd new value ..... avg(141:160) 8th and last new value for row 1, will do the same foe all rows, at the end my matrix will be 7000 x 8 ie. 160/20 = 8. what is the fastest way to archive this in R?
eg. at the end
1st 2 4 5 6 7 7 9 4
2nd 3 6 5 3 6 7 4 3
...............
7000th 5 6 7 4 5 6 7 6
i tried this, it works but too slow!
res <- matrix(T, nrow = 7000, ncol = 8)
for (i in 1:nrow(m)){
s <- 0
k <- 1
for (j in 1:ncol(m)){
s <- s + m[i,j]
if(j %% 20 == 0){
a <- s/20
res[i,k] <- a
k <- k + 1
s <- 0
}
}
}
Thank you.
# example - you have this already
set.seed(1) # for reproducible example
M <- matrix(rnorm(7000*160),nc=160)
# you start here...
indx <- rep(1:8,each=20)
result <- sapply(1:8,function(i)rowMeans(M[,which(indx==i)]))
head(result)
# [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
# [1,] 0.2127915 -0.38038950 -0.087656347 0.05933375 -0.23819112 0.03943897 -0.008970226 0.03841767
# [2,] 0.3548025 0.31491967 0.144773998 -0.05972595 -0.17191220 0.04243383 0.047314127 -0.16848104
# [3,] -0.2559990 0.35942642 0.003344486 0.23424747 0.09022379 0.58685507 -0.157652263 -0.25611335
# [4,] 0.3723693 0.23901787 -0.304657019 0.41620451 0.26005406 0.09726225 -0.434833656 0.07112657
# [5,] 0.4457805 0.08682639 0.048011727 0.15753612 0.30271061 -0.05484104 -0.103921787 -0.12066903
# [6,] -0.2823111 -0.00243217 0.055399402 0.31365508 0.17940294 0.26896135 -0.439110424 -0.30403590
Another way using rowsum and jlhoward's data ("M"):
n = 20
ans = t(rowsum(t(M), rep(seq_len(ncol(M) / n), each = n))) / n
head(ans)

Generalize R %in% operator to match tuples

I spent a while the other day looking for a way to check if a row vector is contained in some set of row vectors in R. Basically, I want to generalize the %in% operator to match a tuple instead of each entry in a vector. For example, I want:
row.vec = c("A", 3)
row.vec
# [1] "A" "3"
data.set = rbind(c("A",1),c("B",3),c("C",2))
data.set
# [,1] [,2]
# [1,] "A" "1"
# [2,] "B" "3"
# [3,] "C" "2"
row.vec %tuple.in% data.set
# [1] FALSE
for my made-up operator %tuple.in% because the row vector c("A",3) is not a row vector in data.set. Using the %in% operator gives:
row.vec %in% data.set
# [1] TRUE TRUE
because "A" and 3 are in data.set, which is not what I want.
I have two questions. First, are there any good existing solutions to this?
Second, since I couldn't find them (even if they exist), I tried to write my own function to do it. It works for an input matrix of row vectors, but I'm wondering if any experts have proposed improvements:
is.tuple.in <- function(matrix1, matrix2){
# Apply rbind() so that matrix1 has columns even if it is a row vector.
matrix1 = rbind(matrix1)
if(ncol(matrix1) != ncol(matrix2)){
stop("Matrices must have the same number of columns.") }
# Now check for the first row and handle other rows recursively
row.vec = matrix1[1,]
tuple.found = FALSE
for(i in 1:nrow(matrix2)){
# If we find a match, then this row exists in matrix 2 and we can break the loop
if(all(row.vec == matrix2[i,])){
tuple.found = TRUE
break
}
}
# If there are more rows to be checked, use a recursive call
if(nrow(matrix1) > 1){
return(c(tuple.found, is.tuple.in(matrix1[2:nrow(matrix1),],matrix2)))
} else {
return(tuple.found)
}
}
I see a couple problems with that that I'm not sure how to fix. First, I'd like the base case to be clear at the start of the function. I didn't manage to do this because I pass matrix1[2:nrow(matrix1),] in the recursive call, which produces an error if matrix1 has one row. So instead of getting to a case where matrix1 is empty, I have an if condition at the end deciding if more iterations are necessary.
Second, I think the use of rbind() at the start is sloppy, but I needed it for when matrix1 had been reduced to a single row. Without using rbind(), ncol(matrix1) produced an error in the 1-row case. I figure my trouble here has to do with a lack of knowledge about R data types.
Any help would be appreciated.
I'm wondering if you have made this a bit more complicated than it is. For example,
set.seed(1618)
vec <- c(1,3)
mat <- matrix(rpois(1000,3), ncol = 2)
rownames(mat) <- 1:nrow(mat)
mat[sapply(1:nrow(mat), function(x) all(vec %in% mat[x, ])), ]
# gives me this
# [,1] [,2]
# 6 3 1
# 38 3 1
# 39 3 1
# 85 1 3
# 88 1 3
# 89 1 3
# 95 3 1
# 113 1 3
# ...
you could subset this further if you care about the order
or you could modify the function slightly:
mat[sapply(1:nrow(mat), function(x)
all(paste(vec, collapse = '') %in% paste(mat[x, ], collapse = ''))), ]
# [,1] [,2]
# 85 1 3
# 88 1 3
# 89 1 3
# 113 1 3
# 133 1 3
# 139 1 3
# 187 1 3
# ...
another example with a longer vector
set.seed(1618)
vec <- c(1,4,5,2)
mat <- matrix(rpois(10000, 3), ncol = 4)
rownames(mat) <- 1:nrow(mat)
mat[sapply(1:nrow(mat), function(x) all(vec %in% mat[x, ])), ]
# [,1] [,2] [,3] [,4]
# 57 2 5 1 4
# 147 1 5 2 4
# 279 1 2 5 4
# 303 1 5 2 4
# 437 1 5 4 2
# 443 1 4 5 2
# 580 5 4 2 1
# ...
I see a couple that match:
mat[sapply(1:nrow(mat), function(x)
all(paste(vec, collapse = '') %in% paste(mat[x, ], collapse = ''))), ]
# [,1] [,2] [,3] [,4]
# 443 1 4 5 2
# 901 1 4 5 2
# 1047 1 4 5 2
but only three
for your single row case:
vec <- c(1,4,5,2)
mat <- matrix(c(1,4,5,2), ncol = 4)
rownames(mat) <- 1:nrow(mat)
mat[sapply(1:nrow(mat), function(x)
all(paste(vec, collapse = '') %in% paste(mat[x, ], collapse = ''))), ]
# [1] 1 4 5 2
here is a simple function with the above code
is.tuplein <- function(vec, mat, exact = TRUE) {
rownames(mat) <- 1:nrow(mat)
if (exact)
tmp <- mat[sapply(1:nrow(mat), function(x)
all(paste(vec, collapse = '') %in% paste(mat[x, ], collapse = ''))), ]
else tmp <- mat[sapply(1:nrow(mat), function(x) all(vec %in% mat[x, ])), ]
return(tmp)
}
is.tuplein(vec = vec, mat = mat)
# [1] 1 4 5 2
seems to work, so let's make our own %in% operator:
`%tuple%` <- function(x, y) is.tuplein(vec = x, mat = y, exact = TRUE)
`%tuple1%` <- function(x, y) is.tuplein(vec = x, mat = y, exact = FALSE)
and try her out
set.seed(1618)
c(1,2,3) %tuple% matrix(rpois(1002,3), ncol = 3)
# [,1] [,2] [,3]
# 133 1 2 3
# 190 1 2 3
# 321 1 2 3
set.seed(1618)
c(1,2,3) %tuple1% matrix(rpois(1002,3), ncol = 3)
# [,1] [,2] [,3]
# 48 2 3 1
# 64 2 3 1
# 71 1 3 2
# 73 3 1 2
# 108 3 1 2
# 112 1 3 2
# 133 1 2 3
# 166 2 1 3
Does this do what you want (even for more than 2 columns)?
paste(row.vec,collapse="_") %in% apply(data.set,1,paste,collapse="_")

Resources