Having a data frame like this:
df <- data.frame(a=c(31, 18, 0, 1, 20, 2),
b=c(1, 0, 0, 3, 1, 1),
c=c(12, 0, 9, 8, 10, 3))
> df
a b c
1 31 1 12
2 18 0 0
3 0 0 9
4 1 3 8
5 20 1 10
6 2 1 3
How can I do a random subset so the sum of rows and columns is equal to a value, i.e , 100?
As I understand your question, you're trying to sample a subset of the rows and columns of your matrix so that they sum to a target value.
You can use integer optimization to accomplish this. You'll have a binary decision variable for each row, column, and cell, and constraints to force the cell values to be equal to the product of the row and column values. I'll use the lpSolve package to do this, because it has a convenient mechanism to get multiple optimal solutions. We can then use the sample function to select between them:
library(lpSolve)
get.subset <- function(dat, target) {
nr <- nrow(dat)
nc <- ncol(dat)
nvar <- nr + nc + nr*nc
# Cells upper bounded by row and column variable values (r and c) and lower bounded by r+c-1
mat <- as.matrix(do.call(rbind, apply(expand.grid(seq(nr), seq(nc)), 1, function(x) {
r <- x[1]
c <- x[2]
pos <- nr + nc + (r-1)*nc + c
ltc <- rep(0, nvar)
ltc[nr + c] <- 1
ltc[pos] <- -1
ltr <- rep(0, nvar)
ltr[r] <- 1
ltr[pos] <- -1
gtrc <- rep(0, nvar)
gtrc[nr + c] <- 1
gtrc[r] <- 1
gtrc[pos] <- -1
return(as.data.frame(rbind(ltc, ltr, gtrc)))
})))
dir <- rep(c(">=", ">=", "<="), nr*nc)
rhs <- rep(c(0, 0, 1), nr*nc)
# Sum of selected cells equals target
mat <- rbind(mat, c(rep(0, nr+nc), as.vector(t(dat))))
dir <- c(dir, "=")
rhs <- c(rhs, target)
res <- lp(objective.in=rep(0, nvar), # Feasibility problem
const.mat=mat,
const.dir=dir,
const.rhs=rhs,
all.bin=TRUE,
num.bin.solns=100 # Number of feasible solutions to get
)
if (res$status != 0) {
return(list(rows=NA, cols=NA, subset=NA, num.sol=0))
}
sol.num <- sample(res$num.bin.solns, 1)
vals <- res$solution[seq((sol.num-1)*nvar+1, sol.num*nvar)]
rows <- which(vals[seq(nr)] >= 0.999)
cols <- which(vals[seq(nr+1, nr+nc)] >= 0.999)
return(list(rows=rows, cols=cols, subset=dat[rows,cols], num.sol=res$num.bin.solns))
}
The function returns the number of subset with that sum and returns the randomly selected subset:
set.seed(144)
get.subset(df, 1)
# $rows
# [1] 1
# $cols
# [1] 2
# $subset
# [1] 1
# $num.sol
# [1] 14
get.subset(df, 100)
# $rows
# [1] 1 2 4 5
# $cols
# [1] 1 3
# $subset
# a c
# 1 31 12
# 2 18 0
# 4 1 8
# 5 20 10
# $num.sol
# [1] 2
get.subset(df, 10000)
# $rows
# [1] NA
# $cols
# [1] NA
# $subset
# [1] NA
# $num.sol
# [1] 0
Related
R data frame 1 :
Index
Powervalue
0
1
1
2
2
4
3
8
4
16
5
32
R dataframe 2 :
CombinedValue
20
50
Expected Final Result :
Can we get the output as in the image. If yes please help.
One of stackoverflow mate provided below code. Am looking how to seperate , values as columns with 1 and 0.
df <- data.frame(sum = c(50, 20, 6))
values_list <- list()
for (i in 1:nrow(df)) {
sum <- df$sum[i]
values <- c()
while (sum > 0) {
value <- 2^floor(log2(sum))
values <- c(values, value)
sum <- sum - value
}
values_list[[i]] <- values
}
df$values <- values_list
Can we fix columns till power 31 as shown in attached image. The columns match with possiblecodes then place 1 and 0 else 0 for the remaining columns. Please help.
Here is a function whose output matches the expected output.
toCodes <- function(x) {
n <- floor(log2(x))
pow <- rev(seq.int(max(n)))
# 'y' is the matrix of codes
y <- t(sapply(x, \(.x) (.x %/% 2^pow) %% 2L))
i_cols <- apply(y, 2, \(.y) any(.y != 0L))
colnames(y) <- sprintf("code_%d", 2^pow)
#
possiblecodes <- apply(y, 1, \(p) {
codes <- 2^pow[as.logical(p)]
paste(rev(codes), collapse = ",")
})
data.frame(combinedvalue = x, possiblecodes, y[, i_cols])
}
x <- c(20L, 50L)
toCodes(x)
#> combinedvalue possiblecodes code_32 code_16 code_4 code_2
#> 1 20 4,16 0 1 1 0
#> 2 50 2,16,32 1 1 0 1
Created on 2022-12-19 with reprex v2.0.2
appreciate your guidance as im new to R programme. basically i've created a function to check whether the value is even or odd.
i wish to create a new result column whereby 'even' results in the original value * 2, and 'odd' results in the original value - 5.
not sure where i've gone wrong with the second part of the code but i am trying to figure out where can i include my 'check' column in the second function to specify it should be checked for even or odd.
i only learnt about ifelse(check(df$check) but it doesnt seem to work in my instance.
much appreciated!
## print 'odd' or 'even' results in df
check = function(df,col){
df['check'] =
ifelse(df[,col] %% 2 ==0, 'even', 'odd')
return(df)
}
# multiplication and subtraction for odd_even results
checkresult = function(df,col){
df['res'] =
ifelse(check(df) == 'even', df[,col] * 2, df[,col]-5)
return(df)
}
checkresult(df)
The simplest way to do it is by not implementing a new function, just use ifelse() as it was intended:
set.seed(100)
df <- data.frame(x = sample(1:15, size = 100, replace = T))
df$res <- ifelse(df$x %% 2 == 0, df$x * 2, df$x - 5)
df |> head()
#> x res
#> 1 10 20
#> 2 7 2
#> 3 6 12
#> 4 3 -2
#> 5 9 4
#> 6 10 20
If you need to implement a function that returns a dataframe:
set.seed(100)
df <- data.frame(x = sample(1:15, size = 100, replace = T))
my_func <- function(dframe, column){
vec <- dframe[,column]
dframe$res <- ifelse(vec %% 2 == 0, vec * 2, vec - 5)
return(dframe)
}
my_func(df, "x") |> head()
#> x res
#> 1 10 20
#> 2 7 2
#> 3 6 12
#> 4 3 -2
#> 5 9 4
#> 6 10 20
Edit 1
If you want the parity of the number, one extra line is required:
set.seed(100)
df <- data.frame(x = sample(1:15, size = 100, replace = T))
my_func <- function(dframe, column){
vec <- dframe[,column]
dframe$parity <- ifelse(vec %% 2 == 0, "EVEN", "ODD")
dframe$res <- ifelse(vec %% 2 == 0, vec * 2, vec - 5)
return(dframe)
}
my_func(df, "x") |> head()
#> x parity res
#> 1 10 EVEN 20
#> 2 7 ODD 2
#> 3 6 EVEN 12
#> 4 3 ODD -2
#> 5 9 ODD 4
#> 6 10 EVEN 20
Problem Statement
Let's say you have the following data:
df <- data.frame(x = rep(0, 10),
batch = rep(1:3,c(4,2,4)))
x batch
1 0 1
2 0 1
3 0 1
4 0 1
5 0 2
6 0 2
7 0 3
8 0 3
9 0 3
10 0 3
You want to loop over the number of unique batches in your dataset and within each batch, apply an algorithm to generate a vector of 1's and 0's. The algorithm is quite long, so for example's sake, let's say it's a random sample:
set.seed(2021)
for(i in seq_len(length(unique(df$batch)))){
batch_val <- d[which(df$batch == i),]$batch
#some algorithm to generate 1's and 0's, but using sample() here
out_x <- sample(c(0,1), length(batch_val), replace = T)
}
You then want to save out_x into the correct indices in df$x. My current rudimentary approach is to explicitly specify indices:
idxb <- 1
idxe <- length(df[which(df$batch == 1),]$batch)
set.seed(2021)
for(i in seq_len(length(unique(df$batch)))){
batch_val <- d[which(df$batch == i),]$batch
#some algorithm to generate 1's and 0's, but using sample() here
out_x <- sample(c(0,1), length(batch_val), replace = T)
print(out_x)
#save output
df$x[idxb:idxe] <- out_x
#update indices
idxb <- idxb + length(out_X)
if(i < length(unique(df$batch))) {
idxe <- idxe + length(df[which(df$batch == i+1),]$batch)
}
}
Output
The result should look like this:
x batch
1 0 1
2 1 1
3 1 1
4 0 1
5 1 2
6 1 2
7 1 3
8 0 3
9 1 3
10 1 3
where each iteration of out_x looks like this:
[1] 0 1 1 0
[1] 1 1
[1] 1 0 1 1
Question
What is a faster way to implement this while still using base R?
What about using tapply?
out_x <- tapply(df$batch, df$batch, function(x) sample(c(0,1), length(x), replace = T))
#------
$`1`
[1] 0 1 1 1
$`2`
[1] 0 1
$`3`
[1] 1 1 1 1
And then to reassign to df
df$x <- unlist(out_x)
A timing test:
microbenchmark::microbenchmark(f_loop(), f_apply())
#---------
Unit: microseconds
expr min lq mean median uq max neval
f_loop() 399.895 425.1975 442.7077 437.754 450.690 612.969 100
f_apply() 100.449 106.9185 160.5557 110.913 114.909 4867.603 100
Where the functions are defined as
f_loop <- function(){
idxb <- 1
idxe <- length(df[which(df$batch == 1),]$batch)
for(i in seq_len(length(unique(df$batch)))){
batch_val <- df[which(df$batch == i),]$batch
#some algorithm to generate 1's and 0's, but using sample() here
out_x <- sample(c(0,1), length(batch_val), replace = T)
#print(out_x)
#save output
df$x[idxb:idxe] <- out_x
#update indices
idxb <- idxb + length(out_x)
if(i < length(unique(df$batch))) {
idxe <- idxe + length(df[which(df$batch == i+1),]$batch)
}
}
return(df$x)
}
f_apply <- function() {
unlist(tapply(df$batch, df$batch, function(x) sample(c(0,1), length(x), replace = T)))
}
One solution is to remind myself that I can index a vector with a vector!
set.seed(2021)
for(i in seq_len(length(unique(df$batch)))){
batch_val <- d[which(df$batch == i),]$batch
#some algorithm to generate 1's and 0's, but using sample() here
out_x <- sample(c(0,1), length(batch_val), replace = T)
print(out_x)
#save output
idx <- which(df$batch == i)
df$x[idx] <- out_x
}
I'm looking for a way to identify a growing season which consists of a number of days greater than say 60 between the last frost day of spring and the first frost day in the fall. A general version of this problem is this. If I have a vector of numbers like testVec, I want the item numbers of the beginning and end range of values where the number of items is 5 or greater and all of them are greater than 0.
testVec <- c(1,3,4,0, 1, -5, 6, 0, 1,3,4,6,7,5,9, 0)
In this example, the relevant range is 1,3,4,6,7,5,9 which is testVec[9] to testVec[15]
One option could be:
testVec[with(rle(testVec > 0), rep(lengths * values >= 5, lengths))]
[1] 1 3 4 6 7 5 9
Here, the idea is to, first, create runs of values that are smaller or equal to zero and bigger than zero. Second, it checks whether the runs of values bigger than zero are of length 5 or more. Finally, it subsets the original vector for the runs of values bigger than zero with length 5 or more.
1) rleid This also handles any number of sequences including zero. rleid(ok) is a vector the same length as ok such that the first run of identical elements is replaced with 1, the second run with 2 and so on. The result is a list of vectors where each vector has its positions in the original input as its names.
library(data.table)
getSeq <- function(x) {
names(x) <- seq_along(x)
ok <- x > 0
s <- split(x[ok], rleid(ok)[ok])
unname(s)[lengths(s) >= 5]
}
getSeq(testVec)
## [[1]]
## 9 10 11 12 13 14 15
## 1 3 4 6 7 5 9
getSeq(numeric(16))
## list()
getSeq(c(testVec, 10 * testVec))
## [[1]]
## 9 10 11 12 13 14 15
## 1 3 4 6 7 5 9
##
## [[2]]
## 25 26 27 28 29 30 31
## 10 30 40 60 70 50 90
If a data frame were desired then following gives the values and which sequence the row came from. The row names indicate the positions in the original input.
gs <- getSeq(c(testVec, 10 * testVec))
names(gs) <- seq_along(gs)
if (length(gs)) stack(gs) else gs
## values ind
## 9 1 1
## 10 3 1
## 11 4 1
## 12 6 1
## 13 7 1
## 14 5 1
## 15 9 1
## 25 10 2
## 26 30 2
## 27 40 2
## 28 60 2
## 29 70 2
## 30 50 2
## 31 90 2
2) gregexpr Replace each element that is > 0 with 1 and each other element with 0 pasting the 0's and 1's into a single character string. Then use gregexpr to look for sequences of 1's at least 5 long and for the ith such nonoverlapping sequence return the first positions, g, and lengths, attr(g, "match.length"). Define a function vals which extracts the values at the required positions from testVec of the ith such nonoverlapping sequence returning a list such that the ith component of the list is the ith such sequence. The names in the output vector are its positions in the input.
getSeq2 <- function(x) {
g <- gregexpr("1{5,}", paste(+(x > 0), collapse = ""))[[1]]
vals <- function(i) {
ix <- seq(g[i], length = attr(g, "match.length")[i])
setNames(x[ix], ix)
}
if (length(g) == 1 && g == -1) list() else lapply(seq_along(g), vals)
}
getSeq2(testVec)
## [[1]]
## 9 10 11 12 13 14 15
## 1 3 4 6 7 5 9
The above handles any number of sequences including 0 but if we knew there were exactly one sequence (which is the case for the example in the question) then it could be simplified to the following where the return value is just that vector:
g <- gregexpr("1{5,}", paste(+(testVec > 0), collapse = ""))[[1]]
ix <- seq(g, length = attr(g, "match.length"))
setNames(testVec[ix], ix)
## 9 10 11 12 13 14 15
## 1 3 4 6 7 5 9
You could "fix" #tmfmnk's solution like this:
f1 <- function(x, threshold, n) {
range(which(with(rle(x > threshold), rep(lengths * values >= n, lengths))))
}
x <- c(1, 3, 4, 0, 1, -5, 6, 0, 1,3,4,6,7,5,9, 0)
f1(x, 0, 5)
#[1] 9 15
But that does not work well when there are multiple runs
xx <- c(x, x)
f1(xx, 0, 5)
#[1] 9 31
Here is another, not so concise approach that returns the start and end of the longest run (the first one if there are ties).
f2 <- function(x, threshold, n) {
y <- x > threshold
y[is.na(y)] <- FALSE
a <- ave(y, cumsum(!y), FUN=cumsum)
m <- max(a)
if (m < n) return (c(NA, NA))
i <- which(a == m)[1]
c(i-m+1, i)
}
f2(x, 0, 5)
#[1] 9 15
f2(xx, 0, 5)
#[1] 9 15
or with rle
f3 <- function(x, threshold, n) {
y <- x > threshold
r <- rle(y)
m <- max(r$lengths)
if (m < n) return (c(NA, NA))
i <- sum(r$lengths[1:which.max(r$lengths)[1]])
c(i-max(r$lengths)+1, i)
}
f3(x, 0, 5)
#[1] 9 15
f3(xx, 0, 5)
#[1] 9 15
If you wanted the first run that is at least n, that is you do not want a next run, even if it is longer, you could do
f4 <- function(x, threshold, n) {
y <- with(rle(x > threshold), rep(lengths * values >= n, lengths))
i <- which(y)[1]
j <- i + which(!y[-c(1:i)])[1] - 1
c(i, j)
}
I'm trying to create a random data set in R that has metric, binomial and categorical variables. However, in the end when I check the class of my categorical variables R says they are numeric, but I need them to be factors for my further analysis. Does anybody have an idea what I'm doing wrong here?
that's my code:
set.seed(3456)
R.dat <- function(n = 5000,metr=1,bin=1,cat=3) {
j <- metr
X <- (matrix(0,n,j))
for (i in 1:n) {
X[i,] <- rnorm(j, mean = 0, sd = 1)
}
BIN <- matrix(0,n,bin)
for (i in 1:bin) {
BIN[,i] <- rbinom(n,1, 0.5)
}
CAT <- matrix(0,n,cat)
for (i in 1:cat) {
CAT[,i] <- factor(sample(1:4, n, TRUE))
}
X <- as.data.frame(cbind(X,BIN, CAT))
return(X)
}
Dat <- R.dat(n=5000,metr=1,bin=1, cat=3)
summary(Dat)
If I just sample like this:
x <- factor(sample(1:4, n, TRUE))
class(x)
it says x is a factor, so I don't get why it doesn't do the same when I use it in the function and loop...any help is much apprecciated, thanks in advance!
When you do this:
CAT <- matrix(0,n,cat)
for (i in 1:cat) {
CAT[,i] <- factor(sample(1:4, n, TRUE))
}
you create a numeric matrix CAT, and then you assign a new value to a subset of that matrix. When you do that assignment, the new value is coerced to the type of CAT, which is numeric.
Also, when you cbind the matrices X, BIN and CAT at the end, you coerce all of them to a common type. This would again mess up your variable types, even assuming everything was working correctly up to this point.
The rest of your code can also be simplified considerably. In particular, you don't need looping to reassign values to matrices; you can call the matrix constructor function directly on a vector of values.
Try this instead:
R.dat <- function(n=5000, metr=1, bin=1, cat=3)
{
X <- matrix(rnorm(n * metr), nrow=n)
B <- matrix(rbinom(n * bin, 1, 0.5), nrow=n)
F <- matrix(as.character(sample(1:4, n * cat, TRUE)), nrow=n)
data.frame(X=X, B=B, F=F)
}
You don't need a loop, If you switch to data.table, you can generate them by reference.
library(data.table)
n <- 10
bin <- 1
DT <- data.table(X=replicate(n, rnorm(bin, mean=0, sd = 1)),
BIN = rbinom(n,1, 0.5),
CAT = factor(sample(1:4, n, TRUE)))
## If you need you can add more columns
cols <- paste0("CAT", 1:3)
DT[, (cols):= lapply(rep(n, 3) ,rbinom, 1, .5) ]
cols <- paste0("BIN", 1:3)
DT[, (cols):= lapply(rep(n, 3) ,function(x){factor(sample(1:4, n, TRUE)) }) ]
DT
lapply(DT, class)
DT
X BIN CAT CAT1 CAT2 CAT3 BIN1 BIN2 BIN3
1: 1.2934720 1 2 0 0 0 1 1 2
2: -0.1183180 1 2 0 0 1 3 3 1
3: 0.3648810 1 2 1 1 1 3 2 3
4: -0.2149963 1 2 1 1 0 2 3 2
5: 0.3204577 1 1 0 1 1 2 2 4
6: -0.5941640 0 4 1 0 0 2 3 1
7: -1.8852835 1 4 1 0 0 2 1 1
8: -0.8329852 0 2 0 0 1 1 1 2
9: -0.1353628 0 4 0 1 1 1 4 1
10: -0.2943969 1 4 0 1 0 4 3 3
> lapply(DT, class)
$X
[1] "numeric"
$BIN
[1] "integer"
$CAT
[1] "factor"
$CAT1
[1] "integer"
$CAT2
[1] "integer"
$CAT3
[1] "integer"
$BIN1
[1] "factor"
$BIN2
[1] "factor"
$BIN3
[1] "factor"
Because matrix does not accept factor vector, it will be coerced into numbers.
Just change it into a dataframe :
CAT <- matrix(0,n,cat)
CAT <- as.data.frame(CAT)
This will do the trick.