I have a dataframe, df, of two columns, x and y. I am trying to sum values within column y and put the sums into another dataframe. The summing only occurs for a section of column y between NA values. There are multiple sections of column y that must be summed but I want each sum to be a separate value in the new data frame.
df <- data.frame(x = c(1966,0.1,0.2,0.3,0.4,5622,0.9,0.8,0.7,0.6,7889),
y = c(NA,1,2,3,4,NA,9,8,7,6,NA))
The answer should be in the format of a data frame with one column of two rows:
df <- data.frame(x = c(10,30))
I thought of solving this using some for loop and if statements for values between values of NA in column y. Any ideas?
So far, I have the following code, but I ultimately want it to work for a column with a series of more than two summations:
NAs <- which(is.na(df$y))
L1 <- length(NAs)
L0 <- dim(df)[1]
soln1 <- data.frame(matrix(nrow = L1-1, ncol = 1))
for(i in 1:L0){
for(j in 1:L1){
if (j == L1){
break
} else
soln1[j,1] <- sum(df[NAs[j] +1,2]:df[NAs[j+1] -1,2])
}
}
I took a stab at it with some fake data:
df <- data.frame(x = c(1,1,3,1,3,1,1,1,1,1,3,1,1,1,1,1),
y = c(1,2,NA,4,5,NA,7,8,NA,10,11,NA,13,14,NA,16))
# df
# x y
#1 1 1
#2 1 2
#3 3 NA
#4 1 4
#5 3 5
#6 1 NA
#7 1 7
#8 1 8
#9 1 NA
#10 1 10
#11 3 11
#12 1 NA
#13 1 13
#14 1 14
#15 1 NA
#16 1 16
The magic function:
# sum rows in y if section is between NA values & before a value in column x that is > 2
specialSum <- function(x, y){
starting <- which(c(NA,x[-length(x)]) > 2 & is.na(y))
NAs <- which(is.na(y))
L <- length(starting)
ending <- sapply(1:L, function(z) NAs[NAs[-starting] > starting[z]][1])
output <- matrix(NA, nrow = L)
naming <- rep("",L)
for(i in 1:L){
output[i] <- sum(y[starting[i]:ending[i]], na.rm = T)
naming[i] <- paste0(starting[i]+1,":",ending[i]-1)
}
dimnames(output) <- list(naming, "specialSum")
output
}
specialSum(df$x, df$y)
# specialSum
#7:8 15
#13:14 27
EDIT:
df <- data.frame(x = c(1966,0.1,0.2,0.3,0.4,5622,0.9,0.8,0.7,0.6,7889),
y = c(NA,1,2,3,4,NA,9,8,7,6,NA))
specialSum <- function(y){
NAs <- which(is.na(y))
starting <- NAs[-length(NAs)]+1
ending <- NAs[-1]-1
L <- length(starting)
sums <- matrix(NA, nrow = L) ; naming <- rep("",L) # initialize for speed
for(i in 1:L){
sums[i] <- sum(y[starting[i]:ending[i]], na.rm = T)
naming[i] <- paste0(starting[i],":",ending[i])
}
sums <- sums[sums != 0,,drop = F] # in case there are multiple NAs in a row
data.frame(specialSum = sums, row.names = naming)
}
specialSum(df$y)
# specialSum
#2:5 10
#7:10 30
EDIT#2:
NAs <- which(is.na(df$y))
sumlist <- vector("list", length(NAs)-1)
count <- 0
for(i in 1:nrow(df)){
if(i %in% NAs){
count = count + 1
} else {
sumlist[[count]] <- append(sumlist[[count]], df$y[i])
}
}
data.frame(specialSum = unlist(lapply(sumlist, sum))) # less pretty output
# specialSum
#1 10
#2 30
Related
R data frame 1 :
Index
Powervalue
0
1
1
2
2
4
3
8
4
16
5
32
R dataframe 2 :
CombinedValue
20
50
Expected Final Result :
Can we get the output as in the image. If yes please help.
One of stackoverflow mate provided below code. Am looking how to seperate , values as columns with 1 and 0.
df <- data.frame(sum = c(50, 20, 6))
values_list <- list()
for (i in 1:nrow(df)) {
sum <- df$sum[i]
values <- c()
while (sum > 0) {
value <- 2^floor(log2(sum))
values <- c(values, value)
sum <- sum - value
}
values_list[[i]] <- values
}
df$values <- values_list
Can we fix columns till power 31 as shown in attached image. The columns match with possiblecodes then place 1 and 0 else 0 for the remaining columns. Please help.
Here is a function whose output matches the expected output.
toCodes <- function(x) {
n <- floor(log2(x))
pow <- rev(seq.int(max(n)))
# 'y' is the matrix of codes
y <- t(sapply(x, \(.x) (.x %/% 2^pow) %% 2L))
i_cols <- apply(y, 2, \(.y) any(.y != 0L))
colnames(y) <- sprintf("code_%d", 2^pow)
#
possiblecodes <- apply(y, 1, \(p) {
codes <- 2^pow[as.logical(p)]
paste(rev(codes), collapse = ",")
})
data.frame(combinedvalue = x, possiblecodes, y[, i_cols])
}
x <- c(20L, 50L)
toCodes(x)
#> combinedvalue possiblecodes code_32 code_16 code_4 code_2
#> 1 20 4,16 0 1 1 0
#> 2 50 2,16,32 1 1 0 1
Created on 2022-12-19 with reprex v2.0.2
I would like to know how to split a vector by a percentage. I tried to use the stats :: quantile function but it doesn't manage to separate correctly when there are several times the same values.
I would like a method that does the split only by taking into account the length of the vector without taking into account the values.
vector <- c(1,1,1, 4:10)
minProb <- 0.1
maxProb <- 0.9
l <- length(vector)
dt <- data.frame("id" = 1:l, "value" = vector)
dt <- dt %>% arrange(act)
#min <- l*minProb
#max <- l*maxProb
#data1 <- dt$id[min:max]
#data2 <- dt$id[-c(min:max)]
#q <- quantile(dt$act, probs=c(minProb,maxProb))
#w <- which(dt$act >= q[1] & dt$act <= q[2])
expected result (index of elements)
> g2
1 10
> g1
2 3 4 5 6 7 8 9
The following does split the vector, whether that's what the question asks for is not clear.
l <- length(vector)
qq <- quantile(seq_along(vector), probs = c(minProb, maxProb))
f <- logical(l)
f[round(qq[1])] <- TRUE
f[round(qq[2])] <- TRUE
split(vector, cumsum(f))
#$`0`
#[1] 1
#
#$`1`
#[1] 1 1 4 5 6 7 8
#
#$`2`
#[1] 9 10
In order to have the indices, like it is asked in a comment, do
split(seq_along(vector), cumsum(f))
I have a function runBootstrap whose output result is a vector of variable length (depending on # of values for cat, which itself is a product of test). Apologies that this isn't "minimal".
require(dplyr)
test <- function(combo) {
if(combo[1] == 4) {
cat <- 4
} else if((combo[1] == 3 & combo[2] == 2) | (combo[1] == 2 & combo[2] == 2)) {
cat <- 3
} else if((combo[1] == 2 & combo[2] == 1) | (combo[1] == 1 & combo[2] == 2)) {
cat <- 2
} else {
cat <- 1
}
}
arg1.freqs <- c(0.5, 0.2, 0.1, 0.1)
arg2.freqs <- c(0.8, 0.2)
runBootstrap <- function(arg1.freqs, arg2.freqs) {
sim.df <- data.frame(x1 = 1:10000, y1 = NA)
sim.df$x1 <- sample(1:4, 10000, replace = TRUE,
prob = arg1.freqs)
sim.df$y1 <- sample(1:2, 10000, replace = TRUE,
prob = arg2.freqs)
sim.df$cat <- NA
for(i in 1:nrow(sim.df)) {
combo <- c(sim.df[i, 1], sim.df[i, 2])
sim.df$cat[i] <- test(combo)
}
sim.df <- sim.df %>%
select(cat) %>%
group_by(cat) %>%
summarise(n = n()) %>%
mutate(freq = n / sum(n))
sim.df <- as.data.frame(sim.df)
result <- c(sim.df[1, 3], sim.df[2, 3])
}
In this current version there are only two values for cat so result is a vector of length 2; in a future version I will adjust code so that length(result) will equal # values of cat.
When using the function in a for loop, I would like to use the vector values to create new columns in an already existing data.frame df1. The code I've tried thus far is as follows:
df1$result <- NA
for (i in 1:nrow(df1)) {
df1$result[i] <- runBootstrap(arg1.freqs, arg2.freqs)
}
This clearly doesn't work unless the result vector is length = 1. But I don't know the length of the vector until the function runs (although once it runs it will be same length each iteration).
What I would like to achieve is the following:
Example 1: if length(result) == 2
df1.col x1 x2
1 1 1 1
2 2 2 2
3 3 3 3
4 4 4 4
5 5 5 5
6 6 6 6
Example 2: if length(result) == 3
df1.col x1 x2 x3
1 1 1 1 1
2 2 2 2 2
3 3 3 3 3
4 4 4 4 4
5 5 5 5 5
6 6 6 6 6
Thanks for any advice or direction.
edited for clarification
UPDATE - edited with solution
I got it to work as I wanted by creating a blank list, populating, then using rbind as follows:
appendResults <- function(df1, arg1, arg2) {
my.list <- vector("list", nrow(df1))
for (i in 1:nrow(df1)) {
arg1.freqs <- as.numeric(arg1[i, 3:6])
arg2.freqs <- as.numeric(arg2[i, 3:4])
my.list[[i]] <- runBootstrap(arg1.freqs, arg2.freqs)
}
result.df <- do.call(rbind, my.list)
df2 <- do.call(cbind, list(df1, result.df))
}
Check this one, not sure what the result looks like, but this creates empty columns, equal to the length of results, with NAs:
# fake data frame
df1 <- data.frame(x = c(1,2,3), y = c("a", "b", "c"))
# say result has length 3
res <- c(5,6,7)
# make columns with names x1, ..., x + length of res
# and assign NA values to those column
df1[ , paste("x", 1:length(res), sep = "")] <- NA
I am trying to construct a matrix in a for loop. I am declaring an empty matrix called matrix1 in my code and I am trying to fill it row-wise with the simulated x, y and z variables:
simulation <- function(ss){
nsim <- 100
matrix1 <- matrix(0, ncol=3, nrow=nsim)
colnames(matrix1) <- c("x", "y", "z")
for(i in 1:nsim) {
set.seed(i)
x <- relevel(as.factor(sample(1:4,ss, replace=TRUE)), ref="4")
y <- relevel(as.factor(sample(1:3,ss, replace=TRUE)), ref="3")
z <- relevel(as.factor(sample(1:2,ss, replace=TRUE)), ref="2")
matrix1[i, ] <- cbind(x, y, z)
}
return(matrix1)
}
Now, when I run this, I am getting an error:
Error in matrix1[i, ] <- cbind(x, y, z) :
number of items to replace is not a multiple of replacement length
.
I don't see why that is happening since matrix1 has 3 columns and I am filling it recursively with the 3 variables x, y and z.
Too fill it row by row, do like this (this is not efficient):
x = data.frame(name = character(), y = numeric())
for(i in 1:10){
x = rbind(x, data.frame(name = letters[i], y = runif(1)))
}
# name y
1 a 0.09931082
2 b 0.85088120
3 c 0.39535348
4 d 0.08633770
5 e 0.08329996
6 f 0.46080032
7 g 0.39309986
8 h 0.01993358
9 i 0.96079532
10 j 0.19371701
If you have a lot of rows, this is very inefficient. In this case, you could try to pre-allocate the rows:
n = 10
x = data.frame(name = rep(NA_character_, n), y = rep(NA_real_, n))
name y
1 <NA> NA
2 <NA> NA
3 <NA> NA
4 <NA> NA
5 <NA> NA
6 <NA> NA
7 <NA> NA
8 <NA> NA
9 <NA> NA
10 <NA> NA
Then loop over the rows and columns to set the values.
for (i in 1:n){
for (j in 1:2){
# x[i, j] = ...
}
}
Please forgive me if I missed an answer to such a simple question.
I want to use cbind() to bind two columns. One of them is a single entry shorter in length.
Can I have R supply an NA for the missing value?
The documentation discusses a deparse.level argument but this doesn't seem to be my solution.
Further, if I may be so bold, would there also be a quick way to prepend the shorter column with NA's?
Try this:
x <- c(1:5)
y <- c(4:1)
length(y) = length(x)
cbind(x,y)
x y
[1,] 1 4
[2,] 2 3
[3,] 3 2
[4,] 4 1
[5,] 5 NA
or this:
x <- c(4:1)
y <- c(1:5)
length(x) = length(y)
cbind(x,y)
x y
[1,] 4 1
[2,] 3 2
[3,] 2 3
[4,] 1 4
[5,] NA 5
I think this will do something similar to what DWin suggested and work regardless of which vector is shorter:
x <- c(4:1)
y <- c(1:5)
lengths <- max(c(length(x), length(y)))
length(x) <- lengths
length(y) <- lengths
cbind(x,y)
The code above can also be condensed to:
x <- c(4:1)
y <- c(1:5)
length(x) <- length(y) <- max(c(length(x), length(y)))
cbind(x,y)
EDIT
Here is what I came up with to address the question:
"Further, if I may be so bold, would there also be a quick way to prepend the shorter column with NA's?"
inserted into the original post by Matt O'Brien.
x <- c(4:1)
y <- c(1:5)
first <- 1 # 1 means add NA to top of shorter vector
# 0 means add NA to bottom of shorter vector
if(length(x)<length(y)) {
if(first==1) x = c(rep(NA, length(y)-length(x)),x);y=y
if(first==0) x = c(x,rep(NA, length(y)-length(x)));y=y
}
if(length(y)<length(x)) {
if(first==1) y = c(rep(NA, length(x)-length(y)),y);x=x
if(first==0) y = c(y,rep(NA, length(x)-length(y)));x=x
}
cbind(x,y)
# x y
# [1,] NA 1
# [2,] 4 2
# [3,] 3 3
# [4,] 2 4
# [5,] 1 5
Here is a function:
x <- c(4:1)
y <- c(1:5)
first <- 1 # 1 means add NA to top of shorter vector
# 0 means add NA to bottom of shorter vector
my.cbind <- function(x,y,first) {
if(length(x)<length(y)) {
if(first==1) x = c(rep(NA, length(y)-length(x)),x);y=y
if(first==0) x = c(x,rep(NA, length(y)-length(x)));y=y
}
if(length(y)<length(x)) {
if(first==1) y = c(rep(NA, length(x)-length(y)),y);x=x
if(first==0) y = c(y,rep(NA, length(x)-length(y)));x=x
}
return(cbind(x,y))
}
my.cbind(x,y,first)
my.cbind(c(1:5),c(4:1),1)
my.cbind(c(1:5),c(4:1),0)
my.cbind(c(1:4),c(5:1),1)
my.cbind(c(1:4),c(5:1),0)
my.cbind(c(1:5),c(5:1),1)
my.cbind(c(1:5),c(5:1),0)
This version allows you to cbind two vectors of different mode:
x <- c(4:1)
y <- letters[1:5]
first <- 1 # 1 means add NA to top of shorter vector
# 0 means add NA to bottom of shorter vector
my.cbind <- function(x,y,first) {
if(length(x)<length(y)) {
if(first==1) x = c(rep(NA, length(y)-length(x)),x);y=y
if(first==0) x = c(x,rep(NA, length(y)-length(x)));y=y
}
if(length(y)<length(x)) {
if(first==1) y = c(rep(NA, length(x)-length(y)),y);x=x
if(first==0) y = c(y,rep(NA, length(x)-length(y)));x=x
}
x <- as.data.frame(x)
y <- as.data.frame(y)
return(data.frame(x,y))
}
my.cbind(x,y,first)
# x y
# 1 NA a
# 2 4 b
# 3 3 c
# 4 2 d
# 5 1 e
my.cbind(c(1:5),letters[1:4],1)
my.cbind(c(1:5),letters[1:4],0)
my.cbind(c(1:4),letters[1:5],1)
my.cbind(c(1:4),letters[1:5],0)
my.cbind(c(1:5),letters[1:5],1)
my.cbind(c(1:5),letters[1:5],0)
A while back I had put together a function called Cbind that was meant to do this sort of thing. In its current form, it should be able to handle vectors, data.frames, and matrices as the input.
For now, the function is here: https://gist.github.com/mrdwab/6789277
Here is how one would use the function:
x <- 1:5
y <- letters[1:4]
z <- matrix(1:4, ncol = 2, dimnames = list(NULL, c("a", "b")))
Cbind(x, y, z)
# x y z_a z_b
# 1 1 a 1 3
# 2 2 b 2 4
# 3 3 c NA NA
# 4 4 d NA NA
# 5 5 <NA> NA NA
Cbind(x, y, z, first = FALSE)
# x y z_a z_b
# 1 1 <NA> NA NA
# 2 2 a NA NA
# 3 3 b NA NA
# 4 4 c 1 3
# 5 5 d 2 4
The two three functions required are padNA, dotnames, and Cbind, which are defined as follows:
padNA <- function (mydata, rowsneeded, first = TRUE) {
## Pads vectors, data.frames, or matrices with NA
temp1 = colnames(mydata)
rowsneeded = rowsneeded - nrow(mydata)
temp2 = setNames(
data.frame(matrix(rep(NA, length(temp1) * rowsneeded),
ncol = length(temp1))), temp1)
if (isTRUE(first)) rbind(mydata, temp2)
else rbind(temp2, mydata)
}
dotnames <- function(...) {
## Gets the names of the objects passed through ...
vnames <- as.list(substitute(list(...)))[-1L]
vnames <- unlist(lapply(vnames,deparse), FALSE, FALSE)
vnames
}
Cbind <- function(..., first = TRUE) {
## cbinds vectors, data.frames, and matrices together
Names <- dotnames(...)
datalist <- setNames(list(...), Names)
nrows <- max(sapply(datalist, function(x)
ifelse(is.null(dim(x)), length(x), nrow(x))))
datalist <- lapply(seq_along(datalist), function(x) {
z <- datalist[[x]]
if (is.null(dim(z))) {
z <- setNames(data.frame(z), Names[x])
} else {
if (is.null(colnames(z))) {
colnames(z) <- paste(Names[x], sequence(ncol(z)), sep = "_")
} else {
colnames(z) <- paste(Names[x], colnames(z), sep = "_")
}
}
padNA(z, rowsneeded = nrows, first = first)
})
do.call(cbind, datalist)
}
Part of the reason I stopped working on the function was that the gdata package already has a function called cbindX that handles cbinding data.frames and matrices with different numbers of rows. It will not work directly on vectors, so you need to convert them to data.frames first.
library(gdata)
cbindX(data.frame(x), data.frame(y), z)
# x y a b
# 1 1 a 1 3
# 2 2 b 2 4
# 3 3 c NA NA
# 4 4 d NA NA
# 5 5 <NA> NA NA