Using cumsum by columns - r

I have a matrix, for example:
A= [ 1 2 3 4
3 5 6 6
4 1 2 3 ]
I want to get the cumulative sum of the columns in the form of another matrix (or data frame). For example, this matrix would give:
B= [1 2 3 4
4 7 9 10
8 8 11 13]

If A is a matrix, use apply:
# Build the 3 x 4 example matrix; values are supplied column by column.
A <- matrix(c(1, 3, 4, 2, 5, 1, 3, 6, 2, 4, 6, 3), nrow = 3L, ncol = 4L)
# Running total down each column (MARGIN = 2 means "per column").
B <- apply(A, MARGIN = 2, FUN = cumsum)
B
# [,1] [,2] [,3] [,4]
# [1,] 1 2 3 4
# [2,] 4 7 9 10
# [3,] 8 8 11 13
If A is a data.frame, use lapply:
# Convert once, then overwrite every column in place with its running total;
# assigning into existing columns keeps the data.frame shape and names.
B <- as.data.frame(A)
B[seq_along(B)] <- Map(cumsum, B)
B
# V1 V2 V3 V4
# 1 1 2 3 4
# 2 4 7 9 10
# 3 8 8 11 13

Using dplyr for data frames:
library(dplyr)
m <- matrix(c(1, 3, 4, 2, 5, 1, 3, 6, 2, 4, 6, 3), nrow = 3, ncol = 4)
# mutate_each() and funs() are deprecated in dplyr; across() is the supported
# way to apply one function (here cumsum) to every column.
as.data.frame(m) %>%
  mutate(across(everything(), cumsum))
With sqldf:
library(sqldf)
df <- as.data.frame(m)
# Self-join: for every row a, sum all rows b located at or above it.
# Group by a.rowid (not a.V1): grouping on a data column would merge rows
# that happen to share the same V1 value and return too few rows.
sqldf('SELECT SUM(b.V1) V1, SUM(b.V2) V2, SUM(b.V3) V3, SUM(b.V4) V4
FROM df AS a, df AS b WHERE b.rowid <= a.rowid
GROUP BY a.rowid ORDER BY a.rowid')
Output:
V1 V2 V3 V4
1 1 2 3 4
2 4 7 9 10
3 8 8 11 13

Related

How to repeat a data list with two vectors in R

I have a list data X with two vectors
X[1]=(1,2,3,5,6,9,7,8)
X[2]=(2,3,4,5,6)
I want to get a new list data Y
Y[1]=(1,2,3,5,6,9,7,8,1,2,3,5,6,9,7,8)-repeat x[1]
Y[2]=(2,3,4,5,6,2,3,4,5,6)-repeat x[2]
I used Y<-rep(X,2) but get
Y[1]:(1,2,3,5,6,9,7,8)
Y[2]:(2,3,4,5,6)
Y[3]:(1,2,3,5,6,9,7,8)
Y[4]:(2,3,4,5,6)
How to do it right? Many thanks.
Use sapply/lapply :
# lapply always returns a list; sapply would collapse the result into a
# matrix whenever all elements of X happen to have the same length.
lapply(X, rep, times = 2)
#[[1]]
# [1] 1 2 3 5 6 9 7 8 1 2 3 5 6 9 7 8
#[[2]]
# [1] 2 3 4 5 6 2 3 4 5 6
data
X <- list(c(1, 2, 3, 5, 6, 9, 7, 8), c(2, 3, 4, 5, 6))
You are having problems accessing the list elements - use [[1]] etc.
# Two numeric vectors of different lengths stored in a list.
X <- list(c(1, 2, 3, 5, 6, 9, 7, 8), c(2, 3, 4, 5, 6))
# [[i]] extracts the vector itself (single [ would return a one-element list);
# each vector is then joined to a copy of itself.
Y <- list(c(X[[1]], X[[1]]),
          c(X[[2]], X[[2]]))
# R > Y
# [[1]]
# [1] 1 2 3 5 6 9 7 8 1 2 3 5 6 9 7 8
#
# [[2]]
# [1] 2 3 4 5 6 2 3 4 5 6
Using map from purrr
library(purrr)
# Repeat every element of X twice; map() returns a list of the same length as X.
map(X, function(v) rep(v, 2))
data
X <- list(c(1, 2, 3, 5, 6, 9, 7, 8), c(2, 3, 4, 5, 6))

Function for testing if a row in a dataframe doesn't contain a given element

I have a dataframe like
a <- c(2, 3, 4)
b <- c(5, 4, 3)
c <- c(2, 7, 9)
df <- data.frame(a, b, c)
df
# a b c
# 1 2 5 2
# 2 3 4 7
# 3 4 3 9
and I want to get back the row without number 2, in my example it is just second row.
Using rowSums or colSums:
# data
a <- c(2, 3, 4)
b <- c(5, 4, 3)
c <- c(2, 7, 9)
df <- data.frame(a, b, c)
df
# a b c
# 1 2 5 2
# 2 3 4 7
# 3 4 3 9
# get rows with no 2: count the 2s in each row and keep the rows whose count is 0
# (na.rm = TRUE makes NA cells count as "not a 2" instead of turning the count into NA)
df[ rowSums(df == 2, na.rm = TRUE) == 0, ]
# a b c
# 2 3 4 7
# 3 4 3 9
# get columns with no 2: count the 2s in each column and keep columns whose count is 0
# (drop = FALSE keeps the result a data frame even when only one column survives)
df[ , colSums(df == 2, na.rm = TRUE) == 0, drop = FALSE ]
# b
# 1 5
# 2 4
# 3 3
We can also use Reduce with == to get the rows
# Per column, flag the cells equal to 2; OR the flags together across columns
# to mark rows containing a 2; keep the rows that are not marked.
df[!Reduce(function(acc, hit) acc | hit, lapply(df, function(col) col == 2)), ]
# a b c
#2 3 4 7
#3 4 3 9
and any, applied to each column, to select the columns
# vapply guarantees a logical vector (one flag per column), even for a data
# frame with zero columns, where sapply would return an empty list and `!` fails.
df[!vapply(df, function(x) any(x == 2), logical(1))]
# b
#1 5
#2 4
#3 3
Here is my solution using some set functions. First, where are the positions of the twos?
# apply() over rows (MARGIN = 1) returns each row's result as a COLUMN, so
# column i of is_two describes row i of df (transposed relative to df).
is_two <- apply(df, 1, is.element, 2)
[,1] [,2] [,3]
[1,] TRUE FALSE FALSE
[2,] FALSE FALSE FALSE
[3,] TRUE FALSE FALSE
Now, which rows are all FALSE?
# is_two holds one column per row of df (apply() over rows returns its results
# as columns), so test each column (MARGIN = 2) to find the df rows without a 2.
no_twos <- apply(!is_two, 2, all)
df[no_twos,]
a b c
2 3 4 7
3 4 3 9

Create all pairs within groups and maintaining variables

I have a dataframe with around 30k observations, divided in 300 groups. For example
id, group, x, y
1, 1, 2, 3
2, 1, 4, 3
3, 1, 2, 4
4, 2, 5, 4
5, 2, 5, 3
6, 2, 6, 4
I want to make it so
pair, group, x_i, x_j, y_i, y_j
12, 1, 2, 4, 3, 3
13, 1, 2, 2, 3, 4
23, 1, 4, 2, 3, 4
45, 2, 5, 5, 4, 3
and so on. I've found a few topics, but they don't seem to apply exactly to my problem.
The combn function can be used to generate each corresponding pair of x and y values. We operate by group using lapply. lapply returns a list so we use rbind to put each list element (the results for each group) back together in a single data frame.
# For every group, build one output row per pair of observations.
new.dat <- lapply(unique(dat$group), function(g) {
  rows <- dat[dat$group == g, , drop = FALSE]
  # A group needs at least two rows to form a pair. This guard also avoids
  # combn()'s special case where a single number n is expanded to seq_len(n).
  if (nrow(rows) < 2) {
    return(NULL)
  }
  # One row per pair; the columns hold the within-group positions of the
  # first and second member, so id, x and y are all looked up consistently.
  pos <- t(combn(seq_len(nrow(rows)), 2))
  i <- pos[, 1]
  j <- pos[, 2]
  data.frame(pairs = paste0(rows$id[i], rows$id[j]),
             group = g,
             x = cbind(rows$x[i], rows$x[j]),   # becomes columns x.1, x.2
             y = cbind(rows$y[i], rows$y[j]))   # becomes columns y.1, y.2
})
# Stack the per-group results; rbind() ignores the NULLs of too-small groups.
do.call(rbind, new.dat)
pairs group x.1 x.2 y.1 y.2
1 12 1 2 4 3 3
2 13 1 2 2 3 4
3 23 1 4 2 3 4
4 45 2 5 5 4 3
5 46 2 5 6 4 4
6 56 2 5 6 3 4
You could also use split, which saves some typing, but is about 10% slower on my machine:
# Same idea using split(): each list element is the sub-data-frame of one group.
lapply(split(dat, dat$group), function(df) {
  # Fewer than two rows cannot form a pair (and combn() would misread a
  # single number n as seq_len(n)).
  if (nrow(df) < 2) {
    return(NULL)
  }
  data.frame(pairs = apply(t(combn(df$id, 2)), 1, paste, collapse = ""),
             # take the group label from the data itself; there is no `g`
             # variable inside this function
             group = df$group[1],
             x = t(combn(df$x, 2)),
             y = t(combn(df$y, 2)))
})
I won't say this is an optimal result, but it should work:
# Example data: six observations in two groups.
df <- read.table(text="id, group, x, y
1,1,2,3
2,1,4,3
3,1,2,4
4,2,5,4
5,2,5,3
6,2,6,4", header = TRUE, sep = ",")
# Step 1: all id pairs within each group, one row per pair (pairi, pairj).
df.new <- do.call(rbind, lapply(tapply(df$id, df$group, combn, m = 2),
  FUN = function(x) data.frame(pairi = x[1, ], pairj = x[2, ])))
# Step 2: for every pair, look up group, x and y of both ids in the original
# data and assemble one output row.
df.new <- do.call(rbind, apply(df.new, 1, FUN = function(x) {
  data.frame(pair = paste0(x[1], x[2]),
             group = df[df$id == x[1], 'group'],
             x_i = df[df$id == x[1], 'x'],
             x_j = df[df$id == x[2], 'x'],
             y_i = df[df$id == x[1], 'y'],
             y_j = df[df$id == x[2], 'y'])
}))
df.new
pair group x_i x_j y_i y_j
1.1 12 1 2 4 3 3
1.2 13 1 2 2 3 4
1.3 23 1 4 2 3 4
2.1 45 2 5 5 4 3
2.2 46 2 5 6 4 4
2.3 56 2 5 6 3 4

New variable with values depending on combination of other variables

I'm very inexperienced in R, and although this site has been tremendously helpful, I have a very specific situation and cannot find a solution. I imagine I need to write a function to accomplish this. However, my current time frame does not allow me to spend the time doing trial/error. (I apologize in advance for anything unclear).
Here is an example of my current data:
UniqueID, Time1.Feel1, Time2.Feel1.1, Time2.Feel1.2, Time2Num
1, 9, 5, 6, 1
1, 9, 7, 5, 2
2, 4, 3, 4, 1
2, 4, 5, 6, 2
3, 7, 4, 7, 1
3, 7, 6, 5, 2
I want to create a new variable: Time2.Feel1, which consists of the values of either Time2.Feel1.1 OR Time2.Feel1.2, depending on the value of Time2Num.
So, this:
UniqueID, Time1.Feel1, Time2.Feel1.1, Time2.Feel1.2, Time2Num, Time2.Feel1
1, 9, 5, 6, 1, 5
1, 9, 7, 5, 2, 5
2, 4, 3, 4, 1, 3
2, 4, 5, 6, 2, 6
3, 7, 4, 7, 1, 4
3, 7, 6, 5, 2, 5
I need to do this 30 times (i.e., Time2Num has values 1:30 and there are 30 different Time2.Feel1 variables: Time2.Feel1.1:30)
I then want to calculate a correlation between Time1.Feel1 and Time2.Feel1 for EACH UniqueID, creating a new data frame with the variables UniqueID and the new correlations. This part is less of a concern; I think I've figured out how to do that, but if the combined steps could be done more simply, I'd prefer that.
Thanks in advance!
To expound on @thelatemail's comment, you could do this
dat <- read.csv(text="UniqueID, Time1.Feel1, Time2.Feel1.1, Time2.Feel1.2, Time2Num
1, 9, 5, 6, 1
1, 9, 7, 5, 2
2, 4, 3, 4, 1
2, 4, 5, 6, 2
3, 7, 4, 7, 1
3, 7, 6, 5, 2")
# Index the two candidate columns with a two-column (row, column) matrix:
# row i picks column Time2Num[i], i.e. Time2.Feel1.1 or Time2.Feel1.2.
# seq_len() (rather than seq()) stays empty for a data frame with zero rows.
dat$Time2.Feel1 <- dat[c("Time2.Feel1.1", "Time2.Feel1.2")][
  cbind(seq_len(nrow(dat)), dat$Time2Num)
]
# UniqueID Time1.Feel1 Time2.Feel1.1 Time2.Feel1.2 Time2Num Time2.Feel1
# 1 1 9 5 6 1 5
# 2 1 9 7 5 2 5
# 3 2 4 3 4 1 3
# 4 2 4 5 6 2 6
# 5 3 7 4 7 1 4
# 6 3 7 6 5 2 5
Doing that 30 times isn't very efficient, so you could use a loop:
## creating some example data which I think matches your format
## (15 blocks of three columns each: two candidate value columns followed by
## one selector column holding 1 or 2)
nr <- nrow(dat)
## fixed seed so the random example data is reproducible
set.seed(1)
## block ii gets columns Time2.Feel1.<2*ii - 1>, Time2.Feel1.<2*ii> and Time<2*ii>Num
dat1 <- lapply(1:15, function(ii)
matrix(c(sample(1:9, nr * 2, replace = TRUE),
sample(1:2, nr, replace = TRUE)), nrow = nr,
dimnames = list(NULL, c(paste0('Time2.Feel1.', 1 + 2 * (ii - 1)),
paste0('Time2.Feel1.', 2 + 2 * (ii - 1)),
sprintf('Time%sNum', 2 + 2 * (ii - 1))))))
## bind the 15 three-column matrices side by side into a single data frame
dat1 <- data.frame(do.call('cbind', dat1))
# Time2.Feel1.1 Time2.Feel1.2 Time2Num Time2.Feel1.3 Time2.Feel1.4 Time4Num
# 1 3 9 2 4 3 1
# 2 4 6 1 7 4 2
# 3 6 6 2 9 1 1
# 4 9 1 1 2 4 1
# 5 2 2 2 6 8 2
# 6 9 2 2 2 4 2
# Time2.Feel1.5 Time2.Feel1.6 Time6Num Time2.Feel1.7 Time2.Feel1.8 Time8Num
# 1 8 8 2 1 9 1
# 2 1 5 2 1 3 2
# 3 7 5 1 3 5 1
# 4 4 8 2 5 3 2
# 5 8 1 1 6 6 1
# 6 6 5 1 4 3 2
# Time2.Feel1.9 Time2.Feel1.10 Time10Num Time2.Feel1.11 Time2.Feel1.12 Time12Num
# 1 4 7 2 3 5 1
# 2 4 9 1 1 4 2
# 3 5 4 2 6 8 2
# 4 9 7 1 8 6 1
# 5 8 4 1 8 6 1
# 6 4 3 1 8 4 1
etc, etc
So you can start here. First you make the input vectors:
I call xx the odd-numbered columns Time2.Feel1.1, Time2.Feel1.3, Time2.Feel1.5, etc,
yy the even-numbered columns Time2.Feel1.2, Time2.Feel1.4, Time2.Feel1.6, etc; xx and yy are your two "choices",
and zz the "decision" columns Time2Num, Time4Num, Time6Num, etc
Then use mapply to do the indexing above but in a 1-1 mapping using those three input vectors with mapply. Note that zz, yy, and xx are all the same length
n <- 30
# Column-name vectors, one entry per block of three columns:
# xx/yy are the two candidate columns, zz the selector, nn the output name.
xx <- sprintf('Time2.Feel1.%s', seq(1, n - 1, by = 2))
yy <- sprintf('Time2.Feel1.%s', seq(2, n, by = 2))
zz <- paste0('Time', seq(2, n, by = 2), 'Num')
nn <- paste0('Time', seq(2, n, by = 2), '.Feel1')
# For each (first, second, pick) triple, take row i's value from whichever of
# the two candidate columns the selector column names (1 or 2).
res <- mapply(function(first, second, pick) {
  choices <- dat1[, c(first, second)]
  choices[cbind(1:nr, dat1[, pick])]
}, xx, yy, zz, SIMPLIFY = FALSE)
# Bind the per-block result vectors into a matrix and label its columns.
res <- do.call('cbind', res)
colnames(res) <- nn
# Time2.Feel1 Time4.Feel1 Time6.Feel1 Time8.Feel1 Time10.Feel1 Time12.Feel1
# [1,] 9 4 8 1 7 3
# [2,] 4 4 5 3 4 4
# [3,] 6 9 7 3 4 8
# [4,] 9 2 8 3 9 8
# [5,] 2 8 8 6 8 8
# [6,] 2 4 6 3 4 8
And then you can combine the results back. You would need to reorder them if that is important to you
## combine results into original data
cbind(dat1, res)
When searching for the error I received when trying the answer from @user12202013, I came across this solution using ifelse, found here: Conditional assignment of one variable to the value of one of two other variables
# Pick Time2.Feel1.1 or Time2.Feel1.2 according to Time2Num.
# The fallback is NA rather than "": a character fallback coerces the result
# to character as soon as it is reached, which breaks numeric work like cor().
Time2.Feel1 <- ifelse(Time2Num == 1, Time2.Feel1.1,
                      ifelse(Time2Num == 2, Time2.Feel1.2, NA))
Although it is definitely not the most efficient solution, particularly because I need to nest it 30 times and I need to do it for 9 items, it solved my problem. A simpler answer is still welcome, though!
Thanks for your answers!
You want to do something like:
# Start with all-NA, then fill in from the column that Time2Num selects.
Time2.Feel1 <- rep(NA, length(Time2Num))
# The right-hand side must be subset with the same positions as the left,
# otherwise the lengths differ and values are misaligned. which() drops NA
# positions, avoiding "NAs are not allowed in subscripted assignments".
Time2.Feel1[which(Time2Num == 1)] <- Time2.Feel1.1[which(Time2Num == 1)]
Time2.Feel1[which(Time2Num == 2)] <- Time2.Feel1.2[which(Time2Num == 2)]
This says to create a vector called Time2.Feel1 which we initialize with NA values. Then where Time2Num is one we fill in the values from Time2.Feel1.1 and where Time2Num is two we fill in the values from Time2.Feel1.2. If there is any place where Time2Num is neither 1 nor 2, then Time2.Feel1 will have an NA value there.
Edit:
Not sure what the error message is referring to since I am able to do this
# Reproducible demo: a logical subscript that contains NA is accepted in an
# assignment when the replacement value has length one.
set.seed(1)
A <- letters
B <- sample(x = c(0, 1, NA), size = 26, replace = TRUE)
A[B == 1] <- "5"  # subscript contains NAs
A[B == 0] <- NA   # assigning missing values
A
[1] NA "5" "5" "d" NA "f" "g" "5" "5" NA NA NA "m" "5" "o" "5" "q" "r" "5" "t" "u" NA "5" NA NA "5"
I would need to see more complete code to know what is causing the error.

How to calculate the difference between different data frames with common column names

I have three data frames and trying to calculate the difference between two data frames (Df2 and Df3) conditioned by data frame 1. As explained in following example I have three data frames, Df1, Df2 and Df3 with common names. In first step, in Df1, I want to compare the values of “standard” column with all three columns, “Das”,”Dss” and ”Tri” probably row wise and where ever any value of these columns, “Das”, “Dss” and “Tri” is higher than the “Standard” in Df1, calculate the difference of same position in Df2 and Df3 and put the difference in a separate column.
Df1
Names Standard Das Dss Tri
Aa 3 3 6 2
Ab 4 6 4 3
Ac 2 5 2 4
Ad 4 3 3 8
Ae 6 4 5 7
Af 4 5 7 5
Ag 2 6 8 2
Ah 9 7 6 2
Df2
Names Das Dss Tri
Aa 4 2 5
Ab 7 5 4
Ac 5 7 2
Ad 6 4 3
Ae 5 3 5
Af 3 2 6
Ag 2 5 4
Ah 4 6 3
Df3
Names Das Dss Tri
Aa 5 3 5
Ab 8 5 4
Ac 6 7 2
Ad 6 4 7
Ae 5 3 8
Af 4 5 6
Ag 1 5 4
Ah 4 6 3
Final Output
Df3
Names Das Dss Tri Difference
Aa 5 3 5 -1
Ab 8 5 4 -1
Ac 6 7 2 -1
Ad 6 4 7 -4
Ae 5 3 8 -3
Af 4 5 6 -4
Ag 1 5 4 1
Ah 4 6 3 0
Here's a script that, for each row, finds every column of Df1 whose value exceeds the standard and sums the differences (Df2 minus Df3) over those columns; if no column exceeds the standard, NA is returned.
df1 <- structure(list(standard = c(3, 4, 2, 4, 6, 4, 2, 9), das = c(3,
6, 5, 3, 4, 5, 6, 7), dss = c(6, 4, 2, 3, 5, 7, 8, 6), tri = c(2,
3, 4, 8, 7, 5, 2, 2)), .Names = c("standard", "das", "dss", "tri"
), row.names = c(NA, -8L), class = "data.frame")
df2 <- structure(list(das = c(4, 7, 5, 6, 5, 3, 2, 4), dss = c(2,
5, 7, 4, 3, 2, 5, 6), tri = c(5,4,2,3,5,6,4,3)), .Names = c("das", "dss", "tri"
), row.names = c(NA, -8L), class = "data.frame")
df3 <- structure(list(das = c(5, 8, 6, 6, 5, 4, 1, 4), dss = c(3,
5, 7, 4, 3, 5, 5, 6), tri = c(5,4,2,7,8,6,4,3)), .Names = c("das", "dss", "tri"
), row.names = c(NA, -8L), class = "data.frame")
# For every row of df1, collect the positions (1 = das, 2 = dss, 3 = tri) of
# ALL columns whose value exceeds that row's standard.
# lapply (not sapply) guarantees a list with one entry per row; sapply would
# simplify to a vector or matrix when every row yields the same number of
# hits, which would break the idx.v[[ix]] lookup below.
idx.v <- lapply(seq_len(nrow(df1)), function(idx) {
  which(df1[idx, 2:4] > df1[idx, 1])
})
# Per row: sum of (df2 - df3) over the selected columns; NA when no column
# exceeded the standard. vapply pins the result to one number per row.
df3$result <- vapply(seq_along(idx.v), function(ix) {
  col.idx <- idx.v[[ix]]
  if (length(col.idx) > 0) {
    sum(df2[ix, col.idx] - df3[ix, col.idx])
  } else {
    NA_real_
  }
}, numeric(1))
Output:
> df3
das dss tri result
1 5 3 5 -1
2 8 5 4 -1
3 6 7 2 -1
4 6 4 7 -4
5 5 3 8 -3
6 4 5 6 -4
7 1 5 4 1
8 4 6 3 NA
Thanks for the chat. This is what you require.
I think this is the correct result, but note that the sixth value differs. Using the max value of the three columns (an easier task) produces a result that differs in even more slots.
# NOTE(review): this snippet appears to assume the layout shown in the question,
# where column 1 is Names and column 2 is Standard (so df1[-(1:2)] are the value
# columns and df2[-1] / df3[-1] drop Names) -- confirm before running it on data
# frames that have no Names column.
# For each row: position of the FIRST value column exceeding column 2, or Inf
# when none does (Inf is included in min() so the result is never empty).
df1.w <- sapply( seq(1, nrow(df1)),
function(idx) min(c(Inf, which(df1[-(1:2)][idx,] > df1[idx, 2])))
)
# Two-column (row, column) index matrix used for matrix-style indexing below.
df1.mat <- matrix(c(seq(1, nrow(df1)), df1.w), ncol=2)
# Replace Inf by a valid placeholder column so the indexing does not fail;
# those rows are overwritten with 0 by the ifelse() below.
df1.mat[is.infinite(df1.mat)] <- 1
# Difference df2 - df3 at the selected cell, or 0 where no column qualified.
ifelse(is.infinite(df1.w), 0,
df2[-1][df1.mat] - df3[-1][df1.mat]
)
## [1] -1 -1 -1 -4 -3 -1 1 0
If you actually do want to use the index of the max value in df1[-(1:2)], replace the definition of df1.w (the sapply call) with this:
df1.w <- apply(df1[-(1:2)], 1, which.max)
Using the rest of the code above then gives this result:
## [1] -1 -1 -1 -4 -3 -3 0 0

Resources