Nested sum by groups in R

I am trying to implement this logic:
i j Eij
1 1 -1.5
1 2 -1.5
1 3 0.5
1 4 0.5
2 1 -0.5
2 2 -0.5
2 3 1.5
2 4 1.5
Each value in column Eij should, within its group i, be multiplied by the sum of the values that come after it.
For example, take the first value for i=1, which is -1.5 (i=1, j=1). It should be multiplied by the sum of -1.5, 0.5, 0.5, i.e. all the values that follow it within i=1.
Similarly, the next value in i=1, -1.5 (i=1, j=2), should be multiplied by the sum of 0.5, 0.5, the values that follow it.
And so on, giving the total:
ro = -1.5*(-1.5 + 0.5 + 0.5) +
-1.5*(0.5 + 0.5) +
0.5*(0.5) +
-0.5*(-0.5 + 1.5 + 1.5) +
-0.5*(1.5 + 1.5) +
1.5*(1.5)
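
For reproducibility, here is the table above as an R data frame (the answers below assume it is named df):
df <- data.frame(
  i   = rep(1:2, each = 4),
  j   = rep(1:4, times = 2),
  Eij = c(-1.5, -1.5, 0.5, 0.5, -0.5, -0.5, 1.5, 1.5)
)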

Here's a solution with dplyr. Within each group, arranging j in descending order makes cumsum(Eij) a running total from the end, and lag() shifts it by one, so each row's multiplier is exactly the sum of the values after it in the original order:
library(dplyr)
df %>%
  group_by(i) %>%
  arrange(desc(j), .by_group = TRUE) %>%
  mutate(
    multiplier = lag(cumsum(Eij), default = 0),
    result = Eij * multiplier
  ) %>%
  arrange(j, .by_group = TRUE) %>%
  ungroup()
# # A tibble: 8 × 5
# i j Eij multiplier result
# <int> <int> <dbl> <dbl> <dbl>
# 1 1 1 -1.5 -0.5 0.75
# 2 1 2 -1.5 1 -1.5
# 3 1 3 0.5 0.5 0.25
# 4 1 4 0.5 0 0
# 5 2 1 -0.5 2.5 -1.25
# 6 2 2 -0.5 3 -1.5
# 7 2 3 1.5 1.5 2.25
# 8 2 4 1.5 0 0

A base R alternative: rev(cumsum(rev(v))) gives the suffix sums including each element itself, so dropping the first entry and padding a 0 at the end yields, for each position, the sum of the values after it.
f <- function(v) v %*% c(rev(cumsum(rev(v)))[-1], 0)
sum(aggregate(df$Eij, list(df$i), FUN = f)$x)
#[1] -1
Or, to get one value per row:
f <- function(v) v * c(rev(cumsum(rev(v)))[-1], 0)
ave(df$Eij, df$i, FUN = f)
#[1] 0.75 -1.50 0.25 0.00 -1.25 -1.50 2.25 0.00

Related

How to find the first column with a certain value for each row with dplyr

I have a dataset like this:
df <- data.frame(id=c(1:4), time_1=c(1, 0.9, 0.2, 0), time_2=c(0.1, 0.4, 0, 0.9), time_3=c(0,0.5,0.3,1.0))
id time_1 time_2 time_3
1 1.0 0.1 0
2 0.9 0.4 0.5
3 0.2 0 0.3
4 0 0.9 1.0
And I want to identify for each row, the first column containing a 0, and extract the corresponding number (as the last element of colname), obtaining this:
id time_1 time_2 time_3 count
1 1.0 0.1 0 3
2 0.9 0.4 0.5 NA
3 0.2 0 0.3 2
4 0 0.9 1.0 1
Do you have a tidyverse solution?
We may use max.col:
v1 <- max.col(df[-1] == 0, "first")
v1[rowSums(df[-1] == 0) == 0] <- NA
df$count <- v1
-output
> df
id time_1 time_2 time_3 count
1 1 1.0 0.1 0.0 3
2 2 0.9 0.4 0.5 NA
3 3 0.2 0.0 0.3 2
4 4 0.0 0.9 1.0 1
Or using dplyr: use if_any to check whether any of the 'time' columns is 0 in each row; if so, return the index of the first 0 with max.col inside case_when (pick() requires dplyr >= 1.1.0; replace it with across() on older versions).
library(dplyr)
df %>%
  mutate(count = case_when(if_any(starts_with("time"), ~ .x == 0) ~
    max.col(pick(starts_with("time")) == 0, "first")))
-output
id time_1 time_2 time_3 count
1 1 1.0 0.1 0.0 3
2 2 0.9 0.4 0.5 NA
3 3 0.2 0.0 0.3 2
4 4 0.0 0.9 1.0 1
You can do this. which() returns the positions of all zeros in the row, [1] keeps the first, and it is NA when there are none:
df <- df %>%
  rowwise() %>%
  mutate(count = which(c_across(starts_with("time")) == 0)[1])
df
id time_1 time_2 time_3 count
<int> <dbl> <dbl> <dbl> <dbl>
1 1 1 0.1 0 3
2 2 0.9 0.4 0.5 NA
3 3 0.2 0 0.3 2
4 4 0 0.9 1 1
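For what it's worth, the same first-match logic works in base R via match over the logical matrix (a sketch, equivalent to the max.col approach above):
df$count <- apply(df[-1] == 0, 1, function(r) match(TRUE, r))
df$count
#[1]  3 NA  2  1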

Create proportion matrix for multivariate categorical data

Suppose I've got this data simulated from the below R code:
library(RNGforGPD)
set.seed(1)
sample.size = 10; no.gpois = 3
lambda.vec = c(-0.2, 0.2, -0.3); theta.vec = c(1, 3, 4)
M = c(0.352, 0.265, 0.342); N = diag(3); N[lower.tri(N)] = M
TV = N + t(N); diag(TV) = 1
cstar = CmatStarGpois(TV, theta.vec, lambda.vec, verbose = TRUE)
data = GenMVGpois(sample.size, no.gpois, cstar, theta.vec, lambda.vec, details = FALSE)
> prop.table(table(data[,1]))
0 1 2
0.3 0.4 0.3
> prop.table(table(data[,2]))
2 3 6 8 10
0.2 0.4 0.1 0.2 0.1
> prop.table(table(data[,3]))
2 3 4 5 6
0.2 0.3 0.1 0.3 0.1
> table(data)
data
0 1 2 3 4 5 6 8 10
3 4 7 7 1 3 2 2 1
I'd like to create a proportion matrix covering all three categorical variables. If a category does not occur in a given column, its proportion should be shown as 0.
Cat    X1    X2    X3
0      0.3   0.0   0.0
1      0.4   0.0   0.0
2      0.3   0.2   0.2
3      0.0   0.4   0.3
4      0.0   0.0   0.1
5      0.0   0.0   0.3
6      0.0   0.1   0.1
8      0.0   0.2   0.0
10     0.0   0.1   0.0
This is the data-object:
dput(data)
structure(c(1, 0, 2, 1, 0, 0, 1, 2, 2, 1, 3, 8, 3, 3, 2, 2, 6,
3, 10, 8, 2, 5, 2, 6, 3, 3, 4, 3, 5, 5), .Dim = c(10L, 3L), .Dimnames = list(
NULL, NULL))
I've tried to put the logic at the appropriate points in the code sequence.
props <- data.frame(Cat = sort(unique(c(data))))  # just the Cat column
# Now fill in the entries
# the entries will be obtained with the table function
apply(data, 2, table) # run `table(.)` over the columns individually
[[1]]
0 1 2 # these are actually character valued names
3 4 3 # while these are the count values
[[2]]
2 3 6 8 10
2 4 1 2 1
[[3]]
2 3 4 5 6
2 3 1 3 1
Now iterate over that list to fill in values that match the Cat column:
props2 <- cbind(props,  # using a dfrm as the first argument returns a dataframe object
  lapply(apply(data, 2, table),  # irregular results are a list
    function(col) {
      # first make a named vector of zeros
      x <- setNames(rep(0, length(props$Cat)), props$Cat)
      # could have skipped that step by using `tabulate`
      # then fill with values using names as indices
      x[names(col)] <- col  # values to matching names
      x
    }))
props2
#-------------
Cat V1 V2 V3
0 0 3 0 0
1 1 4 0 0
2 2 3 2 2
3 3 0 4 3
4 4 0 0 1
5 5 0 0 3
6 6 0 1 1
8 8 0 2 0
10 10 0 1 0
#---
# now just "proportionalize" those counts
props2[2:4] <- prop.table(data.matrix(props2[2:4]), margin=2)
props2
#-------------
Cat V1 V2 V3
0 0 0.3 0.0 0.0
1 1 0.4 0.0 0.0
2 2 0.3 0.2 0.2
3 3 0.0 0.4 0.3
4 4 0.0 0.0 0.1
5 5 0.0 0.0 0.3
6 6 0.0 0.1 0.1
8 8 0.0 0.2 0.0
10 10 0.0 0.1 0.0
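As the `tabulate` comment above hints, the zero-filling step can also be avoided by counting each column against the full category set up front, here with factor levels (a sketch, assuming data and cats as above):
cats <- sort(unique(c(data)))
sapply(seq_len(ncol(data)), function(j)
  prop.table(table(factor(data[, j], levels = cats))))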
A tidyverse alternative:
library(dplyr)
library(tidyr)
colnames(data) <- c("X1", "X2", "X3")
as_tibble(data) %>%
  pivot_longer(cols = "X1":"X3", values_to = "Cat") %>%
  group_by(name, Cat) %>%
  count() %>%
  ungroup(Cat) %>%
  summarize(name, Cat, proportion = n / sum(n)) %>%
  pivot_wider(names_from = name, values_from = proportion) %>%
  arrange(Cat) %>%
  replace(is.na(.), 0)
# A tibble: 9 × 4
Cat X1 X2 X3
<dbl> <dbl> <dbl> <dbl>
1 0 0.3 0 0
2 1 0.4 0 0
3 2 0.3 0.2 0.2
4 3 0 0.4 0.3
5 4 0 0 0.1
6 5 0 0 0.3
7 6 0 0.1 0.1
8 8 0 0.2 0
9 10 0 0.1 0
If you would like it as a matrix, you can use as.matrix()
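For example (a sketch, assuming the tibble above is stored as res):
m <- as.matrix(res[-1])
rownames(m) <- res$Cat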

pivot_wider() producing NA for previously full data

I wonder why, when I turn my data.frame into wide format, the two columns Y1 and Y2 contain NA. The dataset originally had no NA in Y1 or Y2. Is there a fix?
library(tidyverse)
dat <- read.csv("https://raw.githubusercontent.com/rnorouzian/v/main/mvmm.csv")
pivot_wider(dat, names_from = DV, values_from = Response)
# School Student Treat Gender Pretest MeanPretest TXG Index1 D1 D2 TreatCAT Gendercat Y1 Y2
# <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <int> <int> <int> <dbl> <dbl> <dbl>
# 1 1 1 -0.5 -0.5 48.3 45.6 0.25 1 1 0 0 -0.5 29.4 NA
# 2 1 1 -0.5 -0.5 48.3 45.6 0.25 2 0 1 0 -0.5 NA 47.4
# 3 1 2 -0.5 0.5 52.1 45.6 -0.25 1 1 0 0 0.5 52.2 NA
I think you have duplicated key columns. The columns DV, D2, D1, and Index1 contain either the same pattern or its exact mirror, so they should be reshaped together; otherwise rows get duplicated when the data is translated to wide form. You can check this from the dimensions of your original table: it has 1600 rows, and if widened properly it should have fewer. With the code below it drops to 800, whereas with the OP's code it stayed at 1600.
library(tidyverse)
dat %>%
  pivot_wider(names_from = c(DV, D2, D1, Index1), values_from = Response)
Output:
School Student Treat Gender Pretest MeanPretest TXG TreatCAT
1 1 1 -0.5 -0.5 48.34437943 45.62666702 0.25 0
2 1 2 -0.5 0.5 52.14841080 45.62666702 -0.25 0
3 1 3 -0.5 -0.5 40.56079483 45.62666702 0.25 0
4 1 4 -0.5 0.5 63.11892700 45.62666702 -0.25 0
5 1 5 -0.5 -0.5 66.79794312 45.62666702 0.25 0
6 1 6 -0.5 0.5 19.42481995 45.62666702 -0.25 0
Gendercat Y1_0_1_1 Y2_1_0_2
1 -0.5 29.36377525 47.35104752
2 0.5 52.20915985 49.77211761
3 -0.5 42.21330261 36.21236038
4 0.5 46.69318008 63.72433472
5 -0.5 48.70760345 48.04736328
6 0.5 23.40506554 11.07947922
Try this:
dat %>%
  select(-c(Index1, D1, D2)) %>%
  pivot_wider(names_from = DV, values_from = Response)
This is happening because Index1, D1, and D2 all encode the same thing and correspond exactly to the column you want to pivot by. If you get rid of them, it works fine.
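You can verify that redundancy directly (a quick sketch; if this returns one row per DV level, the three columns add no information beyond DV):
library(dplyr)
dat %>% distinct(DV, Index1, D1, D2)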

Min-max normalization in R, setting groups of min and max based on another column

Using R, I am trying to min-max normalize a column, but instead of using the min and max of all the column's values, I need to take the min and max within groups determined by another column.
Please see this example:
x <- c(0, 0.5, 1, 2.5, 0.2, 0.3, 0.5, 0,0,0.1, 0.7)
y <- c(1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3)
df <- data.frame(x, y)
df
For y=1, min(x) = 0, and max(x) = 2.5.
For y=2, min(x) = 0.2, and max(x) = 0.5, and so forth.
Based on this grouped min and max, normalization is performed.
I found a similar question for Python, but it didn't help me much:
Normalize a column of dataframe using min max normalization based on groupby of another column
library(tidyverse)
df %>%
  group_by(y) %>%
  mutate(xnorm = (x - min(x)) / (max(x) - min(x))) %>%
  ungroup()
Output:
# A tibble: 11 x 3
x y xnorm
<dbl> <dbl> <dbl>
1 0 1 0
2 0.5 1 0.2
3 1 1 0.4
4 2.5 1 1
5 0.2 2 0
6 0.3 2 0.333
7 0.5 2 1
8 0 3 0
9 0 3 0
10 0.1 3 0.143
11 0.7 3 1
Or, in the mutate() statement, you could put xnorm = scales::rescale(x)
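Spelled out, that looks like this (a sketch; rescale() by default maps range(x) onto [0, 1], which is exactly min-max normalization):
df %>%
  group_by(y) %>%
  mutate(xnorm = scales::rescale(x)) %>%
  ungroup()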
You can use the function aggregate:
aggregate(x, list(y), min)
Group.1 x
1 1 0.0
2 2 0.2
3 3 0.0
aggregate(x, list(y), max)
Group.1 x
1 1 2.5
2 2 0.5
3 3 0.7
# You can create your own function like this
myFun <- function(u) {
  c(min(u), mean(u), max(u))
}
# and pass myFun to aggregate
aggregate(x, list(y), myFun)
Group.1 x.1 x.2 x.3
1 1 0.0000000 1.0000000 2.5000000
2 2 0.2000000 0.3333333 0.5000000
3 3 0.0000000 0.2000000 0.7000000
# an alternative is `by`, with a different output format
by(x, list(y), myFun)
I am not sure if you need something like the below, using ave in base R:
dfout <- within(df, xnorm <- ave(x, y, FUN = function(v) (v - min(v)) / diff(range(v))))
such that
> dfout
x y xnorm
1 0.0 1 0.0000000
2 0.5 1 0.2000000
3 1.0 1 0.4000000
4 2.5 1 1.0000000
5 0.2 2 0.0000000
6 0.3 2 0.3333333
7 0.5 2 1.0000000
8 0.0 3 0.0000000
9 0.0 3 0.0000000
10 0.1 3 0.1428571
11 0.7 3 1.0000000

for-loop inside mutate and append result

I have a simple for-loop that works as I would like on vectors. I would like to use it on a column of a dataframe, grouped by another column of the dataframe, e.g.:
# here is my for-loop working as expected on a simple vector:
vect <- c(0.5, 0.7, 0.1)
res <- vector(mode = "numeric", length = 3)
for (i in 1:length(vect)) {
  res[i] <- sum(exp(-2 * (vect[i] - vect[-i])))
}
res
[1] 1.9411537 0.9715143 5.5456579
And here is pseudo-code trying to do it on a column of a dataframe:
# Example data
my.df <- data.frame(let = rep(LETTERS[1:3], each = 3),
                    num1 = 1:3, vect = c(0.5, 0.7, 0.1), num3 = NA)
my.df
let num1 vect num3
1 A 1 0.5 NA
2 A 2 0.7 NA
3 A 3 0.1 NA
4 B 1 0.5 NA
5 B 2 0.7 NA
6 B 3 0.1 NA
7 C 1 0.5 NA
8 C 2 0.7 NA
9 C 3 0.1 NA
# My attempt:
require(tidyverse)
my.df <- my.df %>%
  group_by(let) %>%
  mutate(for (i in 1:length(vect)) {
    num3[i] <- sum(exp(-2 * (vect[i] - vect[-i])))
  })
What the result should look like (but my pseudo-code above doesn't work):
let num1 vect num3
1 A 1 0.5 1.9411537
2 A 2 0.7 0.9715143
3 A 3 0.1 5.5456579
4 B 1 0.5 1.9411537
5 B 2 0.7 0.9715143
6 B 3 0.1 5.5456579
7 C 1 0.5 1.9411537
8 C 2 0.7 0.9715143
9 C 3 0.1 5.5456579
I feel like I am not using tidyverse logic by trying to put a for-loop inside mutate; any suggestions much appreciated.
The simple solution is to create a custom function and pass that to mutate. A working solution:
custom_func <- function(vec) {
  res <- vector(mode = "numeric", length = length(vec))
  for (i in seq_along(vec)) {
    res[i] <- sum(exp(-2 * (vec[i] - vec[-i])))
  }
  res
}
library(tidyverse)
my.df %>%
  group_by(let) %>%
  mutate(num3 = custom_func(vect))
#> # A tibble: 9 x 4
#> # Groups: let [3]
#> let num1 vect num3
#> <fct> <int> <dbl> <dbl>
#> 1 A 1 0.5 1.94
#> 2 A 2 0.7 0.972
#> 3 A 3 0.1 5.55
#> 4 B 1 0.5 1.94
#> 5 B 2 0.7 0.972
#> 6 B 3 0.1 5.55
#> 7 C 1 0.5 1.94
#> 8 C 2 0.7 0.972
#> 9 C 3 0.1 5.55
I'm wondering whether a more elegant version of the custom function is possible - perhaps someone smarter than me can tell you whether purrr::map, for example, could provide an alternative.
We can use map_dbl from purrr and apply the formula at each index.
library(dplyr)
library(purrr)
my.df %>%
  group_by(let) %>%
  mutate(num3 = map_dbl(seq_along(vect), ~ sum(exp(-2 * (vect[.] - vect[-.])))))
# let num1 vect num3
# <fct> <int> <dbl> <dbl>
#1 A 1 0.5 1.94
#2 A 2 0.7 0.972
#3 A 3 0.1 5.55
#4 B 1 0.5 1.94
#5 B 2 0.7 0.972
#6 B 3 0.1 5.55
#7 C 1 0.5 1.94
#8 C 2 0.7 0.972
#9 C 3 0.1 5.55
You can turn your for-loop into a sapply call and then use it inside mutate.
sapply takes a function and applies it to each element. In this case I'm looping over the number of elements in each group (n()).
my.df %>%
  group_by(let) %>%
  mutate(num3 = sapply(1:n(), function(i) sum(exp(-2 * (vect[i] - vect[-i])))))
# A tibble: 9 x 4
# Groups: let [3]
# let num1 vect num3
# <fct> <int> <dbl> <dbl>
# 1 A 1 0.5 1.94
# 2 A 2 0.7 0.972
# 3 A 3 0.1 5.55
# 4 B 1 0.5 1.94
# 5 B 2 0.7 0.972
# 6 B 3 0.1 5.55
# 7 C 1 0.5 1.94
# 8 C 2 0.7 0.972
# 9 C 3 0.1 5.55
This is essentially equivalent to the rather wrong-looking for-loop inside a mutate call below. In this case, however, I'd prefer the custom function provided by A. Stam.
my.df %>%
  group_by(let) %>%
  mutate(num3 = {
    res <- numeric(length = n())
    for (i in 1:n()) {
      res[i] <- sum(exp(-2 * (vect[i] - vect[-i])))
    }
    res
  })
You can also replace sapply with purrr's map_dbl.
Or using data.table
library(data.table)
setDT(my.df)[, num3 := unlist(lapply(seq_len(.N),
  function(i) sum(exp(-2 * (vect[i] - vect[-i]))))), let]
my.df
# let num1 vect num3
#1: A 1 0.5 1.9411537
#2: A 2 0.7 0.9715143
#3: A 3 0.1 5.5456579
#4: B 1 0.5 1.9411537
#5: B 2 0.7 0.9715143
#6: B 3 0.1 5.5456579
#7: C 1 0.5 1.9411537
#8: C 2 0.7 0.9715143
#9: C 3 0.1 5.5456579
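For completeness, the per-group kernel itself can be vectorized with outer, removing the loop entirely (a sketch; f_vec is a name introduced here). outer(v, v, "-")[i, j] is v[i] - v[j], so summing each row of exp(-2 * differences) and subtracting exp(0) = 1 drops the j == i term:
f_vec <- function(v) rowSums(exp(-2 * outer(v, v, "-"))) - 1
f_vec(c(0.5, 0.7, 0.1))
#[1] 1.9411537 0.9715143 5.5456579
This drops straight into any of the grouped approaches above, e.g. mutate(num3 = f_vec(vect)).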
