How to make an aggregation function based on calculation between 2 columns? - r

My dataframe is
df <- data.frame(x = c(4,4,4,2,2,2), y = c(1,2,3,1,2,3), y_share = c(0.2,0.4,0.2,0.5,0.3,0.2))
I want to have an aggregation df with 2 columns of y and z with
z = sum(x*y_share)/sum(y_share).
In this case, the resulted dataframe should be like this:
result = data.frame(y = c(1,2,3), z = c(2.57, 3.14, 3))
I tried this
func = function(x) {y=sum(vector(x[1])*vector(x[3]))/sum(vector(x[3]))
return(y)}
agg = aggregate(df, by=list(df$y), FUN=func)
but it doesn't work.
Thank you

We can use data.table
library(data.table)
setDT(df)[, .(z = sum(x * y_share)/sum(y_share)), by = y]
# y z
#1: 1 2.571429
#2: 2 3.142857
#3: 3 3.000000
Or if we want to use base R, here is an option with by
stack(by(df, list(df$y), FUN = function(z)
with(z, sum(x * y_share)/sum(y_share))))[2:1]
data
df <- data.frame(x=c(4,4,4,2,2,2), y=c(1,2,3,1,2,3),
y_share=c(0.2,0.4,0.2,0.5,0.3,0.2))

Tidyverse approach (using dplyr):
library(dplyr)
result <- df %>%
group_by(y) %>%
summarise(z = sum(x*y_share)/sum(y_share)) %>%
ungroup()
Result
result
# A tibble: 3 x 2
# y z
# <dbl> <dbl>
# 1 1 2.57
# 2 2 3.14
# 3 3 3.
Data
df <- data.frame(x = c(4,4,4,2,2,2),
y = c(1,2,3,1,2,3),
y_share = c(0.2,0.4,0.2,0.5,0.3,0.2))
result <- data.frame(y = c(1,2,3),
z = c(2.57, 3.14, 3))

Related

How to divide only a certain factor in a column in R data frame?

This is the sample dataset:
library(data.table)
df = data.table(x = c(1000,2000,10,2), y = c('A','A','B','B'))
I only want to divide df$y == "A" by 1000. The final dataset should appear as:
df = data.table(x = c(1,2,10,2), y = c('A','A','B','B'))
You need to create a conditional statement.
In base R:
df$x <- ifelse(df$y == "A", df$x/1000, df$x)
In dplyr:
library(dplyr)
df <- df |>
mutate(x = if_else(y == "A", x/1000, x))
data.table option using fifelse like this:
library(data.table)
df = data.table(x = c(1000,2000,10,2), y = c('A','A','B','B'))
df[,x:=fifelse(y == "A", x/1000, x),]
df
#> x y
#> 1: 1 A
#> 2: 2 A
#> 3: 10 B
#> 4: 2 B
Created on 2023-02-18 with reprex v2.0.2
We could use data.table methods as the input is a data.table
library(data.table)
df[y == 'A', x := x/1000]
-output
> df
x y
1: 1 A
2: 2 A
3: 10 B
4: 2 B
Base R: Subsetting with [:
df$x[df$y == "A"] <- df$x[df$y == "A"]/1000
x y
1: 1 A
2: 2 A
3: 10 B
4: 2 B

data frame from a list of vectors

I have 4 vectors (d1,d2,d3,d4) of different lengths from which I create a data frame like this
df <- data.frame(
x = c(
seq_along(d1),
seq_along(d2),
seq_along(d3),
seq_along(d4)
),
y = c(
d1,
d2,
d3,
d4
),
id = c(
rep("d1", times = length(d1)),
rep("d2", times = length(d2)),
rep("d3", times = length(d3)),
rep("d4", times = length(d4))
))
Adding a new vector means adding it in 3 different places, this is what I'd like to avoid.
Ideally I would like to pass d1,d2,d3,d4 into a function that then returns the data frame.
The first steps seems to be to wrap the vectors into a list and name them.
l <- list(d1,d2,d3,d4)
names(l) <- c("d1","d2","d3","d4")
But I am struggling with the 2nd part that probably should be something along the lines of this (pseudo code)
df <- data.frame(
x = flatten(map(l, function(a) seq_along(a))),
y = flatten(l),
id = flatten(map(l, function(a) rep(a.name,times=length(a))))
)
What's the correct way to construct the data frame from the list?
Or is there a better way of doing this?
UPDATE: For demonstrative purposes d1..d4 could be imagined to be
d1 <- pnorm(seq(-2, 2, 0.05))-3
d2 <- pnorm(seq(-3, 3, 0.10))
d3 <- pnorm(seq(-1, 2, 0.05))-4
d4 <- pnorm(seq(-4, 3, 0.15))
You can define a function that takes any number of vectors:
build_df <- function(...)
{
vec_list <- list(...)
df <- data.frame(x = do.call("c", sapply(vec_list, seq_along)),
y = do.call("c", vec_list),
name = do.call("c", sapply(seq_along(vec_list),
function(i) rep(names(vec_list)[i],
length(vec_list[[i]]))))
)
rownames(df) <- seq(nrow(df))
df
}
build_df(d1 = 1:3, d2 = 6:9, bananas = 4:6)
#> x y name
#> 1 1 1 d1
#> 2 2 2 d1
#> 3 3 3 d1
#> 4 1 6 d2
#> 5 2 7 d2
#> 6 3 8 d2
#> 7 4 9 d2
#> 8 1 4 bananas
#> 9 2 5 bananas
#> 10 3 6 bananas
Created on 2020-08-03 by the reprex package (v0.3.0)
Your y can be assembled easily with unlist. I needed a for loop to generate x and id. How about this function?
d1 <- pnorm(seq(-2, 2, 0.05))-3
d2 <- pnorm(seq(-3, 3, 0.10))
d3 <- pnorm(seq(-1, 2, 0.05))-4
d4 <- pnorm(seq(-4, 3, 0.15))
my_list <- list(d1 = d1, d2 = d2, d3 = d3, d4 = d4)
build_df <- function(list) {
names <- names(list)
x <- integer()
id <- character()
for(i in 1:length(list)) {
x <- c(x, seq_along(list[[i]]))
id <- c(id, rep(names[i], length(list[[i]])))
}
y <- unname(unlist(list))
df <- data.frame(x = x, y = y, id = id)
return(df)
}
df <- build_df(my_list)
head(df)
x y id
1 1 -2.977250 d1
2 2 -2.974412 d1
3 3 -2.971283 d1
4 4 -2.967843 d1
5 5 -2.964070 d1
6 6 -2.959941 d1
We could use mget
library(dplyr)
library(tibble)
library(tidyr)
library(data.table)
mget(paste0("d", 1:4)) %>%
enframe(name = 'id', value = 'y') %>%
unnest(c(y)) %>%
mutate(x = rowid(id))

Returning a tibble: how to vectorize with case_when?

I have a function which returns a tibble. It runs OK, but I want to vectorize it.
library(tidyverse)
tibTest <- tibble(argX = 1:4, argY = 7:4)
square_it <- function(xx, yy) {
if(xx >= 4){
tibble(x = NA, y = NA)
} else if(xx == 3){
tibble(x = as.integer(), y = as.integer())
} else if (xx == 2){
tibble(x = xx^2 - 1, y = yy^2 -1)
} else {
tibble(x = xx^2, y = yy^2)
}
}
It runs OK in a mutate when I call it with map2, giving me the result I wanted:
tibTest %>%
mutate(sq = map2(argX, argY, square_it)) %>%
unnest()
## A tibble: 3 x 4
# argX argY x y
# <int> <int> <dbl> <dbl>
# 1 1 7 1 49
# 2 2 6 3 35
# 3 4 4 NA NA
My first attempt to vectorize it failed, and I can see why - I can't return a vector of tibbles.
square_it2 <- function(xx, yy){
case_when(
x >= 4 ~ tibble(x = NA, y = NA),
x == 3 ~ tibble(x = as.integer(), y = as.integer()),
x == 2 ~ tibble(x = xx^2 - 1, y = yy^2 -1),
TRUE ~ tibble(x = xx^2, y = yy^2)
)
}
# square_it2(4, 2) # FAILS
My next attempt runs OK on a simple input. I can return a list of tibbles, and that's what I want for the unnest
square_it3 <- function(xx, yy){
case_when(
xx >= 4 ~ list(tibble(x = NA, y = NA)),
xx == 3 ~ list(tibble(x = as.integer(), y = as.integer())),
xx == 2 ~ list(tibble(x = xx^2 - 1, y = yy^2 -1)),
TRUE ~ list(tibble(x = xx^2, y = yy^2))
)
}
square_it3(4, 2)
# [[1]]
# # A tibble: 1 x 2
# x y
# <lgl> <lgl>
# 1 NA NA
But when I call it in a mutate, it doesn't give me the result I had with square_it. I can sort of see what's
wrong. In the xx == 2 clause, xx acts as an atomic value of 2. But in
building the tibble, xx is a length-4 vector.
tibTest %>%
mutate(sq = square_it3(argX, argY)) %>%
unnest()
# # A tibble: 9 x 4
# argX argY x y
# <int> <int> <dbl> <dbl>
# 1 1 7 1 49
# 2 1 7 4 36
# 3 1 7 9 25
# 4 1 7 16 16
# 5 2 6 0 48
# 6 2 6 3 35
# 7 2 6 8 24
# 8 2 6 15 15
# 9 4 4 NA NA
How do I get the same result as I did with square_it, but from a vectorized function using case_when ?
We define row_case_when which has a similar formula interface as case_when except it has a first argument of .data, acts by row and expects that the value of each leg to be a data frame. It returns a data.frame/tibble. Wrapping in a list, rowwise and unnest are not needed.
case_when2 <- function (.data, ...) {
fs <- dplyr:::compact_null(rlang:::list2(...))
n <- length(fs)
if (n == 0) {
abort("No cases provided")
}
query <- vector("list", n)
value <- vector("list", n)
default_env <- rlang:::caller_env()
quos_pairs <- purrr::map2(fs, seq_along(fs), dplyr:::validate_formula,
rlang:::default_env, rlang:::current_env())
for (i in seq_len(n)) {
pair <- quos_pairs[[i]]
query[[i]] <- rlang::eval_tidy(pair$lhs, data = .data, env = default_env)
value[[i]] <- rlang::eval_tidy(pair$rhs, data = .data, env = default_env)
if (!is.logical(query[[i]])) {
abort_case_when_logical(pair$lhs, i, query[[i]])
}
if (query[[i]]) return(value[[i]])
}
}
row_case_when <- function(.data, ...) {
.data %>%
group_by(.group = 1:n(), !!!.data) %>%
do(case_when2(., ...)) %>%
mutate %>%
ungroup %>%
select(-.group)
}
Test run
It is used like this:
library(dplyr)
tibTest <- tibble(argX = 1:4, argY = 7:4) # test data from question
tibTest %>%
row_case_when(argX >= 4 ~ tibble(x = NA, y = NA),
argX == 3 ~ tibble(x = as.integer(), y = as.integer()),
argX == 2 ~ tibble(x = argX^2 - 1, y = argY^2 -1),
TRUE ~ tibble(x = argX^2, y = argY^2)
)
giving:
# A tibble: 3 x 4
argX argY x y
<int> <int> <dbl> <dbl>
1 1 7 1 49
2 2 6 3 35
3 4 4 NA NA
mutate_cond and mutate_when
These are not quite the same as row_case_when since they don't run through conditions taking the first true one but by using mutually exclusive conditions they can be used for certain aspects of this problem. They do not handle changing the number of rows in the result but we can use dplyr::filter to remove rows for a particular condition.
mutate_cond defined in dplyr mutate/replace several columns on a subset of rows is like mutate except the second argument is a condition and the subsequent arguments are applied only to rows for which that condition is TRUE.
mutate_when defined in
dplyr mutate/replace several columns on a subset of rows is similar to case_when except it applies to rows, the replacement values are provided in a list and alternate arguments are conditions and lists. Also all legs are always run applying the replacement values to the rows satisfying the conditions (as opposed to, for each row, performing the replacement on just the first true leg). To get a similar effect to row_case_when be sure that the conditions are mutually exclusive.
# mutate_cond example
tibTest %>%
filter(argX != 3) %>%
mutate(x = NA_integer_, y = NA_integer_) %>%
mutate_cond(argX == 2, x = argX^2 - 1L, y = argY^2 - 1L) %>%
mutate_cond(argX < 2, x = argX^2, y = argY^2)
# mutate_when example
tibTest %>%
filter(argX != 3) %>%
mutate_when(TRUE, list(x = NA_integer_, y = NA_integer_),
argX == 2, list(x = argX^2 - 1L, y = argY^2 - 1L),
argX < 2, list(x = argX^2, y = argY^2))
You need to ensure you are creating a 1-row tibble with each call of the function, then vectorize that.
This works whether you have rowwise groups or not.
You can do this with switch wrapped in a map2:
Here's a reprex:
library(tidyverse)
tibTest <- tibble(argX = 1:4, argY = 7:4)
square_it <- function(xx, yy) {
map2(xx, yy, function(x, y){
switch(which(c(x >= 4,
x == 3,
x == 2,
x < 4 & x != 3 & x != 2)),
tibble(x = NA, y = NA),
tibble(x = as.integer(), y = as.integer()),
tibble(x = x^2 - 1, y = y^2 -1),
tibble(x = x^2, y = y^2))})
}
tibTest %>% mutate(sq = square_it(argX, argY)) %>% unnest(cols = sq)
#> # A tibble: 3 x 4
#> argX argY x y
#> <int> <int> <dbl> <dbl>
#> 1 1 7 1 49
#> 2 2 6 3 35
#> 3 4 4 NA NA
Created on 2020-05-16 by the reprex package (v0.3.0)

Create a new tibble using the previous row value of a column as a parameter of the current row value

I want to manually create a tibble where one column values are calculated depending on the previous value of the same column.
For example:
tibble(
x = 1:5,
y = x + lag(y, default = 0)
)
I expect the following result:
# A tibble: 5 x 2
x y
<int> <dbl>
1 1 1
2 2 3
3 3 6
4 4 10
5 5 15
But I obtain the error:
Error in lag(y, default = 0) : object 'y' not found
Update - more real example:
tibble(
years = 1:5,
salary = 20000 * (1.01) ^ lag(years, default = 0),
qta = salary * 0.06
) %>%
mutate(
total = ifelse(row_number() == 1,
(qta + 50000) * (1.02),
(qta + lag(total, default = 0)) * (1.02))
)
In this example I have a tibble, and I want to add a column 'total' that is defined depending on its previous value, but the lag(total, default = 0) doesn't work.
We can use accumulate
library(tidyverse)
tibble(x = 1:5, y = accumulate(x, `+`))
# A tibble: 5 x 2
# x y
# <int> <int>
#1 1 1
#2 2 3
#3 3 6
#4 4 10
#5 5 15
For a general function, it would be
tibble(x = 1:5, y = accumulate(x, ~ .x + .y))
We can also specify the initialization value
tibble(x = 1:5, y = accumulate(x[-1], ~ .x + .y, .init = x[1]))
You're missing x instead of y in the lag() function to run without an error:
tibble(
x = 1:5,
y = x + lag(x, default = 0)
)
But as per #Ronak Shah's comment, you need the cumsum() function to get the same result as your example:
tibble(
x = 1:5,
y = cumsum(x)
)

Concatenate rows and columns

I have a data set like this
x y z
a 5 4
b 1 2
And i want concat columns and rows :
ay 5
az 4
by 1
bz 2
Thanks
You can use melt, and paste but you will need to make your rownames a variable, i..e
df$new <- rownames(df)
m_df <- reshape2::melt(df)
rownames(m_df) <- paste0(m_df$new, m_df$variable)
m_df <- m_df[-c(1:2)]
m_df
# value
#ax 5
#bx 1
#ay 4
#by 2
#az 3
#bz 1
After your edit, you don't need to convert rownames to a variable so just,
m1_df <- reshape2::melt(df)
m1_df$new <- paste0(m1_df$x, m1_df$variable)
m1_df
# x variable value new
#1 a y 5 ay
#2 b y 1 by
#3 a z 4 az
#4 b z 2 bz
You can then tidy your data frame to required output
with dplyr-tidyr
library(dplyr)
library(tidyr)
df %>%
gather(var, val, -x) %>%
mutate(var=paste0(x, var)) %>%
select(var, val)%>%
arrange(var)
# var val
#1 ay 5
#2 az 4
#3 by 1
#4 bz 2
library(reshape2)
library(dplyr)
library(tibble)
library(stringr)
# Create dataframe
x <- data.frame(x = c(5, 1),
y = c(4, 2),
z = c(3, 1),
row.names = c('a', 'b'))
# Convert rowname to column and melt
x <- tibble::rownames_to_column(x, "rownames") %>%
melt('rownames')
# assign concat columns as rownames
row.names(x) <- str_c(x$rownames, x$variable)
# Select relevant columns only
x <- select(x, value)
# Remove names from dataframe
names(x) <- NULL
> x
ax 5
bx 1
ay 4
by 2
az 3
bz 1
Here is another option in base R
stack(setNames(as.list(unlist(df1[-1])), outer(df1$x, names(df1)[-1], paste0)))[2:1]

Resources