I have a data frame. Each row is a separate person. I need to create a data frame that only shows the latest "date" and "salary" per row. Below is an example of the data frame I'm starting with:
example_df <- tribble(
~person_id, ~date1, ~date2, ~date3, ~salaary1, ~salary2, ~salary3,
1, 2010, 2013, 2015, 100, 200, 300,
2, 1998, NA, NA, 50, NA, NA,
3, 2000, 2001, NA, 100, 200, NA,
4, 1987, 1989, 2005, 50, 300, 500
)
This is what I need the data frame to look like after processing:
example_clean_df <- tribble(
~person_id, ~date, ~salaary,
1, 2015,300,
2, 1998, 50,
3, 2001, 200,
4, 2005, 500
)
Any ideas would be super helpful. Thank you!
Does this work:
library(dplyr)
example_df %>%
rowwise() %>%
mutate(date = max(date1, date2, date3, na.rm = 1),
salary = max(salaary1, salary2, salary3, na.rm = 1)) %>%
select(person_id, date, salary)
# A tibble: 4 × 3
# Rowwise:
person_id date salary
<dbl> <dbl> <dbl>
1 1 2015 300
2 2 1998 50
3 3 2001 200
4 4 2005 500
Use pivot_longer and slice_max:
library(dplyr)
library(tidyr)
example_df %>%
pivot_longer(-person_id, names_pattern = "(date|salary)(\\d)", names_to = c(".value", "number")) %>%
group_by(person_id) %>%
slice_max(salary) %>%
select(-number)
output
# A tibble: 4 × 3
# Groups: person_id [4]
person_id date salary
<dbl> <dbl> <dbl>
1 1 2015 300
2 2 1998 50
3 3 2001 200
4 4 2005 500
The primitive base r / for loop version:
result.df <- list()
for(i in seq(nrow(example_df))){
result.df[[i]] <- cbind(example_df[1][i,],
max(example_df[,2:4][i,], na.rm = T),
max(example_df[,5:7][i,], na.rm = T))
}
result.df <- setNames(do.call(rbind, result.df), c('person_id', 'date', 'salary'))
person_id date salary
1 1 2015 300
2 2 1998 50
3 3 2001 200
4 4 2005 500
PS:
microbenchmark suggests using #Karthik S method, as it is the fastest.
# test1: loop base R
# test2: rowwise mutate
# test3: pivot_longer
min lq mean median uq max neval
9.9957 10.41815 11.858530 10.86845 11.97645 21.8334 100
7.6594 7.96195 9.457389 8.29315 9.49365 25.9524 100
12.0949 12.49685 14.080567 12.83050 13.85300 26.6272 100
PPS:
using lapply speeds up the process:
result.df <- lapply(seq(nrow(example_df)), \(x) cbind(example_df[1][x,],
max(example_df[,2:4][x,], na.rm = T),
max(example_df[,5:7][x,], na.rm = T))) %>%
do.call(rbind, .) %>%
setNames(c('person_id', 'date', 'salary'))
min lq mean median uq max neval
3.6828 3.89075 5.754244 4.41195 7.14130 14.0325 100
Related
I have the following dataset:
df_x <- data.frame(year = c(2000, 2000, 2000, 2001, 2001, 2001, 2002, 2002, 2002),
a = c(7, 3, 5),
b = c(5, 8, 1),
c = c(8, 4, 3))
and this vector:
v <- c("a", "b", "c")
Now I want to create a new dataset and summarise a, b, and c by creating new variables (y_a, y_b, and y_c) that calculate the mean of each variable grouped by year.
The code for doing this is the following:
y <- df_x %>% group_by(year) %>% dplyr::summarise(y_a = mean(a, na.rm = TRUE),
y_b = mean(b, na.rm = TRUE),
y_c = mean(c, na.rm = TRUE))
However, I want to use the vector v to read the respective variable from it and paste in into the summarise function:
y <- df_x %>% group_by(year) %>% dplyr::summarise(as.name(paste0("y_", v[1])) = mean(as.name(v[1]), na.rm = TRUE),
as.name(paste0("y_", v[2])) = mean(as.name(v[1]), na.rm = TRUE),
as.name(paste0("y_", v[3])) = mean(as.name(v[1]), na.rm = TRUE))
Doing so, I receive the following error message:
Error: unexpected '=' in "y <- df_x %>% group_by(year) %>% dplyr::summarise(as.name(paste0("y_", v[1])) ="
How can I paste the value of a vector in this summarise function so that it works?
To define a new variable on the left hand side, you need := instead of =. Because you create it with paste0, you need !! to inject the expression and make sure that is correctly evaluated. To access existing columns in dplyr with a string stored in a variable, using .data is the easiest way.
library(dplyr)
df_x <- data.frame(year = c(2000, 2000, 2000, 2001, 2001, 2001, 2002, 2002, 2002),
a = c(7, 3, 5),
b = c(5, 8, 1),
c = c(8, 4, 3))
v <- c("a", "b", "c")
df_x %>% group_by(year) %>%
dplyr::summarise(!!paste0("y_", v[1]) := mean(.data[[v[1]]], na.rm = TRUE),
!!paste0("y_", v[2]) := mean(.data[[v[1]]], na.rm = TRUE),
!!paste0("y_", v[3]) := mean(.data[[v[1]]], na.rm = TRUE))
#> # A tibble: 3 × 4
#> year y_a y_b y_c
#> <dbl> <dbl> <dbl> <dbl>
#> 1 2000 5 5 5
#> 2 2001 5 5 5
#> 3 2002 5 5 5
Created on 2022-12-21 by the reprex package (v1.0.0)
Here is a one-liner via base R,
aggregate(. ~ year, cbind.data.frame(year = df_x$year, df_x[v]), FUN = \(i)mean(i, na.rm = TRUE))
year a b c
1 2000 5 4.666667 5
2 2001 5 4.666667 5
3 2002 5 4.666667 5
It would be easier with across and modifying the names with .names
library(dplyr)
df_x %>%
group_by(year) %>%
summarise(across(all_of(v), ~ mean(.x, na.rm = TRUE), .names = "y_{.col}"))
-output
# A tibble: 3 × 4
year y_a y_b y_c
<dbl> <dbl> <dbl> <dbl>
1 2000 5 4.67 5
2 2001 5 4.67 5
3 2002 5 4.67 5
INPUT:-
year max1 max2 max3
2001 10 101 87
2002 103 19 88
2003 21 23 89
2004 27 28 91
OUTPUT:-
YEAR MAX
2001 101
2002 103
2003 89
2004 91
A dplyr solution:
Data:
df <- fread(" year max1 max2 max3
2001 10 101 87
2002 103 19 88
2003 21 23 89
2004 27 28 91 ")
Code:
library(dplyr)
df %>%
rowwise() %>%
mutate(MAX = max(max1, max2, max3)) %>%
select(year, MAX)
Output:
# A tibble: 4 x 2
# Rowwise:
year MAX
<int> <int>
1 2001 101
2 2002 103
3 2003 89
4 2004 91
A vectorized dplyr solution with pmax(), which facilitates tidyselection while avoiding rowwise() inefficiency:
Solution
Once you've set everything up
library(dplyr)
# ...
# Code to generate 'your_data'.
# ...
here is your solution for the columns {max1, max2, max3}
your_data %>% transmute(YEAR = year, MAX = pmax(max1, max2, max3))
and more generally for all columns of the form max*:
your_data %>% transmute(YEAR = year, MAX = do.call(pmax, unname(across(
# Tidy selection of 'max*' columns:
starts_with("max")
))))
At your discretion, you can replace starts_with() with another selection helper like matches("^max\\d+$").
Results
Given your_data reproduced here
your_data <- structure(
list(
year = c(2001, 2002, 2003, 2004),
max1 = c(10, 103, 21, 27),
max2 = c(101, 19, 23, 28),
max3 = c(87, 88, 89, 91)
),
row.names = c(NA, -4L),
class = "data.frame"
)
this tidy workflow should yield the following data.frame:
YEAR MAX
1 2001 101
2 2002 103
3 2003 89
4 2004 91
This should do it:
OUTPUT = data.frame(
YEAR = INPUT$year,
MAX = apply(INPUT[-1], 1, max)
)
Here's a slightly different option using dplyr::c_across() which gives you handy access to tidyselect semantics
library(tidyverse)
d <- structure(list(year = 2001:2004, max1 = c(10L, 103L, 21L, 27L), max2 = c(101L, 19L, 23L, 28L), max3 = c(87L, 88L, 89L, 91L)), class = "data.frame", row.names = c(NA, -4L))
d %>%
rowwise() %>%
mutate(max = max(c_across(starts_with("max"))), .keep = "unused") %>%
ungroup()
#> # A tibble: 4 x 2
#> year max
#> <int> <int>
#> 1 2001 101
#> 2 2002 103
#> 3 2003 89
#> 4 2004 91
Benchmarking
For what it's worth there are some performance differences which would probably only matter if your dataset is very large but worth noting. The solution from #Gregor Thomas is by far the fastest.
library(microbenchmark)
microbenchmark(
# Dan Adams
c_across = d %>%
rowwise() %>%
mutate(max = max(c_across(starts_with("max"))), .keep = "unused") %>%
ungroup(),
# MonJeanJean
max_all = d %>%
rowwise() %>%
mutate(MAX = max(max1, max2, max3)) %>%
select(year, MAX),
# Greg
do.call = d %>%
transmute(YEAR = year, MAX = do.call(pmax, unname(across(starts_with(
"max"
))))),
# Gregor Thomas
apply = data.frame(
year = d$year,
max = apply(d[-1], 1, max))
)
#> Unit: microseconds
#> expr min lq mean median uq max neval cld
#> c_across 6928.9 8123.20 9548.682 9187.00 10709.95 17595.6 100 c
#> max_all 7890.5 9016.35 10473.327 10176.90 11366.65 16389.5 100 d
#> do.call 3392.3 3976.20 4609.419 4473.55 4981.30 9282.5 100 b
#> apply 349.1 470.20 567.896 535.05 670.70 1017.7 100 a
Created on 2022-02-08 by the reprex package (v2.0.1)
I have the following two data frames:
df1 <- data.frame(Category = c("A", "A", "A", "B", "B", "B", "C", "C", "C"),
Date = c(2001, 2002, 2003, 2001, 2002, 2003, 2001, 2002, 2003),
Beta1 = c(1, 3, 4, 4, 5, 3, 5, 3, 1),
Beta2 = c(2, 4, 6, 1, 1, 2, 5, 4, 2))
df2 <- data.frame(Date = c(2001, 2002, 2003),
Column1 = c(10, 20, 30),
Column2 = c(40, 50, 60))
Say I assign category A to Column1 and and category C to Column2. I want to multiply the row value from Column1 with the row betas from category A, if the dates match. Similarly, I want to multiply the row value from Column2 with the row betas from category C, if the dates match.
The match between a category and a column is of my own choosing. Assigning this myself won’t be a problem I think because I have relatively few columns.
Preferably, I want the output to look like this:
results <- data.frame(Date = c(2001, 2002, 2003),
Column1_categoryA_beta1 = c(10, 60, 120),
Column1_categoryA_beta2 = c(20, 80, 180),
Column2_categoryC_beta1 = c(200, 150, 60),
Column2_categoryC_beta2 = c(200, 200, 120))
Any help in how I best can approach this problem is very much appreciated!
With some data wrangling using tidyr and dplyr this can be achieved like so:
df1 <- data.frame(Category = c("A", "A", "A", "B", "B", "B", "C", "C", "C"),
Date = c(2001, 2002, 2003, 2001, 2002, 2003, 2001, 2002, 2003),
Beta1 = c(1, 3, 4, 4, 5, 3, 5, 3, 1),
Beta2 = c(2, 4, 6, 1, 1, 2, 5, 4, 2))
df2 <- data.frame(Date = c(2001, 2002, 2003),
Column1 = c(10, 20, 30),
Column2 = c(40, 50, 60))
library(dplyr)
library(tidyr)
df2_long <- df2 %>%
pivot_longer(-Date, names_to = "Column", values_to = "Value") %>%
mutate(Category = ifelse(Column == "Column1", "A", "C"))
df2_long %>%
left_join(df1) %>%
mutate(Beta1 = Value * Beta1,
Beta2 = Value * Beta2) %>%
select(Date, Category, Column, Beta1, Beta2) %>%
pivot_wider(id_cols = Date, names_from = c("Column", "Category"), values_from = c("Beta1", "Beta2"))
#> Joining, by = c("Date", "Category")
#> Warning: Column `Category` joining character vector and factor, coercing into
#> character vector
#> # A tibble: 3 x 5
#> Date Beta1_Column1_A Beta1_Column2_C Beta2_Column1_A Beta2_Column2_C
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 2001 10 200 20 200
#> 2 2002 60 150 80 200
#> 3 2003 120 60 180 120
Created on 2020-04-14 by the reprex package (v0.3.0)
One way to get there while keeping the Category variable in the final data frame is the following:
df3 <- left_join(df1, df2, by="Date")
df4 <- df3 %>%
group_by(Date, Category) %>%
mutate(Col1Bet1 = Column1 * Beta1, Col1Bet2 = Column1 * Beta2, Col2Bet1 = Column2 * Beta1, Col2Bet2 = Column2 * Beta2)
which gives the following:
# A tibble: 9 x 10
# Groups: Date, Category [9]
Category Date Beta1 Beta2 Column1 Column2 Col1Bet1 Col1Bet2 Col2Bet1 Col2Bet2
<fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 2001 1 2 10 40 10 20 40 80
2 A 2002 3 4 20 50 60 80 150 200
3 A 2003 4 6 30 60 120 180 240 360
4 B 2001 4 1 10 40 40 10 160 40
5 B 2002 5 1 20 50 100 20 250 50
6 B 2003 3 2 30 60 90 60 180 120
7 C 2001 5 5 10 40 50 50 200 200
8 C 2002 3 4 20 50 60 80 150 200
9 C 2003 1 2 30 60 30 60 60 120
This could be a start. The result data.table has all information you want just in another format.
df3 <- merge(df1, df2)
df3$b1 <- ifelse(df3$Category=="A", df3$Beta1*df3$Column1, ifelse(df3$Category=="C", df3$Beta1*df3$Column2, NA))
df3$b2 <- ifelse(df3$Category=="A", df3$Beta2*df3$Column1, ifelse(df3$Category=="C", df3$Beta2*df3$Column2, NA))
# Date Category Beta1 Beta2 Column1 Column2 b1 b2
# 1 2001 A 1 2 10 40 10 20
# 2 2001 C 5 5 10 40 200 200
# 3 2001 B 4 1 10 40 NA NA
# 4 2002 A 3 4 20 50 60 80
# 5 2002 B 5 1 20 50 NA NA
# 6 2002 C 3 4 20 50 150 200
# 7 2003 B 3 2 30 60 NA NA
# 8 2003 A 4 6 30 60 120 180
# 9 2003 C 1 2 30 60 60 120
I have a dataframe that looks like this:
group <- c('a', 'b', 'a', 'b')
year <- c(1990, 1990, 2000, 2000)
freq <- c(100, 120, 130, 170)
df <- data.frame(group, year, freq)
For each distinct year, I'd like to find the freq value for the row with group a divided by the freq value for the row with group b and add these proportion values to the dataframe. The resulting dataframe should look like this:
group <- c('a', 'b', 'c', 'a', 'b', 'c')
year <- c(1990, 1990, 1990, 2000, 2000, 2000)
freq <- c(100, 120, 100/120, 130, 170, 130/170)
df <- data.frame(group, year, freq)
I tried to get this going with the ugliest of loops below but have taken the train off of the rails. If anyone can help show me how to accomplish this elementary task in R, I'd be grateful!
for (year in unique(df$year)) {
a = df[ which(df$group == 'a' & df$year == year), ]
b = df[ which(df$group == 'b' & df$year == year), ]
proportion = a$freq / b$freq
row = c('c', year, proportion)
rbind(df, row)
}
Here is a tidyverse option
library(tidyverse)
df %>%
spread(group, freq) %>%
mutate(c = a / b) %>%
gather(group, freq, -year) %>%
arrange(year, group)
# year group freq
#1 1990 a 100.0000000
#2 1990 b 120.0000000
#3 1990 c 0.8333333
#4 2000 a 130.0000000
#5 2000 b 170.0000000
#6 2000 c 0.7647059
Explanation: We spread data from long to wide, add a column c = a / b and gather data from wide to long before re-ordering rows to reproduce your expected output.
Split original by year with function split (result is a list).
foo <- split(df, df$year)
For each entry in list foo bind original entry x with new data.frame which has calculated freq
bar <- lapply(foo, function(x)
rbind(x, data.frame(group = "c",
year = x$year[1],
freq = x$freq[1] / x$freq[2])))
# Bind back final result as it's a list (lapply result)
do.call(rbind, bar)
Here is an option using data.table. Convert the 'data.frame' to 'data.table' (setDT(df)), grouped by 'year', concatenate 'group' with 'c' as well as 'freq' with the ratio of 'freq' elements correspondingly
library(data.table)
setDT(df)[, .(group = c(group, 'c'), freq = c(freq, freq[1]/freq[2])), .(year)]
# year group freq
#1: 1990 a 100.0000000
#2: 1990 b 120.0000000
#3: 1990 c 0.8333333
#4: 2000 a 130.0000000
#5: 2000 b 170.0000000
#6: 2000 c 0.7647059
Or rbind the summarised dataset with the original
rbind(setDT(df), df[, .(freq = Reduce(`/`, freq), group = 'c'), .(year)])
Or using tidyverse
library(tidyverse)
df %>%
group_by(year) %>%
summarise(group = list(c(group, 'c')),
freq = list(c(freq, freq[1]/freq[2]))) %>%
unnest
# A tibble: 6 x 3
# year group freq
# <dbl> <chr> <dbl>
#1 1990 a 100
#2 1990 b 120
#3 1990 c 0.833
#4 2000 a 130
#5 2000 b 170
#6 2000 c 0.765
data
df <- structure(list(group = c("a", "b", "a", "b"), year = c(1990,
1990, 2000, 2000), freq = c(100, 120, 130, 170)), row.names = c(NA,
-4L), class = "data.frame")
How can I get Cumsum table grouped by both Gender and State?
Gender = sample(c('male', 'female'), 100, replace=TRUE)
State = sample(c('CA', 'WA', 'NV', 'OR', "AZ"), 100, replace=TRUE)
Number = sample(1:8, size=100, replace=TRUE)
df <- data.frame(Gender,State, Number)
If we are looking for cumsum table, then
library(data.table)
dcast(setDT(df)[, .N, .(Gender, State, Number)
][, perc := round(100*N/sum(N), 2), .(Gender, State)],
Gender + State ~Number, value.var = 'perc', fill = 0, drop = FALSE)[,
(3:10) := lapply(Reduce(`+`, .SD, accumulate = TRUE),
function(x) paste0(x, "%")), .SDcols = -(1:2)][]
For a simpler approach, I would recommend using dplyr. Dplyr is loaded along with a bunch of other helpful packages when you load tidyverse.
library(tidyverse)
Gender = sample(c('male', 'female'), 100, replace=TRUE)
State = sample(c('CA', 'WA', 'NV', 'OR', "AZ"), 100, replace=TRUE)
Number = sample(1:8, size=100, replace=TRUE)
df <- data.frame(Gender,State, Number)
df <- df %>%
group_by(Gender, State) %>%
mutate(Number_CumSum = cumsum(Number)) %>%
ungroup() %>%
arrange(State, Gender)
head(df)
# A tibble: 6 x 4
Gender State Number Number_CumSum
<fctr> <fctr> <int> <int>
1 female AZ 8 8
2 female AZ 3 11
3 female AZ 4 15
4 female AZ 5 20
5 female AZ 2 22
6 female AZ 7 29