Here is a piece of my data:
data_x <- tribble(
~price, ~bokey, ~id, ~cost, ~revenue,
1, "a", 10, 0.20, 30,
2, "b", 20, 0.30, 60,
3, "c", 20, 0.30, 40,
4, "d", 10, 0.20, 100,
5, "e", 30, 0.10, 40,
6, "f", 10, 0.20, 10,
1, "g", 20, 0.30, 80,
2 , "h", 10, 0.20, 20,
3, "h", 30, 0.10, 20,
3, "i", 20, 0.30, 40,
)
As you see, there are three different type of IDs: 10, 20, 30. But in the real data, there are almost 100 ids. I want to aggregate the data based on these ids. Because I don't know how to do it in loop, I basically created some subsets:
data_10 <- data_x %>% filter(id == 10)
data_20 <- data_x %>% filter(id == 20)
data_30 <- data_x %>% filter(id == 30)
Here is the aggregated data:
data_agg <- data_10 %>%
group_by(priceseg = cut(as.numeric(price), c(0, 1, 3, 5, 6))) %>%
summarise(price_n = n_distinct(bokey),
Cost = sum(cost, na.rm = T),
Revenue = sum(revenue, na.rm = T),
clicks = n_distinct(bokey)) %>%
mutate(price_n2 = round(100 * prop.table(price_n), 2),
(zet = Cost/Revenue))
But I want to have one more column that shows the id. Here is the desired data:
data_desired <- tribble(
~id, ~priceseg, ~price_n, ~Cost, ~Revenue, ~clicks, ~price_n2, ~`(zet = Cost/Revenue)`
10, (0,1] 1 0.2 30 1 25 0.00667
10, (1,3] 1 0.2 20 1 25 0.01
10, (3,5] 1 0.2 100 1 25 0.002
10, (5,6] 1 0.2 10 1 25 0.02
20,
20,
.
.
) 30,
How can I get it?
Since you are already using dplyr, just add id as one of the grouping variables (no need to previously separate your data):
data_agg <- data_x %>%
group_by(id, priceseg = cut(as.numeric(price), c(0, 1, 3, 5, 6))) %>%
summarise(price_n = n_distinct(bokey),
Cost = sum(cost, na.rm = T),
Revenue = sum(revenue, na.rm = T),
clicks = n_distinct(bokey)) %>%
mutate(price_n2 = round(100 * prop.table(price_n), 2),
(zet = Cost/Revenue))
# A tibble: 8 x 8
# Groups: id [3]
# id priceseg price_n Cost Revenue clicks price_n2 `(zet = Cost/Revenue)`
# <dbl> <fct> <int> <dbl> <dbl> <int> <dbl> <dbl>
# 1 10 (0,1] 1 0.2 30 1 25 0.00667
# 2 10 (1,3] 1 0.2 20 1 25 0.01
# 3 10 (3,5] 1 0.2 100 1 25 0.002
# 4 10 (5,6] 1 0.2 10 1 25 0.02
# 5 20 (0,1] 1 0.3 80 1 25 0.00375
# 6 20 (1,3] 3 0.900 140 3 75 0.00643
# 7 30 (1,3] 1 0.1 20 1 50 0.005
# 8 30 (3,5] 1 0.1 40 1 50 0.0025
An option is to split and loop over with map while specifying the .id
library(dplyr)
library(purrr)
data_x %>%
split(.$id) %>%
map_dfr(~
.x %>%
group_by(priceseg = cut(as.numeric(price), c(0, 1, 3, 5, 6))) %>%
summarise(price_n = n_distinct(bokey),
Cost = sum(cost, na.rm = T),
Revenue = sum(revenue, na.rm = T),
clicks = n_distinct(bokey)) %>%
mutate(price_n2 = round(100 * prop.table(price_n), 2),
(zet = Cost/Revenue)), .id = "id" )
# A tibble: 8 x 8
# id priceseg price_n Cost Revenue clicks price_n2 `(zet = Cost/Revenue)`
# <chr> <fct> <int> <dbl> <dbl> <int> <dbl> <dbl>
#1 10 (0,1] 1 0.2 30 1 25 0.00667
#2 10 (1,3] 1 0.2 20 1 25 0.01
#3 10 (3,5] 1 0.2 100 1 25 0.002
#4 10 (5,6] 1 0.2 10 1 25 0.02
#5 20 (0,1] 1 0.3 80 1 25 0.00375
#6 20 (1,3] 3 0.900 140 3 75 0.00643
#7 30 (1,3] 1 0.1 20 1 50 0.005
#8 30 (3,5] 1 0.1 40 1 50 0.0025
The cut step can also be changed with findInterval
NOTE: The idea of split/map is based on the OP's title about looping and getting the output
Related
Using tidyr, how can I create a new column through a group-by and calculation?
For example, if I have this dataframe:
name <- c("a", "a", "a", "a", "b", "b", "b", "b")
x1 <- c(0, 0, 0, 0, 1, 1, 1, 1)
x2 <- c(15, 15, 15, 15, 15, 15, 15, 15)
y <- c(1, 2, 1, 2, 1, 2, 1, 2)
z <- c(50, 100, 40, 90, 65, 95, 40, 95)
df <- data.frame(name, x1, x2, y, z)
Let's say I want to (1) group-by x1 and x2; (2) find the max z value in that group; and (3) create a new column z2 that normalized z by that maximum.
So in this case, the expected output for z2 is c(0.5, 1, 0.4, 0.9, 0.684, 1, 0.421, 1).
We could simply group by 'x1', 'x2' and create the column with mutate
library(dplyr)
df <- df %>%
group_by(x1, x2) %>%
mutate(z2 = (z/max(z, na.rm = TRUE))) %>%
ungroup
-output
df
# A tibble: 8 × 6
name x1 x2 y z z2
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 a 0 15 1 50 0.5
2 a 0 15 2 100 1
3 a 0 15 1 40 0.4
4 a 0 15 2 90 0.9
5 b 1 15 1 65 0.684
6 b 1 15 2 95 1
7 b 1 15 1 40 0.421
8 b 1 15 2 95 1
I am trying to subtract the value of one group from another. I am hoping to use tidyverse
structure(list(A = c(1, 1, 1, 2, 2, 2, 3, 3, 3), group = c("a",
"b", "c", "a", "b", "c", "a", "b", "c"), value = c(10, 11, 12,
11, 40, 23, 71, 72, 91)), class = "data.frame", row.names = c(NA,
-9L))
That is my data, and I want to subtract all values of group A from B and C, and store the difference in one variable.
baseR solution
df$new <- df$value - ave(df$value, df$A, FUN = function(x) mean(x[df$group == 'a'], na.rm = T) )
> df
A group value new
1 1 a 10 0
2 1 b 11 1
3 1 c 12 2
4 2 a 11 0
5 2 b 40 29
6 2 c 23 12
7 3 a 71 0
8 3 b 72 1
9 3 c 91 20
dplyr method (assumption there is not more than one a value per group, else R will confuse which value to substract and result in error)
df %>% group_by(A) %>% mutate(new = ifelse(group != 'a', value - value[group == 'a'], value) )
# A tibble: 9 x 4
# Groups: A [3]
A group value new
<dbl> <chr> <dbl> <dbl>
1 1 a 10 10
2 1 b 11 1
3 1 c 12 2
4 2 a 11 11
5 2 b 40 29
6 2 c 23 12
7 3 a 71 71
8 3 b 72 1
9 3 c 91 20
or if you want to change all values
df %>% group_by(A) %>% mutate(new = value - value[group == 'a'] )
# A tibble: 9 x 4
# Groups: A [3]
A group value new
<dbl> <chr> <dbl> <dbl>
1 1 a 10 0
2 1 b 11 1
3 1 c 12 2
4 2 a 11 0
5 2 b 40 29
6 2 c 23 12
7 3 a 71 0
8 3 b 72 1
9 3 c 91 20
I only used data.table rather than data.frame because I'm more familiar.
library(data.table)
data <- setDT(structure(list(A = c(1, 1, 1, 2, 2, 2, 3, 3, 3), group = c("a",
"b", "c", "a", "b", "c", "a", "b", "c"), value = c(10, 11, 12,
11, 40, 23, 71, 72, 91)), class = "data.frame", row.names = c(NA,-9L)))
for (i in 1:length(unique(data$A))){
data[A == i, substraction := data[A == i, 'value'] - data[A == i & group == 'a', value]]
}
Title is complicated, but I don't know how to put this problem into words. So I'll demonstrate.
Here's my problem, with the desired output:
library(tibble)
# Input:
tribble(
~n_1, ~n_2, ~n_3, ~pct_1, ~pct_2, ~pct_3,
10, 20, 30, 0.1, 0.2, 0.3
)
#> # A tibble: 1 x 6
#> n_1 n_2 n_3 pct_1 pct_2 pct_3
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 10 20 30 0.1 0.2 0.3
# Desired output:
tribble(
~name, ~n, ~pct,
1, 10, 0.1,
2, 20, 0.2,
3, 30, 0.3
)
#> # A tibble: 3 x 3
#> name n pct
#> <dbl> <dbl> <dbl>
#> 1 1 10 0.1
#> 2 2 20 0.2
#> 3 3 30 0.3
I tried tidyr::pivot_longer(), but I can't get it right. Is there any way?
One option could be:
df %>%
pivot_longer(everything(),
names_to = c(".value", "name"),
names_pattern = "(.*)_(.)")
name n pct
<chr> <dbl> <dbl>
1 1 10 0.1
2 2 20 0.2
3 3 30 0.3
Try this approach. As your main variable is concatenated you can use separate() (using sep='_') after pivot_longer() and then pivot_wider() to obtain the expected dataframe. Here the code:
library(tidyverse)
#Code
df %>% pivot_longer(cols = everything()) %>%
separate(name,into = c('var','name'),sep = '_') %>%
pivot_wider(names_from = var,values_from=value)
Output:
# A tibble: 3 x 3
name n pct
<chr> <dbl> <dbl>
1 1 10 0.1
2 2 20 0.2
3 3 30 0.3
Some data used (the one you provided):
#Data
df <- structure(list(n_1 = 10, n_2 = 20, n_3 = 30, pct_1 = 0.1, pct_2 = 0.2,
pct_3 = 0.3), row.names = c(NA, -1L), class = c("tbl_df",
"tbl", "data.frame"))
I have the following two data frames:
df1 <- data.frame(Category = c("A", "A", "A", "B", "B", "B", "C", "C", "C"),
Date = c(2001, 2002, 2003, 2001, 2002, 2003, 2001, 2002, 2003),
Beta1 = c(1, 3, 4, 4, 5, 3, 5, 3, 1),
Beta2 = c(2, 4, 6, 1, 1, 2, 5, 4, 2))
df2 <- data.frame(Date = c(2001, 2002, 2003),
Column1 = c(10, 20, 30),
Column2 = c(40, 50, 60))
Say I assign category A to Column1 and and category C to Column2. I want to multiply the row value from Column1 with the row betas from category A, if the dates match. Similarly, I want to multiply the row value from Column2 with the row betas from category C, if the dates match.
The match between a category and a column is of my own choosing. Assigning this myself won’t be a problem I think because I have relatively few columns.
Preferably, I want the output to look like this:
results <- data.frame(Date = c(2001, 2002, 2003),
Column1_categoryA_beta1 = c(10, 60, 120),
Column1_categoryA_beta2 = c(20, 80, 180),
Column2_categoryC_beta1 = c(200, 150, 60),
Column2_categoryC_beta2 = c(200, 200, 120))
Any help in how I best can approach this problem is very much appreciated!
With some data wrangling using tidyr and dplyr this can be achieved like so:
df1 <- data.frame(Category = c("A", "A", "A", "B", "B", "B", "C", "C", "C"),
Date = c(2001, 2002, 2003, 2001, 2002, 2003, 2001, 2002, 2003),
Beta1 = c(1, 3, 4, 4, 5, 3, 5, 3, 1),
Beta2 = c(2, 4, 6, 1, 1, 2, 5, 4, 2))
df2 <- data.frame(Date = c(2001, 2002, 2003),
Column1 = c(10, 20, 30),
Column2 = c(40, 50, 60))
library(dplyr)
library(tidyr)
df2_long <- df2 %>%
pivot_longer(-Date, names_to = "Column", values_to = "Value") %>%
mutate(Category = ifelse(Column == "Column1", "A", "C"))
df2_long %>%
left_join(df1) %>%
mutate(Beta1 = Value * Beta1,
Beta2 = Value * Beta2) %>%
select(Date, Category, Column, Beta1, Beta2) %>%
pivot_wider(id_cols = Date, names_from = c("Column", "Category"), values_from = c("Beta1", "Beta2"))
#> Joining, by = c("Date", "Category")
#> Warning: Column `Category` joining character vector and factor, coercing into
#> character vector
#> # A tibble: 3 x 5
#> Date Beta1_Column1_A Beta1_Column2_C Beta2_Column1_A Beta2_Column2_C
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 2001 10 200 20 200
#> 2 2002 60 150 80 200
#> 3 2003 120 60 180 120
Created on 2020-04-14 by the reprex package (v0.3.0)
One way to get there while keeping the Category variable in the final data frame is the following:
df3 <- left_join(df1, df2, by="Date")
df4 <- df3 %>%
group_by(Date, Category) %>%
mutate(Col1Bet1 = Column1 * Beta1, Col1Bet2 = Column1 * Beta2, Col2Bet1 = Column2 * Beta1, Col2Bet2 = Column2 * Beta2)
which gives the following:
# A tibble: 9 x 10
# Groups: Date, Category [9]
Category Date Beta1 Beta2 Column1 Column2 Col1Bet1 Col1Bet2 Col2Bet1 Col2Bet2
<fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 2001 1 2 10 40 10 20 40 80
2 A 2002 3 4 20 50 60 80 150 200
3 A 2003 4 6 30 60 120 180 240 360
4 B 2001 4 1 10 40 40 10 160 40
5 B 2002 5 1 20 50 100 20 250 50
6 B 2003 3 2 30 60 90 60 180 120
7 C 2001 5 5 10 40 50 50 200 200
8 C 2002 3 4 20 50 60 80 150 200
9 C 2003 1 2 30 60 30 60 60 120
This could be a start. The result data.table has all information you want just in another format.
df3 <- merge(df1, df2)
df3$b1 <- ifelse(df3$Category=="A", df3$Beta1*df3$Column1, ifelse(df3$Category=="C", df3$Beta1*df3$Column2, NA))
df3$b2 <- ifelse(df3$Category=="A", df3$Beta2*df3$Column1, ifelse(df3$Category=="C", df3$Beta2*df3$Column2, NA))
# Date Category Beta1 Beta2 Column1 Column2 b1 b2
# 1 2001 A 1 2 10 40 10 20
# 2 2001 C 5 5 10 40 200 200
# 3 2001 B 4 1 10 40 NA NA
# 4 2002 A 3 4 20 50 60 80
# 5 2002 B 5 1 20 50 NA NA
# 6 2002 C 3 4 20 50 150 200
# 7 2003 B 3 2 30 60 NA NA
# 8 2003 A 4 6 30 60 120 180
# 9 2003 C 1 2 30 60 60 120
I am working with a data set of patients' health state over time.
I would like to compute the data frame of transitions
from the current health state to the next health state.
Here is an example where the health state is measured
only by AFP level and weight.
The health state measurements might look like the following:
x <- data.frame(id = c(1, 1, 1, 2, 2, 2),
day = c(1, 2, 3, 1, 2, 3),
event = c('status', 'status', 'death', 'status', 'status', 'status'),
afp = c(10, 50, NA, 20, 30, 40),
weight = c(100, 105, NA, 200, 200, 200))
The desired output looks like the following:
y <- data.frame(id = c(1, 1, 2, 2),
current_afp = c(10, 50, 20, 30),
current_weight = c(100, 105, 200, 200),
next_event = c('status', 'death', 'status', 'status'),
next_afp = c(50, NA, 30, 40),
next_weight = c(105, NA, 200, 200))
One inefficient way to obtain the output is:
take the cross product of the measurements data frame with itself
keep only rows with matching ids, and day.x + 1 = day.y
rename the columns
Is there a more efficient way to obtain the output?
Note: The real measurements data frame can have more than 10 columns,
so it is not very efficient from a lines of code perspective
to explicitly write
current_afp = x$afp[1:(n-1)],
next_afp = x$afp[2:n]
...
and so on.
You could try:
library(dplyr)
x %>%
mutate_each(funs(lead(.)), -id, -day) %>%
full_join(x, ., by = c("id", "day")) %>%
select(-event.x) %>%
setNames(c(names(.)[1:2],
paste0("current_", sub("\\..*","", names(.)[3:4])),
paste0("next_", sub("\\..*","", names(.)[5:7])))) %>%
group_by(id) %>%
filter(day != last(day))
Which gives:
# id day current_afp current_weight next_event next_afp next_weight
#1 1 1 10 100 status 50 105
#2 1 2 50 105 death NA NA
#3 2 1 20 200 status 30 200
#4 2 2 30 200 status 40 200
Using base R with a split-apply-combine approach
res <- lapply(split(x[-2], x$id), function(y) {
xx <- cbind(y[1:(nrow(y)-1), ], y[2:nrow(y), -1])
colnames(xx) <- c("id", paste("current", colnames(y)[-1], sep="_"),
paste("next", colnames(y)[-1], sep="_"))
xx[, which(colnames(xx) != "current_event")]
})
do.call(rbind, res)
id current_afp current_weight next_event next_afp next_weight
1 1 10 100 status 50 105
2 1 50 105 death NA NA
3 2 20 200 status 30 200
4 2 30 200 status 40 200
Or, an example where not all days are in sequence
x <- data.frame(id = c(1, 1, 1, 2, 2, 2),
day = c(1, 2, 3, 1, 2, 4),
event = c('status', 'status', 'death', 'status', 'status', 'status'),
afp = c(10, 50, NA, 20, 30, 40),
weight = c(100, 105, NA, 200, 200, 200))
x
id day event afp weight
1 1 1 status 10 100
2 1 2 status 50 105
3 1 3 death NA NA
4 2 1 status 20 200
5 2 2 status 30 200
6 2 4 status 40 200
Some of the transitions are NA, which could be removed if desired.
res <- lapply(split(x, x$id), function(y) {
y <- merge(data.frame(id=unique(y$id), day = 1:max(y$day)), y,
by = c("id", "day"), all.x=TRUE)[, -2]
xx <- cbind(y[1:(nrow(y)-1), ], y[2:nrow(y), -1])
colnames(xx) <- c("id", paste("current", colnames(y)[-1], sep="_"),
paste("next", colnames(y)[-1], sep="_"))
xx[, which(colnames(xx) != "current_event")]
})
do.call(rbind, res)
id current_afp current_weight next_event next_afp next_weight
1.1 1 10 100 status 50 105
1.2 1 50 105 death NA NA
2.1 2 20 200 status 30 200
2.2 2 30 200 <NA> NA NA
2.3 2 NA NA status 40 200