I want to generate a list of data frames and apply the same functions to each of them. I do not know how to do this elegantly without writing a very large number of lines of code.
From a dataframe df,
id <- c('a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd', 'e')
x <- rnorm(n = 10, mean = 25, sd = 3)
y <- rnorm(n = 10, mean = 45, sd = 4.5)
z <- rnorm(n = 10, mean = 70000, sd = 10)
type <- c(rep("gold", 2),
rep("silver", 4),
rep("bronze", 4))
df <- data.frame(id, x, y, z, type)
I create a bunch of other datasets with a simple threshold rule based on one variable
df_25 <- df[df$x < 25,]
df_20 <- df[df$x < 20,]
# and so on
I then apply functions to each dataset; I can do this to each dataset individually, or to a list of datasets
# individually
df <- df_18 %>%
  dplyr::group_by(id) %>%
  dplyr::mutate(nb1 = sum(x),
                nb2 = sum(x != 25))
# to a list
ls1 <- list(df_25, df_20)
func_1 <- function(x) {
  x %>%
    dplyr::group_by(id) %>%
    dplyr::mutate(nb1 = sum(x),
                  nb2 = sum(x != 25))
}
ls1 <- lapply(ls1, func_1)
df_25 <- ls1[[1]]
df_20 <- ls1[[2]]
In any case this takes both a lot of lines and a lot of time, as I am dealing with very large datasets. How could I simplify and speed up both the generation of datasets with proper, recognisable names and the creation of the new variables through the functions defined above?
I have not found a proper answer to this dual question yet and would welcome your help!
You could define a threshold vector and lapply your aggregation. In base R this could look like this:
threshold <- c(22, 24, 26)
res <- setNames(lapply(threshold, function(s) {
  sst <- df[df$x < s, ]
  merge(sst,
        with(sst, aggregate(list(nb1 = x, nb2 = x != 25),
                            by = list(id = id), sum)),
        by = "id")
}), threshold)
res
# $`22`
# id x y z type nb1 nb2
# 1 a 20.92786 37.61272 69976.23 gold 20.92786 1
# 2 b 20.64275 38.02056 69997.25 silver 20.64275 1
# 3 c 18.58916 46.08353 69985.98 silver 18.58916 1
#
# $`24`
# id x y z type nb1 nb2
# 1 a 22.73948 44.29524 70002.81 gold 43.66734 2
# 2 a 20.92786 37.61272 69976.23 gold 43.66734 2
# 3 b 20.64275 38.02056 69997.25 silver 20.64275 1
# 4 c 18.58916 46.08353 69985.98 silver 18.58916 1
#
# $`26`
# id x y z type nb1 nb2
# 1 a 22.73948 44.29524 70002.81 gold 43.66734 2
# 2 a 20.92786 37.61272 69976.23 gold 43.66734 2
# 3 b 20.64275 38.02056 69997.25 silver 20.64275 1
# 4 c 18.58916 46.08353 69985.98 silver 44.24036 2
# 5 c 25.65120 44.85778 70008.81 bronze 44.24036 2
# 6 d 24.84056 49.22505 69993.87 bronze 24.84056 1
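If you also want each subset available under a recognisable name rather than via res[[...]], which was the other half of the question, one option is to push the named list into the global environment (a sketch assuming a df_<threshold> naming scheme):
names(res) <- paste0("df_", threshold)
list2env(res, envir = .GlobalEnv)
# df_22, df_24 and df_26 now exist as individual data frames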
Data
df <- structure(list(id = structure(c(1L, 1L, 2L, 2L, 2L, 3L, 3L, 4L,
4L, 5L), .Label = c("a", "b", "c", "d", "e"), class = "factor"),
x = c(22.7394803492982, 20.927856140076, 30.2395154764033,
26.6955462205898, 20.6427460111819, 18.589158456851, 25.6511987559726,
24.8405634272769, 28.8534602413068, 26.5376546472448), y = c(44.2952365501829,
37.6127198429065, 45.2842176546081, 40.3835729432985, 38.0205610647157,
46.083525703352, 44.8577760657779, 49.2250487481642, 40.2699166395278,
49.3740993403725), z = c(70002.8091832317, 69976.2314543058,
70000.9974233725, 70011.435897774, 69997.249180665, 69985.9786882474,
70008.8088326676, 69993.8665395223, 69998.7334115052, 70001.2935411788
), type = structure(c(2L, 2L, 3L, 3L, 3L, 3L, 1L, 1L, 1L,
1L), .Label = c("bronze", "gold", "silver"), class = "factor")), class = "data.frame", row.names = c(NA,
-10L))
Using purrr::map to loop over the vector of thresholds
library(dplyr)
library(purrr)
map(c(18, 20, 25) %>% set_names(), ~ df %>%
      filter(x < .x) %>%
      group_by(id) %>%
      mutate(nb1 = sum(x),
             nb2 = sum(x != 25)))
Or use map_if to apply the calculation only to the df subsets with nrow() > 1.
map_if(c(18, 20, 25) %>% set_names(), ~ df %>% filter(x < .x) %>% nrow() > 1,
       ~ df %>% filter(x < .x) %>%
           group_by(id) %>%
           mutate(nb1 = sum(x),
                  nb2 = sum(x != 25)),
       .else = ~NA)
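Note that this map_if call filters df twice per threshold. A sketch that avoids the double filter by precomputing the subsets first (subs is just a hypothetical intermediate name):
subs <- map(c(18, 20, 25) %>% set_names(), ~ df %>% filter(x < .x))
map_if(subs, ~ nrow(.x) > 1,
       . %>% group_by(id) %>% mutate(nb1 = sum(x), nb2 = sum(x != 25)),
       .else = ~NA)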
Using the tidyverse, we can combine all these operations in one chain.
library(tidyverse)
df %>%
  group_split(x > 25, keep = FALSE) %>%
  map(. %>% group_by(id) %>% mutate(nb1 = sum(x), nb2 = sum(x != 25)))
#[[1]]
# A tibble: 6 x 7
# Groups: id [5]
# id x y z type nb1 nb2
# <fct> <dbl> <dbl> <dbl> <fct> <dbl> <int>
#1 a 21.4 42.9 70001. gold 21.4 1
#2 b 18.0 45.3 70005. silver 18.0 1
#3 c 23.3 42.7 70006. bronze 23.3 1
#4 d 23.4 40.9 69990. bronze 46.7 2
#5 d 23.3 41.2 70000. bronze 46.7 2
#6 e 22.3 55.9 69991. bronze 22.3 1
#[[2]]
# A tibble: 4 x 7
# Groups: id [3]
# id x y z type nb1 nb2
# <fct> <dbl> <dbl> <dbl> <fct> <dbl> <int>
#1 a 25.8 40.5 69995. gold 25.8 1
#2 b 28.3 41.5 69996. silver 54.5 2
#3 b 26.3 49.3 69993. silver 54.5 2
#4 c 26.5 44.5 69986. silver 26.5 1
Here, I have split the data into two groups based on the value of x: the first group holds the rows with x below 25 and the second the rows with x above 25. You can change the splitting logic to match your requirement.
This gives you a list of data frames as output, which you can access individually.
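If you want recognisable names here as well, you can set them on the list after the split; a sketch where below_25 and above_25 are hypothetical names:
res <- df %>%
  group_split(x > 25, keep = FALSE) %>%
  map(. %>% group_by(id) %>% mutate(nb1 = sum(x), nb2 = sum(x != 25))) %>%
  set_names(c("below_25", "above_25"))
res$below_25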
data
set.seed(1234)
id <- c('a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd', 'e')
x <- rnorm(n = 10, mean = 25, sd = 3)
y <- rnorm(n = 10, mean = 45, sd = 4.5)
z <- rnorm(n = 10, mean = 70000, sd = 10)
type <- c(rep("gold", 2),rep("silver", 4),rep("bronze", 4))
df <- data.frame(id, x, y, z, type)
Related
I have a tibble where the rows and columns are the same IDs, and I would like to take the mean (ignoring the NAs) to make the data frame symmetrical. I am struggling to see how.
data <- tibble(group = LETTERS[1:4],
A = c(NA, 10, 20, NA),
B = c(15, NA, 25, 30),
C = c(20, NA, NA, 10),
D = c(10, 12, 15, NA)
)
I would normally do
A <- as.matrix(data[-1])
(A + t(A))/2
But this does not work because of the NAs.
Edit: below is the expected output.
output <- tibble(group = LETTERS[1:4],
A = c(NA, 12.5, 20, 10),
B = c(12.5, NA, 25, 21),
C = c(20, 25, NA, 12.5),
D = c(10, 21, 12.5, NA))
Here is a suggestion using tidyverse code.
library(tidyverse)
data <- tibble(group = LETTERS[1:4],
A = c(NA, 10, 20, NA),
B = c(15, NA, 25, 30),
C = c(20, NA, NA, 10),
D = c(10, 12, 15, NA)
)
A <- data %>%
pivot_longer(-group, values_to = "x")
B <- t(data[-1]) %>%
as.data.frame() %>%
setNames(LETTERS[1:4]) %>%
rownames_to_column("group") %>%
pivot_longer(-group, values_to = "y") %>%
left_join(A, by = c("group", "name")) %>%
mutate(
mean = if_else(!(is.na(x) | is.na(y)), (x + y)/2, x),
mean = if_else(is.na(mean) & !is.na(y), y, mean)
) %>%
select(-x, -y) %>%
pivot_wider(names_from = name, values_from = mean)
B
## A tibble: 4 x 5
# group A B C D
# <chr> <dbl> <dbl> <dbl> <dbl>
#1 A NA 12.5 20 10
#2 B 12.5 NA 25 21
#3 C 20 25 NA 12.5
#4 D 10 21 12.5 NA
Okay, so this is how I ended up doing it. I would have preferred not to use a for loop because the actual data I have is much bigger, but beggars can't be choosers!
A <- as.matrix(data[-1])
for (i in 1:nrow(A)){
for (j in 1:ncol(A)){
if(is.na(A[i,j])){
A[i,j] <- A[j, i]
}
}
}
output <- (A + t(A))/2
output %>%
as_tibble() %>%
mutate(group = data$group) %>%
select(group, everything())
# A tibble: 4 x 5
group A B C D
<chr> <dbl> <dbl> <dbl> <dbl>
1 A NA 12.5 20 10
2 B 12.5 NA 25 21
3 C 20 25 NA 12.5
4 D 10 21 12.5 NA
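A vectorized alternative to the nested for loop, sketched on the same matrix A: average where both values exist, otherwise fall back to whichever side is present.
A <- as.matrix(data[-1])
B <- t(A)
M <- (A + B) / 2            # NA wherever either side is NA
M[is.na(A)] <- B[is.na(A)]  # fill from the mirrored value
M[is.na(B)] <- A[is.na(B)]  # ...or keep the original value
M %>%
  as_tibble() %>%
  mutate(group = data$group) %>%
  select(group, everything())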
I have several large data frames that I need to append data to. The data structure is simulated below:
Orders
set.seed(2)
N=1e2
tbl.orders <- tibble(
ID=1:N,
nb_products_ordered = sample(1:15, N, replace = TRUE),
type = sample(c("keyboard", "mouse", "other"), N, replace = TRUE),
grade= sample(LETTERS[1:5], N, replace=TRUE)
)
# A tibble: 100 x 4
ID nb_products_ordered type grade
<int> <int> <chr> <chr>
1 1 5 other A
2 2 15 keyboard A
3 3 6 other C
4 4 6 keyboard E
5 5 8 other C
Bulk pricing: prices vary depending on the number of products ordered.
The table shows the minimum number of products at which each bulk price applies.
Other product types are not priced and their per-unit cost should be NA.
tbl.prices <- tibble::tribble(
~min_products_ordered, ~per_unit_cost, ~type,
1L, 39.7, "mouse",
2L, 23.1, "mouse",
3L, 18.6, "mouse",
4L, 15, "mouse",
5L, 14.3, "mouse",
6L, 11, "mouse",
9L, 10.9, "mouse",
1L, 11, "keyboard",
9L, 10.9, "keyboard"
)
Below is my convoluted solution, which seems overly complex and is rather slow on my large data frames (500K+ rows each). Is there a simpler, faster way? I ultimately want it in a pipe because I have transformations before and after.
tbl.orders%>%
group_by(type)%>%
group_split()%>%
lapply(., function(x)
{
#if included in price list
if (x$type[1] %in% levels(factor(tbl.prices$type))) {
df.priceparameters <- tbl.prices %>%
filter(type == x$type[1])
x %>% mutate(
per_unit_cost =
as.numeric(
as.character(
cut(
x[["nb_products_ordered"]],
breaks = c(df.priceparameters$min_products_ordered, Inf),
#returns per unit cost
labels = df.priceparameters$per_unit_cost,
right = FALSE
)))
)
} else{
x %>% mutate(per_unit_cost = NA)
}
})%>%
do.call("rbind",.)%>%
arrange(ID)
Result
# A tibble: 100 x 5
ID nb_products_ordered type grade per_unit_cost
<int> <int> <chr> <chr> <dbl>
1 1 5 other A NA
2 2 15 keyboard A 10.9
3 3 6 other C NA
4 4 6 keyboard E 11
5 5 8 other C NA
Here is my attempt, which also seems a bit convoluted:
We write a function to select the correct value of prices for each ID.
library(dplyr)
select_row <- function(type, nb_products_ordered, min_products_ordered) {
  if (any(type == 'other')) return(TRUE)
  else {
    tmp <- first(nb_products_ordered) - min_products_ordered
    inds <- tmp >= 0
    if (any(inds)) return(tmp == min(tmp[inds], na.rm = TRUE))
    else TRUE
  }
}
Join the data frames by type and select the correct row for each ID.
tbl.orders %>%
left_join(tbl.prices, by = 'type') %>%
group_by(ID) %>%
filter(select_row(type, nb_products_ordered, min_products_ordered))
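An alternative sketch that avoids the row blow-up of the join: findInterval picks the applicable price tier per order, assuming tbl.prices is sorted by min_products_ordered within each type (as it is in the example):
tbl.orders %>%
  group_by(type) %>%
  mutate(per_unit_cost = {
    p <- tbl.prices[tbl.prices$type == type[1], ]
    idx <- findInterval(nb_products_ordered, p$min_products_ordered)
    c(NA_real_, p$per_unit_cost)[idx + 1]  # idx 0 (no tier, e.g. "other") -> NA
  }) %>%
  ungroup() %>%
  arrange(ID)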
I have a data frame with duplicated IDs. An ID stands for a specific entity. The IDs are duplicated because the dataset refers to a process that every entity can go through multiple times.
Here is a small example dat:
library(dplyr)
glimpse(dat)
Observations: 6
Variables: 3
$ ID <dbl> 1, 1, 1, 2, 2, 2
$ Amount <dbl> 10, 70, 80, 50, 10, 10
$ Product <fct> A, B, C, B, E, A
ID stands for the entity, Amount stands for the amount of money the entity has spent, and Product stands for the good the entity bought.
The issue is that I have to "condense" this data. So, every ID / entity may occur only once. For the continuous variable, this is not an issue because I can simply calculate the mean per ID.
library(tidyr)
dat_con_ID <- dat %>%
select(ID) %>%
unique()
dat_con_Amount <- dat %>%
group_by(ID) %>%
summarise(Amount = mean(Amount))
dat_con <- inner_join(dat_con_ID, dat_con_Amount, by = "ID")
glimpse(dat_con)
Observations: 2
Variables: 2
$ ID <dbl> 1, 2
$ Amount <dbl> 53.33333, 23.33333
The problem is that I can't calculate the mean of Product because it's a categorical variable. An option would be to make a dummy variable out of this factor and calculate the mean, but since the original data frame is really huge this is not a good solution. Any idea how to handle this problem?
Maybe you are trying to do this. I am using the data.table library. I also modified your data by adding one extra row for ID = 1, so that you can see the difference in the output.
Data:
library('data.table')
dat <- data.table(ID =as.double(c(1, 1, 1, 2, 2, 2,1)),
Amount = as.double(c( 10, 70, 80, 50, 10, 10, 20)),
Product = factor( c('A', 'B', 'C', 'B', 'E', 'A', 'A')))
Code:
# average amount per id
dat[, .(avg_amt = mean(Amount)), by = .(ID) ]
# ID avg_amt
# 1: 1 45.00000
# 2: 2 23.33333
# average product per id
dat[, .SD[, .N, by = Product ][, .( avg_pdt = N/sum(N), Product)], by = .(ID) ]
# ID avg_pdt Product
# 1: 1 0.5000000 A
# 2: 1 0.2500000 B
# 3: 1 0.2500000 C
# 4: 2 0.3333333 B
# 5: 2 0.3333333 E
# 6: 2 0.3333333 A
# combining average amount and average product per id
dat[, .SD[, .N, by = Product ][, .( Product,
avg_pdt = N/sum(N),
avg_amt = mean(Amount))],
by = .(ID) ]
# ID Product avg_pdt avg_amt
# 1: 1 A 0.5000000 45.00000
# 2: 1 B 0.2500000 45.00000
# 3: 1 C 0.2500000 45.00000
# 4: 2 B 0.3333333 23.33333
# 5: 2 E 0.3333333 23.33333
# 6: 2 A 0.3333333 23.33333
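A flatter sketch of the same combined result that avoids the nested .SD (pdt and amt are hypothetical intermediate names):
pdt <- dat[, .N, by = .(ID, Product)][, avg_pdt := N / sum(N), by = ID]
amt <- dat[, .(avg_amt = mean(Amount)), by = ID]
pdt[amt, on = "ID"][, .(ID, Product, avg_pdt, avg_amt)]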
Edit
Another idea would be to count Product per ID, calculating the mean of Amount and the relative frequency of each product, then spread the data by Product to end up with the data in wide format. This way, every ID / entity occurs only once.
dat %>%
add_count(Product, ID) %>%
group_by(ID) %>%
mutate(Amount = mean(Amount),
n = n / n()) %>%
unique() %>%
spread(Product, n, sep = "_") %>%
ungroup()
# A tibble: 2 x 6
# ID Amount Product_A Product_B Product_C Product_E
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 1. 45.0 0.500 0.250 0.250 NA
#2 2. 23.3 0.333 0.333 NA 0.333
My first attempt, not what OP was looking for but in case someone is interested:
As suggested by #steveb in the comments, you could summarise Product as a string.
library(dplyr)
dat %>%
group_by(ID) %>%
summarise(Amount = mean(Amount),
Product = toString( sort(unique(Product)))
)
# A tibble: 2 x 3
# ID Amount Product
# <dbl> <dbl> <chr>
#1 1. 45.0 A, B, C
#2 2. 23.3 A, B, E
data
dat <- structure(list(ID = c(1, 1, 1, 2, 2, 2, 1), Amount = c(10, 70,
80, 50, 10, 10, 20), Product = structure(c(1L, 2L, 3L, 2L, 4L,
1L, 1L), .Label = c("A", "B", "C", "E"), class = "factor")), .Names = c("ID",
"Amount", "Product"), row.names = c(NA, -7L), class = c("tbl_df",
"tbl", "data.frame"))
I have two data frames. dfOne is made like this:
X Y Z T J
3 4 5 6 1
1 2 3 4 1
5 1 2 5 1
and dfTwo is made like this
C.1 C.2
X Z
Y T
I want to obtain a new data frame containing only the rows whose X, Y, Z, and T values are simultaneously greater than specific thresholds.
Example. I need simultaneously (in the same row):
X, Y > 2
Z, T > 4
I need to use the second data frame to reach my objective; I expect something like:
dfTwo$C.1>2
so the result would be a new dataframe with this structure:
X Y Z T J
3 4 5 6 1
How could I do it?
Here is a base R method with Map and Reduce.
# build lookup table of thresholds relative to variable name
vals <- setNames(c(2, 2, 4, 4), unlist(dat2))
# subset data.frame
dat[Reduce("&", Map(">", dat[names(vals)], vals)), ]
X Y Z T J
1 3 4 5 6 1
Here, Map returns a list of length 4 with logical variables corresponding to each comparison. This list is passed to Reduce which returns a single logical vector with length corresponding to the number of rows in the data.frame, dat. This logical vector is used to subset dat.
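To see the intermediate pieces of this pipeline (illustration only, reusing dat and vals from above):
Map(">", dat[names(vals)], vals)              # list of 4 logical vectors, one per comparison
Reduce("&", Map(">", dat[names(vals)], vals)) # one logical value per row of dat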
data
dat <-
structure(list(X = c(3L, 1L, 5L), Y = c(4L, 2L, 1L), Z = c(5L,
3L, 2L), T = c(6L, 4L, 5L), J = c(1L, 1L, 1L)), .Names = c("X",
"Y", "Z", "T", "J"), class = "data.frame", row.names = c(NA,
-3L))
dat2 <-
structure(list(C.1 = structure(1:2, .Label = c("X", "Y"), class = "factor"),
C.2 = structure(c(2L, 1L), .Label = c("T", "Z"), class = "factor")), .Names = c("C.1",
"C.2"), class = "data.frame", row.names = c(NA, -2L))
We can use the purrr package
Here is the input data.
# Data frame from lmo's solution
dat <-
structure(list(X = c(3L, 1L, 5L), Y = c(4L, 2L, 1L), Z = c(5L,
3L, 2L), T = c(6L, 4L, 5L), J = c(1L, 1L, 1L)), .Names = c("X",
"Y", "Z", "T", "J"), class = "data.frame", row.names = c(NA,
-3L))
# A numeric vector to show the threshold values
# Notice that columns without any requirements need NA
vals <- c(X = 2, Y = 2, Z = 4, T = 4, J = NA)
Here is the implementation
library(purrr)
map2_dfc(dat, vals, ~ifelse(.x > .y | is.na(.y), .x, NA)) %>% na.omit()
# A tibble: 1 x 5
X Y Z T J
<int> <int> <int> <int> <int>
1 3 4 5 6 1
map2_dfc loops through each column in dat and each value in vals in parallel, applying the supplied function. ~ifelse(.x > .y | is.na(.y), .x, NA) means: if a number in a column is larger than the corresponding value in vals, or the vals entry is NA, the output is the original value from the column; otherwise, the value is replaced with NA. The output of map2_dfc(dat, vals, ~ifelse(.x > .y | is.na(.y), .x, NA)) is a data frame with NA values in the rows where the condition is not met. Finally, na.omit removes those rows.
Update
Here I demonstrate how to convert the dfTwo data frame to the vals vector in my example.
First, let's create the dfTwo data frame.
dfTwo <- read.table(text = "C.1 C.2
X Z
Y T",
header = TRUE, stringsAsFactors = FALSE)
dfTwo
C.1 C.2
1 X Z
2 Y T
To complete the task, I load the dplyr and tidyr package.
library(dplyr)
library(tidyr)
Now I begin the transformation of dfTwo. The first step is to use the stack function to convert the format.
dfTwo2 <- dfTwo %>%
stack() %>%
setNames(c("Col", "Group")) %>%
mutate(Group = as.character(Group))
dfTwo2
Col Group
1 X C.1
2 Y C.1
3 Z C.2
4 T C.2
The second step is to add the threshold information. One way to do this is to create a look-up table showing the association between Group and Value.
threshold_df <- data.frame(Group = c("C.1", "C.2"),
Value = c(2, 4),
stringsAsFactors = FALSE)
threshold_df
Group Value
1 C.1 2
2 C.2 4
And then we can use the left_join function to combine the data frame.
dfTwo3 <- dfTwo2 %>% left_join(threshold_df, by = "Group")
dfTwo3
Col Group Value
1 X C.1 2
2 Y C.1 2
3 Z C.2 4
4 T C.2 4
Now for the third step. Notice that there is a column called J which does not need any threshold, so we need to add this information to dfTwo3. We can use the complete function from tidyr. The following code completes the data frame by adding any Col present in dat but not in dfTwo3, with NA as its Value.
dfTwo4 <- dfTwo3 %>% complete(Col = colnames(dat))
dfTwo4
# A tibble: 5 x 3
Col Group Value
<chr> <chr> <dbl>
1 J <NA> NA
2 T C.2 4
3 X C.1 2
4 Y C.1 2
5 Z C.2 4
The fourth step is to arrange dfTwo4 in the right order. We can achieve this by turning Col into a factor whose levels follow the order of the column names in dat.
dfTwo5 <- dfTwo4 %>%
mutate(Col = factor(Col, levels = colnames(dat))) %>%
arrange(Col) %>%
mutate(Col = as.character(Col))
dfTwo5
# A tibble: 5 x 3
Col Group Value
<chr> <chr> <dbl>
1 X C.1 2
2 Y C.1 2
3 Z C.2 4
4 T C.2 4
5 J <NA> NA
We are almost there. Now we can create vals from dfTwo5.
vals <- dfTwo5$Value
names(vals) <- dfTwo5$Col
vals
X Y Z T J
2 2 4 4 NA
Now we are ready to use the purrr package to filter the data.
The above is a breakdown of the steps. We can combine them all into the following code for simplicity.
library(dplyr)
library(tidyr)
threshold_df <- data.frame(Group = c("C.1", "C.2"),
Value = c(2, 4),
stringsAsFactors = FALSE)
dfTwo2 <- dfTwo %>%
stack() %>%
setNames(c("Col", "Group")) %>%
mutate(Group = as.character(Group)) %>%
left_join(threshold_df, by = "Group") %>%
complete(Col = colnames(dat)) %>%
mutate(Col = factor(Col, levels = colnames(dat))) %>%
arrange(Col) %>%
mutate(Col = as.character(Col))
vals <- dfTwo2$Value
names(vals) <- dfTwo2$Col
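Then, as before, filter with the map2_dfc call shown earlier:
library(purrr)
map2_dfc(dat, vals, ~ifelse(.x > .y | is.na(.y), .x, NA)) %>% na.omit()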
Another base R option intersects the row indices that satisfy each condition:
dfOne[Reduce(intersect, list(which(dfOne["X"] > 2),
which(dfOne["Y"] > 2),
which(dfOne["Z"] > 4),
which(dfOne["T"] > 4))),]
# X Y Z T J
#1 3 4 5 6 1
Or iteratively (so fewer inequalities are tested):
vals = c(X = 2, Y = 2, Z = 4, T = 4) # from #lmo's answer
dfOne[Reduce(intersect, lapply(names(vals), function(x) which(dfOne[x] > vals[x]))),]
# X Y Z T J
#1 3 4 5 6 1
I'm writing this assuming that the second DF is meant to categorize the fields in the first DF. It's way simpler if you don't need to use the second one to define the conditions:
dfNew = dfOne[dfOne$X > 2 & dfOne$Y > 2 & dfOne$Z > 4 & dfOne$T > 4, ]
Or, using dplyr:
library(dplyr)
dfNew = dfOne %>% filter(X > 2 & Y > 2 & Z > 4 & T > 4)
In case that's all you need, I'll save this comment while I poke at the more complicated version of the question.
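For the version that does use the second data frame, a sketch with dplyr >= 1.0's if_all() (assuming the named threshold vector vals from the earlier answers has already been built):
vals <- c(X = 2, Y = 2, Z = 4, T = 4)
dfOne %>% filter(if_all(all_of(names(vals)), ~ .x > vals[cur_column()]))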
I've been using the dplyr package to create aggregated data tables, for example using the following code:
agg_data <- df %>%
select(calc.method, price1, price2) %>%
group_by(calc.method) %>%
summarize(
count = n(),
mean_price1 = round(mean(price1, na.rm = TRUE),2),
mean_price2 = round(mean(price2, na.rm = TRUE),2))
However, I would like to calculate the mean over only the distinct values of price1 and price2 within groups, e.g.:
Price1: 1 1 2 1 2 2 1
Goes to (before aggregation):
Price1: 1 2 1 2 1
(and in general price1 and price2 don't have the same number of values left after removal). I would also like to calculate a count for each of price1 and price2, counting only distinct values within groups. (Groups are defined as two or more identical values adjacent to each other.)
I have tried:
agg_data <- df %>%
select(calc.method, price1, price2) %>%
group_by(calc.method) %>%
summarize(
count = n(),
mean_price1 = round(mean(distinct(price1), na.rm = TRUE),2),
mean_price2 = round(mean(distinct(price2), na.rm = TRUE),2))
I also tried wrapping the columns within the select function with distinct(), but both of these throw errors.
Is there a way to do this using dplyr or another similar package without having to write something from scratch?
To satisfy your requirement for distinct, we need to remove successive values that are the same. For numeric vectors, this can be accomplished by:
x <- x[c(1, which(diff(x) != 0)+1)]
The default use of diff computes the difference between adjoining elements in the vector. We use this to detect successive values that are different, for which diff(x) != 0. Since the output differences are lagged by 1, we add 1 to the indices of these distinct elements, and we also want the first element as distinct. For example:
x <- c(1,1,2,1,2,2,1)
x <- x[c(1, which(diff(x) != 0)+1)]
##[1] 1 2 1 2 1
We can then use this with dplyr:
agg_data <- df %>% group_by(calc.method) %>%
summarize(count = n(),
count_non_rep_1 = length(price1[c(1,which(diff(price1) != 0)+1)]),
mean_price1 = round(mean(price1[c(1,which(diff(price1) != 0)+1)], na.rm=TRUE),2),
count_non_rep_2 = length(price2[c(1,which(diff(price2) != 0)+1)]),
mean_price2 = round(mean(price2[c(1,which(diff(price2) != 0)+1)], na.rm=TRUE),2))
or, better yet, define the function:
remove.repeats <- function(x) {
x[c(1,which(diff(x) != 0)+1)]
}
and use it with dplyr:
agg_data <- df %>% group_by(calc.method) %>%
summarize(count = n(),
count_non_rep_1 = length(remove.repeats(price1)),
mean_price1 = round(mean(remove.repeats(price1), na.rm=TRUE),2),
count_non_rep_2 = length(remove.repeats(price2)),
mean_price2 = round(mean(remove.repeats(price2), na.rm=TRUE),2))
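With newer dplyr (>= 1.0, an assumption about your version), across() avoids repeating each column by hand; the output columns will be named price1_count_non_rep, price1_mean, and so on:
agg_data <- df %>%
  group_by(calc.method) %>%
  summarize(count = n(),
            across(c(price1, price2),
                   list(count_non_rep = ~ length(remove.repeats(.x)),
                        mean = ~ round(mean(remove.repeats(.x), na.rm = TRUE), 2))))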
Using this on some example data that is hopefully similar to yours:
df <- structure(list(calc.method = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("A", "B"), class = "factor"),
price1 = c(1, 1, 2, 1, 2, 2, 1, 1, 1, 2, 2, 2, 2, 1, 3),
price2 = c(1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 2, 1, 2, 1)),
.Names = c("calc.method", "price1", "price2"), row.names = c(NA, -15L), class = "data.frame")
## calc.method price1 price2
##1 A 1 1
##2 A 1 1
##3 A 2 1
##4 A 1 1
##5 A 2 1
##6 A 2 1
##7 A 1 1
##8 B 1 2
##9 B 1 1
##10 B 2 2
##11 B 2 1
##12 B 2 2
##13 B 2 1
##14 B 1 2
##15 B 3 1
We get:
print(agg_data)
### A tibble: 2 x 6
## calc.method count count_non_rep_1 mean_price1 count_non_rep_2 mean_price2
## <fctr> <int> <int> <dbl> <int> <dbl>
##1 A 7 5 1.40 1 1.0
##2 B 8 4 1.75 8 1.5