Consider the following data frame obtained after a cbind operation on two lists
> fl
x meanlist
1 1 48.5
2 2 32.5
3 3 28.0
4 4 27.0
5 5 25.5
6 6 20.5
7 7 27.0
8 8 24.0
class_median <- list(0, 15, 25, 35, 45)
class_list <- list(0:10, 10:20, 20:30, 30:40, 40:50)
The values in class_median represent classes -10 to +10, 10 to 20, 20 to 30 etc
Firstly, I am trying to group the values in fl$meanlist as per the classes in class_list. Secondly, I am trying to return one value per class which is closest to the median values as follows
> fl_subset
x meanlist cm
1 1 48.5 45
2 2 32.5 35
3 5 25.5 25
I am trying to use loops to compare but it seems to be long and unmanageable and the result is not correct
Here's an approach with dplyr:
library(dplyr)
# do a little prep--name classes, extract breaks, put medians in a data frame
names(class_list) = letters[seq_along(class_list)]
breaks = c(min(class_list[[1]]), sapply(class_list, max))
med_data = data.frame(median = unlist(class_median), class = names(class_list))
fl %>%
# assign classes
mutate(class = cut(meanlist, breaks = breaks, labels = names(class_list))) %>%
# get medians
left_join(med_data) %>%
# within each class...
group_by(class) %>%
# keep the row with the smallest absolute difference to the median
slice(which.min(abs(meanlist - median))) %>%
# sort in original order
arrange(x)
# Joining, by = "class"
# # A tibble: 3 x 4
# # Groups: class [3]
# x meanlist class median
# <int> <dbl> <fct> <dbl>
# 1 1 48.5 e 45
# 2 2 32.5 d 35
# 3 5 25.5 c 25
One approach utilizing purrr and dplyr could be:
map2(.x = class_list,
.y = class_median,
~ fl %>%
mutate(cm = between(meanlist, min(.x), max(.x))) %>%
filter(any(cm)) %>%
mutate(cm = cm*.y)) %>%
bind_rows(.id = "ID") %>%
group_by(ID) %>%
slice(which.min(abs(meanlist-cm)))
ID x meanlist cm
<chr> <int> <dbl> <dbl>
1 3 5 25.5 25
2 4 2 32.5 35
3 5 1 48.5 45
Related
Let's say I have a dataframe of Name and value, is there any ways to extract BOTH minimum and maximum values within Name in a single function?
set.seed(1)
df <- tibble(Name = rep(LETTERS[1:3], each = 3), Value = sample(1:100, 9))
# A tibble: 9 x 2
Name Value
<chr> <int>
1 A 27
2 A 37
3 A 57
4 B 89
5 B 20
6 B 86
7 C 97
8 C 62
9 C 58
The output should contains TWO columns only (Name and Value).
Thanks in advance!
You can use range to get max and min value and use it in summarise to get different rows for each Name.
library(dplyr)
df %>%
group_by(Name) %>%
summarise(Value = range(Value), .groups = "drop")
# Name Value
# <chr> <int>
#1 A 27
#2 A 57
#3 B 20
#4 B 89
#5 C 58
#6 C 97
If you have large dataset using data.table might be faster.
library(data.table)
setDT(df)[, .(Value = range(Value)), Name]
You can use dplyr::group_by() and dplyr::summarise() like this:
library(dplyr)
set.seed(1)
df <- tibble(Name = rep(LETTERS[1:3], each = 3), Value = sample(1:100, 9))
df %>%
group_by(Name) %>%
summarise(
maximum = max(Value),
minimum = min(Value)
)
This outputs:
# A tibble: 3 × 3
Name maximum minimum
<chr> <int> <int>
1 A 68 1
2 B 87 34
3 C 82 14
What's a little odd is that my original df object looks a little different than yours, in spite of the seed:
# A tibble: 9 × 2
Name Value
<chr> <int>
1 A 68
2 A 39
3 A 1
4 B 34
5 B 87
6 B 43
7 C 14
8 C 82
9 C 59
I'm currently using rbind() together with slice_min() and slice_max(), but I think it may not be the best way or the most efficient way when the dataframe contains millions of rows.
library(tidyverse)
rbind(df %>% group_by(Name) %>% slice_max(Value),
df %>% group_by(Name) %>% slice_min(Value)) %>%
arrange(Name)
# A tibble: 6 x 2
# Groups: Name [3]
Name Value
<chr> <int>
1 A 57
2 A 27
3 B 89
4 B 20
5 C 97
6 C 58
In base R, the output format can be created with tapply/stack - do a group by tapply to get the output as a named list or range, stack it to two column data.frame and change the column names if needed
setNames(stack(with(df, tapply(Value, Name, FUN = range)))[2:1], names(df))
Name Value
1 A 27
2 A 57
3 B 20
4 B 89
5 C 58
6 C 97
Using aggregate.
aggregate(Value ~ Name, df, range)
# Name Value.1 Value.2
# 1 A 1 68
# 2 B 34 87
# 3 C 14 82
I'm trying to group a dataset and get the first and highest values based on two separate measures of time and speed. So I need the time and speed for the earliest record in each group and then the time and speed for the fastest record in each group. I've got this far but need some help...
library(tidyverse)
group <- c(1,1,1,1,1,2,2,3,3,4,4,4,4,4,4)
time <- c(1,6,4,5,7,12,10,2,3,8,9,11,13,14,15)
speed <- c(17,6, 99, 34, 12, 5, 67, 43, 23, 12, 15, 78, 61, 78, 20)
data = data.frame(group, time, speed)
summary = data %>%
group_by(group) %>%
summarise(
firstTime = # lowest time
HighestSpeedTime = , # time for highest speed
firstSpeed = , #speed for lowest time
highestSpeed = max(speed), # highest speed
)
Update:
This should work: In group 4 we have ties therefore 2 rows:(we have at two time points the highest speed)!
library(dplyr)
data %>%
group_by(group) %>%
summarise(
firstTime = min(time), # lowest time
HighestSpeedTime = time[which(speed==max(speed))], # time for highest speed
firstSpeed = speed[which(time==min(time))],#speed for lowest time
highestSpeed = max(speed) # highest speed
)
output:
group firstTime HighestSpeedTime firstSpeed highestSpeed
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 4 17 99
2 2 10 10 67 67
3 3 2 2 43 43
4 4 8 11 12 78
5 4 8 14 12 78
Does this work?
library(tidyverse)
group <- c(1,1,1,1,1,2,2,3,3,4,4,4,4,4,4)
time <- c(1,6,4,5,7,12,10,2,3,8,9,11,13,14,15)
speed <- c(17,6, 99, 34, 12, 5, 67, 43, 23, 12, 15, 78, 61, 78, 20)
data = data.frame(group, time, speed)
summary <- data |>
arrange(group, time) |>
group_by(group) |>
summarise(
firsttime = min(time),
highest_speed = max(speed)
) |>
left_join(data, by = c("group", "highest_speed" = "speed")) |>
group_by(group) |>
slice(1) |>
rename(highest_speed_time = time) |>
left_join(data, by = c("group", "firsttime" = "time")) |>
rename(first_speed = speed)
summary
# group firsttime highest_speed highest_speed_time first_speed
# <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 99 4 17
# 2 10 67 10 67
# 3 2 43 2 43
# 4 8 78 11 12
here is a data.table approach
library(data.table)
setDT(data)
temp <- data[data[, .I[speed == max(speed)], by = .(group)]$V1]
setnames(temp, new = c("group", "maxSpeedTime", "maxSpeed"))
# join together
data[, .(firstTime = time[1],
firstSpeed = speed[1]),
by = .(group)][temp, on = .(group)]
# group firstTime firstSpeed maxSpeedTime maxSpeed
# 1: 1 1 17 4 99
# 2: 2 12 5 10 67
# 3: 3 2 43 2 43
# 4: 4 8 12 11 78
# 5: 4 8 12 14 78
Another solution, with a chained inner_join:
library(tidyverse)
data %>%
group_by(group) %>%
summarise(firstTime = min(time)) %>%
inner_join(data,by=c("group", "firstTime"="time")) %>%
rename(firstSpeed=speed) %>%
inner_join(
data %>%
group_by(group) %>%
summarise(highestSpeed = max(speed)) %>%
inner_join(data,by=c("group", "highestSpeed"="speed"))
) %>%
relocate(highestTime=time, .before="highestSpeed")
#> Joining, by = "group"
#> # A tibble: 5 × 5
#> group firstTime firstSpeed highestTime highestSpeed
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 17 4 99
#> 2 2 10 67 10 67
#> 3 3 2 43 2 43
#> 4 4 8 12 11 78
#> 5 4 8 12 14 78
An alternative solution, based on purrr::map_dfr:
library(tidyverse)
data %>%
group_split(group) %>%
map_dfr(
~ data.frame(
group = .x$group[1],
firstTime = .x$time[min(.x$time) == .x$time],
firstSpeed = .x$speed[min(.x$time) == .x$time],
highestTime = .x$time[max(.x$speed) == .x$speed],
highestSpeed = .x$speed[max(.x$speed) == .x$speed]))
#> group firstTime firstSpeed highestTime highestSpeed
#> 1 1 1 17 4 99
#> 2 2 10 67 10 67
#> 3 3 2 43 2 43
#> 4 4 8 12 11 78
#> 5 4 8 12 14 78
And more succinctly:
library(tidyverse)
data %>%
group_split(group) %>%
map_dfr(~ data.frame(
group = integer(), firstTime = integer(), firstSpeed = integer(),
highestTime = integer(), highestSpeed = integer()) %>%
add_row(!!!setNames(c(.x$group[1],.x[min(.x$time) == .x$time, -1],
.x[max(.x$speed) == .x$speed, -1]), names(.))))
#> group firstTime firstSpeed highestTime highestSpeed
#> 1 1 1 17 4 99
#> 2 2 10 67 10 67
#> 3 3 2 43 2 43
#> 4 4 8 12 11 78
#> 5 4 8 12 14 78
For example, I have a dataset called data, and the column names are date min max avg. The total number of rows is 366.
I want to add the each seven rows to get the total value of min. e.g. 1-7 8-14. How can I do this.
If you create a grouping column which increments after every 7 days you may apply all the answers from How to sum a variable by group .
Here's how you can do it in base R.
set.seed(123)
df <- data.frame(Date = Sys.Date() - 365:0, min = rnorm(366), max = runif(366))
df$group <- ceiling(seq(nrow(df))/7)
aggregate(min~group, df, sum)
# group min
#1 1 3.1438325
#2 2 -0.3022263
#3 3 -1.0769539
#4 4 -3.2934430
#5 5 2.8419110
#...
This is a solution based on {tidyverse}, in particular using {dplyr} for the main operations and {lubridate} for formatting your dates.
First simulate some data - as you have not provided a reproducible dataset. I take the year 2020 which has 366 days ... obviously adapt this to your problem.
For the min, max, and average values (columns) , let's generate some random numbers. Again, adapt this to your needs.
library(dplyr) # for general data frame crunching
library(lubridate) # to coerce date-time
data <- data.frame(
date = seq(from = lubridate::ymd("2020-01-01")
, to = lubridate::ymd("2020-12-31"), by = 1)
, min = sample(x = 1:10, size = 366, replace = TRUE)
, max = sample(x = 10:15, size = 366, replace = TRUE)) %>%
dplyr::mutate(avg = mean(min + max))
To group your data, inject a binning / grouping variable.
The following is a generic "every 7th row" based on the modulo operator.
If you want to group by weeks, etc. check out the {lubridate} documentation. You can get some useful bits out of dates for this. Or insert any other binning you need.
data <- data %>%
mutate(bin = c(0, rep(1:(nrow(data)-1)%/%7)))
This yields:
> data
# A tibble: 366 x 5
# Groups: bin [53]
date min max avg bin
<date> <int> <int> <dbl> <dbl>
1 2020-01-01 2 11 18.2 0
2 2020-01-02 6 14 18.2 0
3 2020-01-03 7 13 18.2 0
4 2020-01-04 6 15 18.2 0
5 2020-01-05 3 10 18.2 0
6 2020-01-06 5 12 18.2 0
7 2020-01-07 5 12 18.2 0
8 2020-01-08 7 13 18.2 1
9 2020-01-09 8 11 18.2 1
10 2020-01-10 5 10 18.2 1
We can now summarise our grouped data.
For this you use the bin-variable to group your data, and then summarise to perform aggregations on these groups. Based on your question, the following sums the min-values. Put the function/summary you need:
data %>%
group_by(bin) %>%
summarise(tot_min = sum(min))
# A tibble: 53 x 2
bin tot_min
<dbl> <int>
1 0 34
2 1 31
3 2 35
4 3 44
5 4 40
6 5 50
7 6 46
8 7 38
9 8 33
10 9 21
# ... with 43 more rows
Assign the result to your liking or whatever type of output you need.
If you want to combine this with your original data dataframe, read up on bind_rows().
This question already has answers here:
Proper idiom for adding zero count rows in tidyr/dplyr
(6 answers)
Closed 2 years ago.
Apologies if this is a duplicate question, I saw some questions which were similar to mine, but none exactly addressing my problem.
My data look basically like this:
FiscalWeek <- as.factor(c(45, 46, 48, 48, 48))
Group <- c("A", "A", "A", "B", "C")
Amount <- c(1, 1, 1, 5, 6)
df <- tibble(FiscalWeek, Group, Amount)
df
# A tibble: 5 x 3
FiscalWeek Group Amount
<fct> <chr> <dbl>
1 45 A 1
2 46 A 1
3 48 A 1
4 48 B 5
5 48 C 6
Note that FiscalWeek is a factor. So, when I take a weekly average by Group, I get this:
library(dplyr)
averages <- df %>%
group_by(Group) %>%
summarize(Avgs = mean(Amount))
averages
# A tibble: 3 x 2
Group Avgs
<chr> <dbl>
1 A 1
2 B 5
3 C 6
But, this is actually a four-week period. Nothing at all happened in Week 47, and groups B and C didn't show data in weeks 45 and 46, but I still want averages that reflect the existence of those weeks. So I need to fill out my original data with zeroes such that this is my desired result:
DesiredGroup <- c("A", "B", "C")
DesiredAvgs <- c(0.75, 1.25, 1.5)
Desired <- tibble(DesiredGroup, DesiredAvgs)
Desired
# A tibble: 3 x 2
DesiredGroup DesiredAvgs
<chr> <dbl>
1 A 0.75
2 B 1.25
3 C 1.5
What is the best way to do this using dplyr?
Up front: missing data to me is very different from 0. I'm assuming that you "know" with certainty that missing data should bring all other values down.
The name FiscalWeek suggests that it is an integer-like data, but your use of factor suggests ordinal or categorical. Because of that, you need to define authoritatively what the complete set of factors can be. And because your current factor does not contain all possible levels, I'll infer them (you need to adjust your all_groups_weeks accordingly:
all_groups_weeks <- tidyr::expand_grid(FiscalWeek = as.factor(45:48), Group = c("A", "B", "C"))
all_groups_weeks
# # A tibble: 12 x 2
# FiscalWeek Group
# <fct> <chr>
# 1 45 A
# 2 45 B
# 3 45 C
# 4 46 A
# 5 46 B
# 6 46 C
# 7 47 A
# 8 47 B
# 9 47 C
# 10 48 A
# 11 48 B
# 12 48 C
From here, join in the full data in order to "complete" it. Using tidyr::complete won't work because you don't have all possible values in the data (47 missing).
full_join(df, all_groups_weeks, by = c("FiscalWeek", "Group")) %>%
mutate(Amount = coalesce(Amount, 0))
# # A tibble: 12 x 3
# FiscalWeek Group Amount
# <fct> <chr> <dbl>
# 1 45 A 1
# 2 46 A 1
# 3 48 A 1
# 4 48 B 5
# 5 48 C 6
# 6 45 B 0
# 7 45 C 0
# 8 46 B 0
# 9 46 C 0
# 10 47 A 0
# 11 47 B 0
# 12 47 C 0
full_join(df, all_groups_weeks, by = c("FiscalWeek", "Group")) %>%
mutate(Amount = coalesce(Amount, 0)) %>%
group_by(Group) %>%
summarize(Avgs = mean(Amount, na.rm = TRUE))
# # A tibble: 3 x 2
# Group Avgs
# <chr> <dbl>
# 1 A 0.75
# 2 B 1.25
# 3 C 1.5
You can try this. I hope this helps.
library(dplyr)
#Define range
df %>% mutate(FiscalWeek=as.numeric(as.character(FiscalWeek))) -> df
range <- length(seq(min(df$FiscalWeek),max(df$FiscalWeek),by=1))
#Aggregation
averages <- df %>%
group_by(Group) %>%
summarize(Avgs = sum(Amount)/range)
# A tibble: 3 x 2
Group Avgs
<chr> <dbl>
1 A 0.75
2 B 1.25
3 C 1.5
You can do it without filling if you know number of weeks:
df %>%
group_by(Group) %>%
summarise(Avgs = sum(Amount) / length(45:48))
In my code below, I want to first remove the Variable for which i do not have any value (Ie., F should be removed where all the values are 'NA'). then i am trying to find accumulated values of each Variable. I tried with the following code but i am not getting anything out of it.
library(tidyverse)
set.seed(50)
DF <- data.frame(Days = 1:5, A = runif(5,0,3), S = runif(5,1,6), F = matrix(NA, 5,1), C = runif(5,2,4))
DF_1 <- gather(DF, -Days, key = "Variable", value = "Value")
DF_2 <- DF_1 %>%
filter(Variable == "NA") %>%
mutate(cumulative_Sum = cumsum(Value))
Output
For Variable A I should get something like below- similar for others
> A <- cumsum(DF$A)
> A
[1] 2.126181 3.439161 4.039176 6.340374 7.879859
After grouping by 'Variable', filter out the groups having all NA 'Value', then do the cumulative sum of 'Value' after replacing the NA with 0
library(dplyr)
library(tidyr)
DF_1 %>%
group_by(Variable) %>%
filter(!all(is.na(Value))) %>%
mutate(Value = cumsum(replace_na(Value, 0)))
# A tibble: 15 x 3
# Groups: Variable [3]
# Days Variable Value
# <int> <chr> <dbl>
# 1 1 A 2.13
# 2 2 A 3.44
# 3 3 A 4.04
# 4 4 A 6.34
# 5 5 A 7.88
# 6 1 S 1.22
# 7 2 S 5.72
# 8 3 S 9.95
# 9 4 S 11.2
#10 5 S 12.7
#11 1 C 2.78
#12 2 C 5.32
#13 3 C 8.60
#14 4 C 10.8
#15 5 C 13.3
If we use the 'wide' format 'DF', then use mutate_at
DF %>%
mutate_at(-1, cumsum)