Could you help me to insert the column Category in my generated table? That way I can know specifically the coef for each day and category.
library(purrr)
library(dplyr)
library(tidyverse)
library(lubridate)
df1 <- structure(
list(date1= c("2021-06-28","2021-06-28","2021-06-28","2021-06-28"),
date2 = c("2021-06-30","2021-06-30","2021-07-01","2021-07-01"),
Category = c("FDE","ABC","FDE","ABC"),
Week= c("Wednesday","Wednesday","Friday","Friday"),
DR1 = c(4,1,6,3),
DR01 = c(4,1,4,3), DR02= c(4,2,6,2),DR03= c(9,5,4,7),
DR04 = c(5,4,3,2),DR05 = c(5,4,5,4),
DR06 = c(2,4,3,2),DR07 = c(2,5,4,4),
DR08 = c(3,4,5,4),DR09 = c(2,3,4,4)),
class = "data.frame", row.names = c(NA, -4L))
dates <- subset(df1, date2 > date1, select = date2)$date2
map_dfr(dates, ~ {
datas <- df1 %>%
filter(date2 == ymd(.x)) %>%
summarize(across(starts_with("DR"), sum)) %>%
pivot_longer(everything(), names_pattern = "DR(.+)", values_to = "val") %>%
mutate(name = as.numeric(name))
colnames(datas)<-c("Days","Numbers")
mod <- nls(Numbers ~ b1*Days^2+b2,start = list(b1 = 47,b2 = 0), data = datas)
tibble(dates = .x, coef = coef(mod)[2])
}) %>%
mutate(dates = format(ymd(dates), "%d/%m/%Y"))
# A tibble: 4 x 2
dates coef
<chr> <dbl>
1 30/06/2021 7.89
2 30/06/2021 7.89
3 01/07/2021 7.95
4 01/07/2021 7.95
In this case, it looks like this:
dates Category coef
<chr> <dbl>
1 30/06/2021 FDE 7.89
2 30/06/2021 ABC 7.89
3 01/07/2021 FDE 7.95
4 01/07/2021 ABC 7.95
You can use bind_cols() with a dataset containing the Category column
##Select the Category column as a subset
categories <- subset(df1, date2 > date1, select = Category)
map_dfr(dates, ~ {
datas <- df1 %>%
filter(date2 == ymd(.x)) %>%
summarize(across(starts_with("DR"), sum)) %>%
pivot_longer(everything(), names_pattern = "DR(.+)", values_to = "val") %>%
mutate(name = as.numeric(name))
colnames(datas)<-c("Days","Numbers")
mod <- nls(Numbers ~ b1*Days^2+b2,start = list(b1 = 47,b2 = 0), data = datas)
tibble(dates = .x[[1]], coef = coef(mod)[2])
}) %>% bind_cols(categories) %>% #add the categories to the tibble
mutate(dates = format(ymd(dates), "%d/%m/%Y"))
Related
I have two dataframes :
> df1 <- data.frame(date = as.Date( c( "2021-06-01", "2021-06-02", "2021-06-03", "2021-06-04",
"2021-06-05", "2021-06-06", "2021-06-07", "2021-06-08",
"2021-06-09", "2021-06-10", "2021-06-11", "2021-06-12",
"2021-06-13") ),
temperature = c( 17, 30, 28, 29, 16, 21, 20, 11, 28, 29, 25, 26, 19) )
and
> df2 <- data.frame( ID = c( 1 : 4 ),
date.pose = as.Date(c("2021-06-01", "2021-06-03", "2021-06-06", "2021-06-10") ),
date.withdrawal = as.Date(c("2021-06-02", "2021-06-05", "2021-06-09", "2021-06-13") ) )
I want to store the mean temperature for each period that is in df2 in a new colomn (df2$mean.temperature).
For ID = 1 from df2, the mean temperature would be calculated with the temperatures from 2021-06-01 and 2021-06-02, witch is mean(17, 30)
In other words, I want to get this :
> df2 <- data.frame(ID = c( 1 : 4 ),
date.pose = as.Date( c("2021-06-01", "2021-06-03", "2021-06-06", "2021-06-10") ) ,
date.withdrawal = as.Date( c("2021-06-03", "2021-06-06", "2021-06-10", "2021-06-13") ),
mean.Temperature = c(23.5, 24.3, 20.0, 24.8) )
I'm trying to add the ID from df2 in a new colomn in df1. Once I do that, I could aggregate like this :
> df3 <- aggregate(df1$temperature, list(df1$ID, df2$date.pose), FUN = mean)
I don't know how to add the corresponding ID in df1.
Or maybe there is a better way to do this?
Here's an approach using uncount from tidyr and some joins.
df2 %>%
mutate(days = (date.witdrawal - date.pose + 1) %>% as.integer) %>%
tidyr::uncount(days, .id = "row") %>%
transmute(ID, date = date.pose + row - 1) %>%
left_join(df1) %>%
group_by(ID) %>%
summarize(mean.Temperature = mean(temperature)) %>%
right_join(df2)
Result
# A tibble: 4 × 4
ID mean.Temperature date.pose date.witdrawal
<int> <dbl> <date> <date>
1 1 23.5 2021-06-01 2021-06-02
2 2 24.3 2021-06-03 2021-06-05
3 3 20 2021-06-06 2021-06-09
4 4 24.8 2021-06-10 2021-06-13
Update. thanks to #Jon Spring:
Here is how we could do it:
logic:
join both df's by date after long pivoting df1
arrange by date and fill
then after grouping by ID use summarise with mean()
and re-join finally:
library(dplyr)
library(tidyr)
df2 %>%
pivot_longer(-ID, values_to = "date") %>%
full_join(df1, by= "date") %>%
arrange(date) %>%
fill(ID, .direction = "down") %>%
group_by(ID) %>%
summarise(mean_temp = mean(temperature, na.rm = TRUE)) %>%
left_join(df2, by="ID")
ID mean_temp date.pose date.witdrawal
<int> <dbl> <date> <date>
1 1 23.5 2021-06-01 2021-06-02
2 2 24.3 2021-06-03 2021-06-05
3 3 20 2021-06-06 2021-06-09
4 4 24.8 2021-06-10 2021-06-13
This question is very similar to this one Error in UseMethod("select") : no applicable method for 'select' applied to an object of class "character", what is different is a few things in the return_coef function and it's also that I'm filtering the function by Id, date and Category, and in the answered question, it was just by date and Category.
You will see below that I can generate the coefficients for each Id/date/Category, but when I ask to do it for everyone at once, I get the following error:
Error: Problem with `mutate()` column `coef`.
i `coef = mapply(...)`.
x negative length vectors are not allowed
Executable code:
library(dplyr)
library(tidyverse)
library(lubridate)
library(data.table)
df1<- structure(
list(
Id = c(1, 1, 1, 1),
date1 = c("2022-01-06","2022-01-06","2022-01-06","2022-01-06"),
date2 = c("2022-01-02","2022-01-03","2022-01-09","2022-01-10"),
Week = c("Sunday","Monday","Sunday","Monday"),
Category = c("EFG", "ABC","EFG","ABC"),
DR1 = c(200, 300, 200, 200),
DRM01 = c(300, 300, 300, 300),
DRM02 = c(300, 300, 300, 300),
DRM03 = c(300,300,300,300),
DRM04 = c(300,300,300,300),
DRM05 = c(300,250,350,350)),row.names = c(NA, 4L), class = "data.frame")
return_coef <- function (df1, idd, dmda, CategoryChosse) {
selection = startsWith(names(df1), "DRM0")
df1[selection][is.na(df1[selection])] = 0
data1<-subset(df1,df1$date2<df1$date1)
dt1 <- as.data.table(data1)
cols <- grep("^DRM0", colnames(dt1), value = TRUE)
med <-
dt1[, (paste0(cols, "_PV")) := DR1 - .SD, .SDcols = cols
][, lapply(.SD, median), by = .(Id, Category, Week), .SDcols = paste0(cols, "_PV") ]
SPV<-df1%>%
inner_join(med, by = c('Id','Category', 'Week')) %>%
mutate(across(matches("^DRM0\\d+$"), ~.x +
get(paste0(cur_column(), '_PV')),
.names = '{col}_{col}_PV')) %>%
select(Id:Category, DRM01_DRM01_PV:last_col())%>%
data.frame()
mat1 <- df1 %>%
dplyr::filter(Id==idd, date2 == ymd(dmda), Category == CategoryChosse) %>%
select(starts_with("DRM0")) %>%
pivot_longer(cols = everything()) %>%
arrange(desc(row_number())) %>%
mutate(cs = cumsum(value)) %>%
dplyr::filter(cs == 0) %>%
pull(name)
(dropnames <- paste0(mat1,"_",mat1, "_PV"))
SPV <- SPV %>%
filter(Id==idd,date2 == ymd(dmda), Category == CategoryChosse) %>%
select(-any_of(dropnames))
if(length(grep("DRM0", names(SPV))) == 0) {
SPV[head(mat1,10)] <- NA_real_
}
datas <-SPV %>%
dplyr::filter(Id==idd,date2 == ymd(dmda)) %>%
group_by(Category) %>%
dplyr::summarize(dplyr::across(starts_with("DRM0"), sum)) %>%
pivot_longer(cols= -Category, names_pattern = "DRM0(.+)", values_to = "val") %>%
mutate(name = readr::parse_number(name))
colnames(datas)[-1]<-c("var1","var2")
datas$days <- datas[[as.name("var1")]]
datas$numbers <- datas[[as.name("var2")]]
datas <- datas %>%
group_by(Category) %>%
slice((as.Date(dmda) - min(as.Date(df1$date1) [
df1$Category == first(Category)])):max(days)+1) %>%
ungroup
m<-df1 %>%
group_by(Id,Category,Week) %>%
dplyr::summarize(dplyr::across(starts_with("DR1"), mean), .groups = 'drop')
m<-subset(m, Week == df1$Week[match(ymd(dmda), ymd(df1$date2))] & Category == CategoryChosse)$DR1
if (nrow(datas)<=2){
val<-as.numeric(m)
}
else{
mod <- nls(numbers ~ b1*days^2+b2,start = list(b1 = 0,b2 = 0),data = datas, algorithm = "port")
coef<-coef(mod)[2]
val<-as.numeric(coef(mod)[2])
}
return(val)
}
Find the coef one by one (It works):
return_coef(df1,"1","2022-01-09","EFG")
[1] 200
return_coef(df1,"1","2022-01-10","ABC")
[1] 250
Find all coef at once (Does not work)
subset_df1 <- subset(df1, date2 > date1)
All<-subset_df1%>%
transmute(
Id,date2,Category,
coef = mapply(return_coef, list(cur_data()), Id, as.Date(date2), Category))
Error: Problem with `mutate()` column `coef`.
i `coef = mapply(...)`.
x negative length vectors are not allowed
dput(head(df1))
structure(list(Id = c(1, 1, 1, 1), date1 = structure(c(1641427200,
1641427200, 1641427200, 1641427200), tzone = "UTC", class = c("POSIXct",
"POSIXt")), date2 = structure(c(1641081600, 1641168000, 1641686400,
1641772800), tzone = "UTC", class = c("POSIXct", "POSIXt")),
Week = c("Sunday", "Monday", "Sunday", "Monday"), Category = c("EFG",
"ABC", "EFG", "ABC"), DR1 = c(200, 300, 200, 200), DRM01 = c(300,
300, 300, 300), DRM02 = c(300, 300, 300, 300), DRM03 = c(300,
300, 300, 300), DRM04 = c(300, 300, 300, 300), DRM05 = c(300,
250, 350, 350)), row.names = c(NA, -4L), class = c("tbl_df",
"tbl", "data.frame"))
The function seems to be use df1 as the full data
library(dplyr)
subset_df1 %>%
rowwise %>%
mutate(coef = return_coef(df1, Id, date2, Category)) %>%
ungroup
-output
# A tibble: 2 × 12
Id date1 date2 Week Category DR1 DRM01 DRM02 DRM03 DRM04 DRM05 coef
<dbl> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 2022-01-06 2022-01-09 Sunday EFG 200 300 300 300 300 350 200
2 1 2022-01-06 2022-01-10 Monday ABC 200 300 300 300 300 350 250
Update
In the new dataset, the columns 'date1', 'date2' are POSIXct, convert to Date class and it should work
df2 <- df1 %>%
mutate(across(c(date1, date2), as.Date))
subset_df2 <- subset(df2, date2 > date1)
subset_df2 %>%
rowwise %>%
mutate(coef = return_coef(df2, Id, date2, Category)) %>%
ungroup
# A tibble: 2 × 12
Id date1 date2 Week Category DR1 DRM01 DRM02 DRM03 DRM04 DRM05 coef
<dbl> <date> <date> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 2022-01-06 2022-01-09 Sunday EFG 200 300 300 300 300 350 200
2 1 2022-01-06 2022-01-10 Monday ABC 200 300 300 300 300 350 250
Or using pmap
library(purrr)
subset_df1 %>%
transmute(Id, date2, Category,
coeff = pmap_dbl(across(c(Id, date2, Category)),
~ return_coef(df1, ..1, ..2, ..3)))
-output
Id date2 Category coeff
3 1 2022-01-09 EFG 200
4 1 2022-01-10 ABC 250
The code below works for almost all dates and codes that I choose from my df1 database, however only for the day 09/07, code FGE that doesn't. I would like to solve this problem, could you help me? If you need to, I can explain better what this code does.
library(dplyr)
library(tidyverse)
library(lubridate)
library(stringr)
df1 <- structure(
list(date1= c("2021-06-28","2021-06-28","2021-06-28","2021-06-28","2021-06-28",
"2021-06-28","2021-06-28","2021-06-28"),
date2 = c("2021-06-30","2021-06-30","2021-07-02","2021-07-07","2021-07-07","2021-07-09","2021-07-09","2021-07-09"),
Code = c("FDE","ABC","ABC","ABC","CDE","FGE","ABC","CDE"),
Week= c("Wednesday","Wednesday","Friday","Wednesday","Wednesday","Friday","Friday","Friday"),
DR1 = c(4,1,4,3,3,4,3,5),
DR01 = c(4,1,4,3,3,4,3,6), DR02= c(4,2,6,7,3,2,7,4),DR03= c(9,5,4,3,3,2,1,5),
DR04 = c(5,4,3,3,6,2,1,9),DR05 = c(5,4,5,3,6,2,1,9),
DR06 = c(2,4,3,3,5,6,7,8),DR07 = c(2,5,4,4,9,4,7,8),
DR08 = c(0,0,0,1,2,0,0,0),DR09 = c(0,0,0,0,0,0,0,0),DR010 = c(0,0,0,0,0,0,0,0),DR011 = c(4,0,0,0,0,0,0,0),
DR012 = c(0,0,0,3,0,0,0,5),DR013 = c(0,0,1,0,0,0,2,0),DR014 = c(0,0,0,0,0,2,0,0)),
class = "data.frame", row.names = c(NA, -8L))
dmda<-"2021-07-09"
CodeChosse<-"FGE"
x<-df1 %>% select(starts_with("DR0"))
x<-cbind(df1, setNames(df1$DR1 - x, paste0(names(x), "_PV")))
PV<-select(x, date2,Week, Code, DR1, ends_with("PV"))
med<-PV %>%
group_by(Code,Week) %>%
summarize(across(ends_with("PV"), median))
SPV<-df1%>%
inner_join(med, by = c('Code', 'Week')) %>%
mutate(across(matches("^DR0\\d+$"), ~.x +
get(paste0(cur_column(), '_PV')),
.names = '{col}_{col}_PV')) %>%
select(date1:Code, DR01_DR01_PV:last_col())
SPV<-data.frame(SPV)
mat1 <- df1 %>%
filter(date2 == dmda, Code == CodeChosse) %>%
select(starts_with("DR0")) %>%
pivot_longer(cols = everything()) %>%
arrange(desc(row_number())) %>%
mutate(cs = cumsum(value)) %>%
filter(cs == 0) %>%
pull(name)
(dropnames <- paste0(mat1,"_",mat1, "_PV"))
SPV %>%
filter(date2 == dmda, Code == CodeChosse) %>%
select(-dropnames)
> SPV %>%
+ filter(date2 == dmda, Code == CodeChosse) %>%
+ select(-dropnames)
Error: Can't subset columns that don't exist.
x Column `__PV` doesn't exist.
Run `rlang::last_error()` to see where the error occurred.
For others datas/codes it works, for example:
dmda<-"2021-06-30"
CodeChosse<-"FDE"
> SPV %>%
+ filter(date2 == dmda, Code == CodeChosse) %>%
+ select(-dropnames)
date1 date2 Code DR01_DR01_PV DR02_DR02_PV DR03_DR03_PV DR04_DR04_PV DR05_DR05_PV DR06_DR06_PV DR07_DR07_PV DR08_DR08_PV
1 2021-06-28 2021-06-30 FDE 4 4 4 4 4 4 4 4
DR09_DR09_PV DR010_DR010_PV DR011_DR011_PV
1 4 4 4
dmda<-"2021-07-02"
CodeChosse<-"ABC"
> SPV %>%
+ filter(date2 == dmda, Code == CodeChosse) %>%
+ select(-dropnames)
date1 date2 Code DR01_DR01_PV DR02_DR02_PV DR03_DR03_PV DR04_DR04_PV DR05_DR05_PV DR06_DR06_PV DR07_DR07_PV DR08_DR08_PV
1 2021-06-28 2021-07-02 ABC 4 3 5 4.5 5.5 1.5 2 3.5
DR09_DR09_PV DR010_DR010_PV DR011_DR011_PV DR012_DR012_PV DR013_DR013_PV
1 3.5 3.5 3.5 3.5 3
dmda<-"2021-07-07"
CodeChosse<-"CDE"
> SPV %>%
+ filter(date2 == dmda, Code == CodeChosse) %>%
+ select(-dropnames)
date1 date2 Code DR01_DR01_PV DR02_DR02_PV DR03_DR03_PV DR04_DR04_PV DR05_DR05_PV DR06_DR06_PV DR07_DR07_PV DR08_DR08_PV
1 2021-06-28 2021-07-07 CDE 3 3 3 3 3 3 3 3
The issue in this particular case is that mat1 has no values in it because the 1st row in df1 has non-zero value and filter(cs == 0) drops all the rows.
mat1
#character(0)
You may use any_of which will not give an error if the column does not exist.
library(dplyr)
SPV %>%
filter(date2 == dmda, Code == CodeChosse) %>%
select(-any_of(dropnames))
I have a list of countries with lists inside each one of them.
Just to give you an example of a list object for one country with lists for two countries (df_DOTS):
df_DOTS <- list(BR = structure(list(`#FREQ` = "M", `#REF_AREA` = "AU", `#INDICATOR` = "TXG_FOB_USD",
`#COUNTERPART_AREA` = "BR", `#UNIT_MULT` = "6", `#TIME_FORMAT` = "P1M",
Obs = list(structure(list(`#TIME_PERIOD` = c("2019-07", "2019-08",
"2019-09"), `#OBS_VALUE` = c("55.687747", "36.076581", "57.764474"
)), class = "data.frame", row.names = c(NA, 3L)))), row.names = 2L, class = "data.frame"),
US = structure(list(`#FREQ` = "M", `#REF_AREA` = "AU", `#INDICATOR` = "TXG_FOB_USD",
`#COUNTERPART_AREA` = "US", `#UNIT_MULT` = "6", `#TIME_FORMAT` = "P1M",
Obs = list(structure(list(`#TIME_PERIOD` = c("2019-07",
"2019-08", "2019-09"), `#OBS_VALUE` = c("876.025841",
"872.02118", "787.272851")), class = "data.frame", row.names = c(NA,
3L)))), row.names = 1L, class = "data.frame"))
I can reach the matrix (matrix_DOTS) I am looking for using these lines of code:
library(dplyr)
library(rlist)
library(magrittr)
BR <- df_DOTS[["BR"]][["Obs"]] %>%
list.select(.$`#OBS_VALUE`) %>%
unlist() %>%
sapply(function(x) as.numeric(as.character(x))) %>%
mean()
US <- df_DOTS[["US"]][["Obs"]] %>%
list.select(.$`#OBS_VALUE`) %>%
unlist() %>%
sapply(function(x) as.numeric(as.character(x))) %>%
mean()
matrix_DOTS <- matrix(c(BR, US), nrow = 1, dimnames = list(c("AU"), c("BR", "US")))
Since I have a list of several countries with lists of other several countries inside them, I am looking for a more practical way of achieving matrix_DOTS. Any help is highly appreciated!
PS: This is the dput for the final matrix in this example:
matrix_DOTS <- structure(c(49.842934, 845.106624), .Dim = 1:2, .Dimnames = list(
"AU", c("BR", "US")))
EDIT
This is the procedure to obtain df_DOTS:
library(IMFData)
databaseID <- "DOT"
startdate = "2019-07-01"
enddate = "2019-09-01"
checkquery = FALSE
queryfilter <- list(CL_FREQ = "M", CL_AREA_DOT = "AU",
CL_INDICATOR_DOT = "TXG_FOB_USD",
CL_COUNTERPART_AREA_DOT = c("BR", "US"))
df_DOTS <- CompactDataMethod(databaseID, queryfilter, startdate, enddate, checkquery) %>%
split(.$`#COUNTERPART_AREA`)
Just add tidy = TRUE to the CompactDataMethod call:
library(IMFData)
databaseID <- "DOT"
startdate = "2019-07-01"
enddate = "2019-09-01"
checkquery = FALSE
queryfilter <- list(CL_FREQ = "M", CL_AREA_DOT = "AU",
CL_INDICATOR_DOT = "TXG_FOB_USD",
CL_COUNTERPART_AREA_DOT = c("BR", "US"))
df_DOTS <- CompactDataMethod(databaseID,
queryfilter,
startdate,
enddate,
checkquery,
tidy = TRUE)
df_DOTS
#TIME_PERIOD #OBS_VALUE #FREQ #REF_AREA #INDICATOR #COUNTERPART_AREA #UNIT_MULT #TIME_FORMAT
1 2019-07 876.025841 M AU TXG_FOB_USD US 6 P1M
2 2019-08 872.02118 M AU TXG_FOB_USD US 6 P1M
3 2019-09 787.272851 M AU TXG_FOB_USD US 6 P1M
4 2019-07 55.687747 M AU TXG_FOB_USD BR 6 P1M
5 2019-08 36.076581 M AU TXG_FOB_USD BR 6 P1M
6 2019-09 57.764474 M AU TXG_FOB_USD BR 6 P1M
you just need one group_by(#COUNTERPART_AREA) %>% summarise(mean = mean(#OBS_VALUE)):
library(tidyverse)
df_DOTS %>%
group_by(`#COUNTERPART_AREA`, `#REF_AREA`) %>%
summarise(mean = mean(as.numeric(`#OBS_VALUE`))) %>%
spread( `#COUNTERPART_AREA`, mean)
#output
`#REF_AREA` BR US
<chr> <dbl> <dbl>
1 AU 49.8 845.
Or if you insist on a matrix
df_DOTS %>%
group_by(`#COUNTERPART_AREA`, `#REF_AREA`) %>%
summarise(mean = mean(as.numeric(`#OBS_VALUE`))) %>%
spread( `#COUNTERPART_AREA`, mean) %>%
column_to_rownames("#REF_AREA") %>%
as.matrix
#output
BR US
AU 49.84293 845.1066
From the input data, we could loop over with map, pluck the elements that is needed, convert to numeric, get the mean, and convert to a two column tibble with enframe
library(purrr)
library(tidyr)
map(df_DOTS, ~ .x %>%
pluck("Obs", 1, "#OBS_VALUE") %>%
as.numeric %>%
mean) %>%
enframe %>%
unnest(c(value))
# A tibble: 2 x 2
# name value
# <chr> <dbl>
#1 BR 49.8
#2 US 845.
Another option would be like this:
tmp <- df_DOTS %>%
as_tibble() %>%
summarise(across(everything(), ~mean(as.numeric(.x$Obs[[1]]$`#OBS_VALUE`))))
tmp
# # A tibble: 1 x 2
# BR US
# <dbl> <dbl>
# 1 49.8 845.
I have the data in a data frame, with the first column is date and the second column is individual weight. Here's a sample from the data:
df <- data.frame(
date = c("2019-01-01", "2019-01-01", "2019-01-01", "2019-01-01",
"2019-01-01", "2019-01-01", "2019-01-01", "2019-01-01",
"2019-01-01", "2019-01-01", "2019-01-02", "2019-01-02", "2019-01-02",
"2019-01-02", "2019-01-02", "2019-01-02", "2019-01-02",
"2019-01-02", "2019-01-02", "2019-01-02"),
weight = c(2174.8, 2174.8, 2174.8, 8896.53, 8896.53, 2133.51, 2133.51,
2892.32, 2892.32, 2892.32, 2892.32, 5287.78, 5287.78, 6674.03,
6674.03, 6674.03, 6674.03, 6674.03, 5535.11, 5535.11)
)
I would like to run simple summary statistic for each date first and then find number of records whose weight is in the given range, defining the category by the % of total range of weights. Finally store number of each record in a separate column
Lowest 10%
10-20%
20-40%
40-60%
60-80%
80-90%
90-100%
The logic = (MinWeight + (MaxWeight-MinWeight)*X%)
Here is my expected outcome ( I only show two columns for % range)
df %>%
group_by(date) %>%
summarise(mean(weight), min(weight), max(weight))
date `mean(weight)` `min(weight)` `max(weight)` `Lowest 10%` `10-20%`
2019-01-01 3726. 2134. 8897. num records. num records.
Check this solution:
library(tidyverse)
library(wrapr)
df %>%
group_by(date) %>%
mutate(
rn = row_number(),
temp = weight - min(weight),
temp = (temp / max(temp)) * 100,
temp = cut(temp, seq(0, 100, 10), include.lowest = TRUE),
temp = str_remove(temp, '\\(|\\[') %>%
str_replace(',', '-') %>%
str_replace('\\]', '%'),
one = 1
) %>%
spread(temp, one, fill = 0) %.>%
left_join(
summarise(.,
`mean(weight)` = mean(weight),
`min(weight)` = min(weight),
`max(weight)` = max(weight)
),
summarise_at(., vars(matches('\\d+-\\d+.')), sum)
)
Output:
date `mean(weight)` `min(weight)` `max(weight)` `0-10%` `10-20%` `60-70%` `90-100%`
<fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2019-01-01 3726. 2134. 8897. 5 3 0 2
2 2019-01-02 5791. 2892. 6674. 1 0 4 5
Could be done this way:
library(tidyverse)
df %>%
group_by(date) %>%
mutate(
wrange = cut((weight - min(weight)) / (max(weight - min(weight))) * 100, 10,
labels = paste(
seq(0, 90, by = 10),
paste0(seq(10, 100, by = 10), "%"),
sep = '-')
)
) %>%
left_join(
x = summarise_at(., vars(weight), funs(mean, min, max)),
y = count(., wrange) %>% complete(wrange, fill = list(n = 0)) %>% spread(wrange, n),
by = 'date'
) %>%
rename_at(vars(matches("mean|min|max")), funs(paste(., "(weight)", sep = "")))
Which outputs:
# date mean(weight) min(weight) max(weight) 0-10% 10-20% 20-30% 30-40% 40-50%
# 1 2019-01-01 3726.144 2133.51 8896.53 5 3 0 0 0
# 2 2019-01-02 5790.825 2892.32 6674.03 1 0 0 0 0
# 50-60% 60-70% 70-80% 80-90% 90-100%
# 0 0 0 0 2
# 0 4 0 0 5
(I reformatted the output, to show all the data)