calculating medians per year per ID in R and plotting the outcome - r

Dataset:
structure(list(ID = c(1234, 1234, 1234, 1234, 1234, 1234, 1234,
1234, 8769, 8769, 8769, 8769, 8769, 7457, 7457, 7457, 7457, 7457,
7457, 55667, 55667, 55667, 55667, 55667, 55667, 55667, 3789,
3789, 3789, 3789, 3789, 3789), date_of_bloods = structure(c(978307200,
981072000, 1173052800, 1175731200, 1367798400, 1465171200, 1467936000,
1659916800, 1072915200, 1075680000, 1173052800, 1175731200, 1367798400,
978307200, 981072000, 1173052800, 1175731200, 1367798400, 1465171200,
978307200, 981072000, 1173052800, 1270425600, 1273104000, 1465171200,
1467936000, 1270425600, 1367798400, 1465171200, 1465257600, 1465344000,
1465430400), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
result = c(90, 80, 60, 40, 25, 22, 22, 21, 70, 65, 43, 23,
22, 90, 90, 88, 86, 76, 74, 58, 46, 35, 34, 33, 30, 24, 76,
67, 56, 34, 33, 23), `mutation type` = c(1, 1, 1, 1, 1, 1,
1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3,
3, 1, 1, 1, 1, 1, 1)), row.names = c(NA, -32L), class = "data.frame")
I would like the median of results per year per ID in a format where the year is just 0,1,2,3 etc for uniformity across cohorts and then to plot these lines with some indication of their mutation category.
I have done:
# Replace each timestamp with its four-digit year (format() returns text).
filtered$date_of_bloods <-format(filtered$date_of_bloods,format="%Y")
# Split into one data frame per ID.
a <- with(filtered, split(filtered, list(ID)))
# Aggregate the median per year within one ID's data frame.
# NOTE(review): the formula refers to `results`, but the column in the dput()
# above is named `result` -- confirm which name the real data uses.
medianfunc <- function(y) {aggregate(results ~ date_of_bloods, data = y, median)}
medians <- sapply(a, medianfunc)
# Fit a linear model per ID cohort and extract its coefficients.
# NOTE(review): the formula models date_of_bloods as a function of results; a
# slope of the results over time would need the terms swapped -- confirm intent.
g<- as.data.frame(medians)
coefLM <- function(x) {coef(lm(date_of_bloods ~ results, data = x))}
coefs<- sapply(g, coefLM)
The actual years don't matter and for uniformity I would like them to be 0,1,2,3,4 etc per ID. I am not sure how to do that? I would then want to plot this data (median yearly bloods per ID) with some form of idea as to which mutational category they belong.
I hope this isn't too broad a question.
Many thanks

You can try this (filtered is the dput() you included). I hope this helps:
library(dplyr)
library(lubridate)
library(ggplot2)
library(broom)
# Data: median result per ID and calendar year (mutation type carried along),
# with the identifying columns turned into factors for plotting.
df1 <- filtered %>%
  mutate(year = year(date_of_bloods)) %>%
  group_by(ID, year, `mutation type`) %>%
  summarise(med = median(result)) %>%
  ungroup() %>%
  mutate(
    ID = as.factor(ID),
    year = as.factor(year),
    `mutation type` = as.factor(`mutation type`)
  )
# Plot: one line per year across the IDs
ggplot(
  df1,
  aes(x = ID, y = med, fill = `mutation type`, color = year, group = year)
) +
  geom_line()
And for models:
#Models
# `year` is a factor once df1 has been prepared for plotting. lm() would then
# estimate one dummy coefficient per year instead of a single slope, so turn it
# back into the numeric calendar year before fitting.
df1_num <- df1 %>%
  mutate(year = as.numeric(as.character(year)))
# One fitted lm object per ID, stored in the list column `fitmodel`.
fits <- df1_num %>%
  group_by(ID) %>%
  do(fitmodel = lm(med ~ year, data = .))
#Coefs
# One row per ID and model term (intercept and slope). tidy() is applied to
# each lm object directly; the older tidy(<row-wise data frame>, <column>)
# form is not supported by current broom releases.
dfCoef <- df1_num %>%
  group_by(ID) %>%
  group_modify(~ tidy(lm(med ~ year, data = .x)))
# A tibble: 10 x 6
# Groups: ID [5]
ID term estimate std.error statistic p.value
<dbl> <chr> <dbl> <dbl> <dbl> <dbl>
1 1234 (Intercept) 6329. 1546. 4.09 0.0264
2 1234 year -3.13 0.769 -4.07 0.0268
3 3789 (Intercept) 14318. 4746. 3.02 0.204
4 3789 year -7.08 2.36 -3.00 0.205
5 7457 (Intercept) 2409. 403. 5.98 0.0269
6 7457 year -1.16 0.201 -5.78 0.0287
7 8769 (Intercept) 9268. 4803. 1.93 0.304
8 8769 year -4.60 2.39 -1.92 0.306
9 55667 (Intercept) 3294. 759. 4.34 0.0492
10 55667 year -1.62 0.378 -4.29 0.0503
Code for required plot:
#Plot 2
#Data modifications
# Number each ID's sampled years 0, 1, 2, ... in chronological order, so that
# every ID starts at 0 whatever the calendar year of its first sample.
# (as.numeric(year) - 1 would instead give the position of the year among all
# years in the data set, leaving gaps and non-zero starts for some IDs.)
df2 <- df1 %>%
  group_by(ID) %>%
  mutate(year2 = dense_rank(as.integer(year)) - 1) %>%
  ungroup()
# Discrete x axis with the levels in numeric order.
df2 <- df2 %>%
  mutate(year2 = factor(year2, levels = sort(unique(year2))))
#Plot 2
# One line per ID, one panel per mutation type.
ggplot(df2, aes(x = year2, y = med, color = ID, group = ID)) +
  facet_wrap(. ~ `mutation type`) +
  geom_line()

Your naming is unclear. If the data you provided is called df, then you can do:
# Extract the four-digit year of every sample, then take the median result for
# each year/ID combination.
df[["year"]] <- format(df[["date_of_bloods"]], "%Y")
aggregate(result ~ year + ID, data = df, FUN = median)
year ID result
1 2001 1234 85.0
2 2007 1234 50.0
3 2013 1234 25.0
4 2016 1234 22.0
5 2022 1234 21.0
6 2010 3789 76.0
7 2013 3789 67.0
8 2016 3789 33.5
9 2001 7457 90.0
10 2007 7457 87.0
11 2013 7457 76.0
12 2016 7457 74.0
13 2004 8769 67.5
14 2007 8769 33.0
15 2013 8769 22.0
16 2001 55667 52.0
17 2007 55667 35.0
18 2010 55667 33.5
19 2016 55667 27.0

Related

Creating New Age group

I have some age(12:54) and related data for them (here year and ASFR). The year starts from 1933 to 1987.
The structure of the data is something like the following:
year
age
Asfr
1933
12
.00004
1933
13
.00044
1933
14
.00177
1933
15
.00672
1933
16
.01875
1933
17
.03846
1933
18
.06586
1933
19
.08719
...
...
...
1933
49
.00037
1933
50
.00009
1933
51
.00003
1933
52
.00003
1933
53
.00003
1933
54
.00002
Now, I need codes by which I can turn this data into age groups with the following structure:
"15-19" , "20-24", "25-29", "30-34", "35-39" ,"40-44", "45-49"
in which I want 15-19 age group be the sum of 12, 13, 14, 15, 16, 17, 18, 19
20-24 age group be the sum of 20, 21, 22, 23, 24
Finally, the last age group be the sum of 45, 46, 47, 48, 49, 50, 51, 52, 53,54
I would really appreciate it if someone could help me. Thank you so much in advance.
You can use case_when from dplyr:
library(dplyr)
# Bin single-year ages into five-year groups. As requested in the question, the
# youngest ages (12-14) are folded into "15-19" and every age from 45 upwards
# into "45-49", so no separate "50+" group is created.
df %>%
  mutate(age_group = case_when(
    age %in% 12:19 ~ "15-19",
    age %in% 20:24 ~ "20-24",
    age %in% 25:29 ~ "25-29",
    age %in% 30:34 ~ "30-34",
    age %in% 35:39 ~ "35-39",
    age %in% 40:44 ~ "40-44",
    age >= 45 ~ "45-49"
  )) %>%
  # total ASFR and number of single-year ages per group and year
  group_by(age_group, year) %>%
  summarize(total_asfr = sum(Asfr),
            age_group_n = n()) %>%
  ungroup()
This gives us:
# A tibble: 4 × 4
age_group year total_asfr age_group_n
<chr> <int> <dbl> <int>
1 15-19 1933 0.0385 2
2 20-24 1933 0.00044 1
3 30-34 1933 0.00177 1
4 45-49 1933 0.0255 2
Using sample data:
df <- structure(list(year = c(1933L, 1933L, 1933L, 1933L, 1933L, 1933L
), age = c(12L, 23L, 34L, 45L, 56L, 17L), Asfr = c(4e-05, 0.00044,
0.00177, 0.00672, 0.01875, 0.03846)),
row.names = c(NA, -6L),
class = "data.frame")
Here's a possible solution:
# Import tidyverse or dplyr
library(tidyverse)
# Create the age groups, then sum Asfr within each year and age group.
# `age` is referenced through the data mask (not as df$age) so the step keeps
# working if the pipeline is later filtered or grouped; right = FALSE makes the
# intervals closed on the left, i.e. [12,20), [20,25), ..., [45,55).
df %>%
  mutate(age_groups = cut(age,
                          breaks = c(12, 20, 25, 30, 35, 40, 45, 55),
                          right = FALSE)) %>%
  group_by(year, age_groups) %>%
  summarise(asfr_total = sum(Asfr)) %>%
  ungroup()
You should see something like this:
year age_groups asfr_total
<dbl> <fct> <dbl>
1 1933 [12,20) 4.32
2 1933 [20,25) 2.33
3 1933 [25,30) 2.68
4 1933 [30,35) 2.89
5 1933 [35,40) 2.23
6 1933 [40,45) 2.85
7 1933 [45,55) 6.05

Dumbbell plot: Order and value label

I am working on a dumbbell plot in R inspired by this post, and have two problems:
Ordering the dumbbell plot (I've tried a strategy provided in this post)
Present value labels in the plot in an aesthetically pleasing way.
My data set is formatted as a wide data set with 18 units with the following structure:
> head(ADHD_med_2010_2018_wide, 18)
# A tibble: 18 x 9
age gender county adhd_pr_1000_2010 adhd_pr_1000_2018 county_label adhd_2010 adhd_2018 diff
<dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl> <dbl> <fct> <dbl> <dbl> <dbl>
1 2 [10-14] 1 [Both genders] 1 [Østfold] 32.1 24.3 Østfold 32.1 24.3 -7.80
2 2 [10-14] 1 [Both genders] 2 [Akershus] 20.6 23.0 Akershus 20.6 23 2.40
3 2 [10-14] 1 [Both genders] 3 [Oslo] 17.2 33.9 Oslo 17.2 33.9 16.7
4 2 [10-14] 1 [Both genders] 4 [Hedmark] 41.7 30.9 Hedmark 41.7 30.9 -10.8
5 2 [10-14] 1 [Both genders] 5 [Oppland] 24.9 39.0 Oppland 24.9 39 14.1
6 2 [10-14] 1 [Both genders] 6 [Buskerud] 26.7 36.8 Buskerud 26.7 36.8 10.1
7 2 [10-14] 1 [Both genders] 7 [Vestfold] 28.1 27.1 Vestfold 28.1 27 -1.10
8 2 [10-14] 1 [Both genders] 8 [Telemark] 29.2 24.7 Telemark 29.2 24.7 -4.5
9 2 [10-14] 1 [Both genders] 9 [Aust-Agder] 34.9 39.2 Aust-Agder 34.9 39.2 4.30
10 2 [10-14] 1 [Both genders] 10 [Vest-Agder] 17.4 23.8 Vest-Agder 17.4 23.8 6.40
11 2 [10-14] 1 [Both genders] 11 [Rogaland] 29.5 13.8 Rogaland 29.5 13.8 -15.7
12 2 [10-14] 1 [Both genders] 12 [Hordaland] 21.3 14.4 Hordaland 21.3 14.4 -6.90
13 2 [10-14] 1 [Both genders] 14 [Sogn og Fjordane] 21.3 39.7 Sogn og Fjordane 21.3 39.7 18.4
14 2 [10-14] 1 [Both genders] 15 [Møre og Romsdal] 27.0 18.6 Møre og Romsdal 27 18.6 -8.40
15 2 [10-14] 1 [Both genders] 18 [Nordland] 40.1 30.0 Nordland 40.1 30 -10.1
16 2 [10-14] 1 [Both genders] 19 [Troms] 25.8 33.2 Troms 25.8 33.2 7.40
17 2 [10-14] 1 [Both genders] 20 [Finnmark] 19.1 21.3 Finnmark 19.1 21.3 2.20
18 2 [10-14] 1 [Both genders] 50 [Trøndelag] 25.0 36.9 Trøndelag 25 37 12
I've tried two strategies for problem 1:
library("tidyverse")
library("ggalt")
# Dumbbell chart with one row per county_label, running from the 2010 value
# (x) to the 2018 value (xend).
fig2 <- ggplot(ADHD_med_2010_2018_wide, aes(x=adhd_2010, xend=adhd_2018, y=county_label, group=county_label)) +
# Create a thick line between x and xend instead of using the default
# provided by geom_dumbbell.
geom_segment(aes(x=adhd_2010,
xend=adhd_2018,
y=county_label,
yend=county_label),
color="#b2b2b2", size=1.5)+
geom_dumbbell(color="light blue",
size_x=3.5,
size_xend = 3.5,
# Note: there is no US 'color' alias for UK 'colour'
# in geom_dumbbell, unlike standard geoms in ggplot().
colour_x="forestgreen", # green = 2010
colour_xend = "red")+ # red = 2018
labs(x=NULL, y=NULL,
title="Dumbbell Chart",
subtitle="Change in prescription rate: 2010 vs 2018")+
# Value labels: the 2010 value is placed at x = adhd_2010 ...
geom_text(color="black", size=2, hjust=-0.5,
aes(x=adhd_2010, label=adhd_2010))+
# ... and the 2018 value at x = adhd_2018.
geom_text(aes(x=adhd_2018, label=adhd_2018),
color="black", size=2, hjust=1.5)
fig2
Which gives a plot without ordering or values presented in a good way:
To correct ordering, I tried following the strategy provided in the post linked above:
library(dplyr)
ADHD_med_2010_2018_wide%>%
mutate(difference = abs(adhd_2018-adhd_2010)) %>% # absolute change between 2010 and 2018
top_n(18, wt = difference) %>% # keep the 18 rows with the largest difference
ggplot() +
aes(x=adhd_2010, xend=adhd_2018, y=reorder(county_label, difference),
group=county_label) + # order the labels by the absolute difference
geom_dumbbell(color="light blue",
size_x=3.5,
size_xend = 3.5,
# Note: there is no US 'color' alias for UK 'colour'
# in geom_dumbbell, unlike standard geoms in ggplot().
colour_x="forestgreen", # green = 2010
colour_xend = "red")+ # red = 2018
labs(x=NULL, y=NULL,
title="Dumbbell Chart",
subtitle="Change in prescription rate: 2010 vs 2018")+
# Value labels: the 2010 value is placed at x = adhd_2010 ...
geom_text(color="black", size=2, hjust=-0.5,
aes(x=adhd_2010, label=adhd_2010))+
# ... and the 2018 value at x = adhd_2018.
geom_text(aes(x=adhd_2018, label=adhd_2018),
color="black", size=2, hjust=1.5)
This still does not give a plot with a nice ordering, although it seems to order by the difference (and there's still the issue with value labels):
Hopefully some of you may have input on these issues.
Data to copy:
> dput(head(ADHD_med_2010_2018_wide, 18))
structure(list(age = structure(c(2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2), label = "Age groups", labels = c(`5-9` = 1,
`10-14` = 2, `15-19` = 3, `20-24` = 4, `25-29` = 5, `30-34` = 6,
`All ages` = 7), class = "haven_labelled"), gender = structure(c(1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), label = "Gender", labels = c(`Both genders` = 1,
Female = 2, Male = 3), class = "haven_labelled"), county = structure(c(1,
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 18, 19, 20, 50), labels = c(Østfold = 1,
Akershus = 2, Oslo = 3, Hedmark = 4, Oppland = 5, Buskerud = 6,
Vestfold = 7, Telemark = 8, `Aust-Agder` = 9, `Vest-Agder` = 10,
Rogaland = 11, Hordaland = 12, `Sogn og Fjordane` = 14, `Møre og Romsdal` = 15,
Nordland = 18, Troms = 19, Finnmark = 20, Trøndelag = 50, `Hele landet` = 99
), class = "haven_labelled"), adhd_pr_1000_2010 = c(32.1488990783691,
20.5894756317139, 17.2119483947754, 41.6982574462891, 24.8543014526367,
26.7194156646729, 28.1328239440918, 29.2480430603027, 34.8775291442871,
17.3759765625, 29.4698066711426, 21.340311050415, 21.3308296203613,
27.0334072113037, 40.1140670776367, 25.7862873077393, 19.1311283111572,
25.0325565338135), adhd_pr_1000_2018 = c(24.2834396362305, 23.0037822723389,
33.9068183898926, 30.8641967773438, 39.0195579528809, 36.7909698486328,
27.0642204284668, 24.6901988983154, 39.1978950500488, 23.8095245361328,
13.8218154907227, 14.4400091171265, 39.7175636291504, 18.5994052886963,
29.9642810821533, 33.1638412475586, 21.2596340179443, 36.9249382019043
), county_label = structure(18:1, .Label = c("Trøndelag", "Finnmark",
"Troms", "Nordland", "Møre og Romsdal", "Sogn og Fjordane", "Hordaland",
"Rogaland", "Vest-Agder", "Aust-Agder", "Telemark", "Vestfold",
"Buskerud", "Oppland", "Hedmark", "Oslo", "Akershus", "Østfold"
), class = "factor"), adhd_2010 = c(32.0999984741211, 20.6000003814697,
17.2000007629395, 41.7000007629395, 24.8999996185303, 26.7000007629395,
28.1000003814697, 29.2000007629395, 34.9000015258789, 17.3999996185303,
29.5, 21.2999992370605, 21.2999992370605, 27, 40.0999984741211,
25.7999992370605, 19.1000003814697, 25), adhd_2018 = c(24.2999992370605,
23, 33.9000015258789, 30.8999996185303, 39, 36.7999992370605,
27, 24.7000007629395, 39.2000007629395, 23.7999992370605, 13.8000001907349,
14.3999996185303, 39.7000007629395, 18.6000003814697, 30, 33.2000007629395,
21.2999992370605, 37), diff = c(-7.79999923706055, 2.39999961853027,
16.7000007629395, -10.8000011444092, 14.1000003814697, 10.0999984741211,
-1.10000038146973, -4.5, 4.29999923706055, 6.39999961853027,
-15.6999998092651, -6.89999961853027, 18.4000015258789, -8.39999961853027,
-10.0999984741211, 7.40000152587891, 2.19999885559082, 12)), row.names = c(NA,
-18L), class = c("tbl_df", "tbl", "data.frame"))
An easy way to order your plot is to use the arrange function from dplyr to sort your data frame by the column(s) of your choice, and then to convert the grouping variable (county_label) into a factor whose levels follow that sorted order:
library(dplyr)
library(ggplot2)
library(ggalt) # provides geom_dumbbell()
# Sort the rows by the 2010 value and freeze that order in the factor levels,
# so the counties are drawn in sorted order instead of the factor's original
# level order.
DF %>%
  arrange(adhd_2010) %>%
  mutate(county_label = factor(county_label, levels = unique(county_label))) %>%
  ggplot(aes(x = adhd_2010, xend = adhd_2018,
             y = county_label, group = county_label)) +
  # Thick connecting line between x and xend instead of the default provided
  # by geom_dumbbell; x, xend and y are inherited from ggplot(), only yend has
  # to be mapped here.
  geom_segment(aes(yend = county_label),
               color = "#b2b2b2", size = 1.5) +
  geom_dumbbell(color = "light blue",
                size_x = 3.5,
                size_xend = 3.5,
                # Note: there is no US 'color' alias for UK 'colour'
                # in geom_dumbbell, unlike standard geoms in ggplot().
                colour_x = "forestgreen", # green = 2010
                colour_xend = "red") + # red = 2018
  labs(x = NULL, y = NULL,
       title = "Dumbbell Chart",
       subtitle = "Change in prescription rate: 2010 vs 2018") +
  # Value labels at the 2010 end and at the 2018 end of each dumbbell.
  geom_text(aes(x = adhd_2010, label = adhd_2010),
            color = "black", size = 2, hjust = -0.5) +
  geom_text(aes(x = adhd_2018, label = adhd_2018),
            color = "black", size = 2, hjust = 1.5)
It is not perfect, but at least your values are now ordered. You can then change the column by which the plot is ordered (here I order by adhd_2010).
# Reformat data
# DF2 only supplies the level order: counties sorted by descending 2010 value.
DF2 <- DF %>% arrange(desc(adhd_2010))
# DF3 is the plotting data; `key` holds one constant label for all rows.
DF3 <- DF %>% mutate(key = "Change in Prescription Rate")
DF3$county_label <- factor(DF3$county_label, levels = DF2$county_label)
# Round to three significant digits so the value labels stay short.
DF3$adhd_2018 <- signif(DF3$adhd_2018, digits = 3)
DF3$adhd_2010 <- signif(DF3$adhd_2010, digits = 3)
# Plot
# (the stray empty argument between pt_val and pointsize has been removed)
dumbbell::dumbbell(
  DF3,
  id = "county_label", key = "key",
  column1 = "adhd_2010", column2 = "adhd_2018",
  lab1 = "2010", lab2 = "2018",
  delt = 1, textsize = 2, pt_val = 1, pointsize = 2
) +
  xlim(13, 43) +
  labs(x = NULL, y = NULL, title = "Dumbbell Chart",
       subtitle = "Change in prescription rate: 2010 vs 2018")
I added in a few bells and whistles, just toggle the options to remove
I hope someone finds it useful
Enjoy!

how to use an element of a character vector as a symbol argument to a function using non-standard evaluation := operator

I'm trying to write a function that accepts a character vector of variable names as symbolic arguments.
Here is some data taken from the "fertility" dataset in the questionr package. The important thing is that it includes some columns of labelled data.
library(tidyverse)
library(labelled)
df <- structure(list(id_woman = structure(c(391, 1643, 85, 881, 1981,
1072, 1978, 1607, 738), label = "Woman Id",
format.spss = "F8.0"),
weight = structure(c(1.80315, 1.80315, 1.80315, 1.80315,
1.80315, 0.997934, 0.997934, 0.997934, 0.192455),
label = "Sample weight", format.spss = "F8.2"),
residency = structure(c(2, 2, 2, 2, 2, 2, 2, 2, 2),
label = "Urban / rural residency",
labels = c(urban = 1, rural = 2),
class = "haven_labelled"),
region = structure(c(4, 4, 4, 4, 4, 3, 3, 3, 3), label = "Region",
labels = c(North = 1, East = 2, South = 3, West = 4),
class = "haven_labelled")),
row.names = c(NA, -9L), class = c("tbl_df", "tbl", "data.frame"))
This function simply takes a variable name and converts it from labelled data to a factor.
# Replace one column of the global `df` by its to_factor() version.
# `var` names the column; it is forwarded to mutate() with {{ }} on both sides
# of `:=`, so the result keeps the original column name.
my.func <- function(var) {
  mutate(df, {{ var }} := to_factor({{ var }}))
}
Both of these lines work.
my.func(residency)
my.func("residency")
They return this:
id_woman weight residency region
<dbl> <dbl> <fct> <dbl+lbl>
1 391 1.80 rural 4 [West]
2 1643 1.80 rural 4 [West]
3 85 1.80 rural 4 [West]
4 881 1.80 rural 4 [West]
5 1981 1.80 rural 4 [West]
6 1072 0.998 rural 3 [South]
7 1978 0.998 rural 3 [South]
8 1607 0.998 rural 3 [South]
9 738 0.192 rural 3 [South]
The trouble comes if I try to provide the variable name as part of a vector, like this:
var.names <- c("residency", "region")
my.func(var.names[1])
Error: The LHS of `:=` must be a string or a symbol
Call `rlang::last_error()` to see a backtrace
I tried this, but it also failed.
my.func(rlang::sym(var.names[1]))
Error: The LHS of `:=` must be a string or a symbol
Call `rlang::last_error()` to see a backtrace
In this case, we have to evaluate (!!)
# Unquoting the bare string with `!!` injects the text "residency" itself, so
# to_factor() is applied to that string and the column is filled with the word
# "residency". Converting the string to a symbol first makes it refer to the
# column, exactly as my.func(residency) does.
my.func(!!rlang::sym(var.names[1]))
# A tibble: 9 x 4
# id_woman weight residency region
# <dbl> <dbl> <fct> <dbl+lbl>
#1 391 1.80 rural 4 [West]
#2 1643 1.80 rural 4 [West]
#3 85 1.80 rural 4 [West]
#4 881 1.80 rural 4 [West]
#5 1981 1.80 rural 4 [West]
#6 1072 0.998 rural 3 [South]
#7 1978 0.998 rural 3 [South]
#8 1607 0.998 rural 3 [South]
#9 738 0.192 rural 3 [South]

Use dplyr to filter dataframe by 32 conditions stored in a 2nd dataframe

Let me dive right into a reproducible example here:
Here is the dataframe with these "possession" conditions to be met for each team:
structure(list(conferenceId = c("A10", "AAC", "ACC", "AE", "AS",
"BIG10", "BIG12", "BIGEAST", "BIGSKY", "BIGSOUTH", "BIGWEST",
"COLONIAL", "CUSA", "HORIZON", "IVY", "MAAC", "MAC", "MEAC",
"MVC", "MWC", "NE", "OVC", "PAC12", "PATRIOT", "SEC", "SOUTHERN",
"SOUTHLAND", "SUMMIT", "SUNBELT", "SWAC", "WAC", "WCC"), values = c(25.5,
33.625, 57.65, 16, 20.9, 48.55, 63.9, 45, 17.95, 28, 11, 24.4,
23.45, 10.5, 16, 12.275, 31.5, 10.95, 21.425, 36.8999999999999,
31.025, 18.1, 23.7, 19.675, 52.9999999999997, 24.5, 15, 27.5,
12.6, 17.75, 13, 33)), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -32L))
> head(poss_quantiles)
# A tibble: 6 x 2
conferenceId values
<chr> <dbl>
1 A10 25.5
2 AAC 33.6
3 ACC 57.6
4 AE 16
5 AS 20.9
6 BIG10 48.5
My main dataframe looks as follows:
> head(stats_df)
# A tibble: 6 x 8
season teamId teamName teamMarket conferenceName conferenceId possessions games
<chr> <chr> <chr> <chr> <chr> <chr> <dbl> <int>
1 1819 AFA Falcons Air Force Mountain West MWC 75 2
2 1819 AKR Zips Akron Mid-American MAC 46 3
3 1819 ALA Crimson Tide Alabama Southeastern SEC 90.5 6
4 1819 ARK Razorbacks Arkansas Southeastern SEC 71.5 5
5 1819 ARK Razorbacks Arkansas Southeastern SEC 42.5 5
6 1819 ASU Sun Devils Arizona State Pacific 12 PAC12 91.5 7
> dim(stats_df)
[1] 6426 500
I need to filter the main dataframe stats_df so that each conference's possessions is greater than their respective possession value in the poss_quantiles dataframe. I am struggling to figure out the best way to do this w/ dplyr.
I believe the following is what the question asks for.
I have made up a dataset to test the code. Posted at the end.
library(dplyr)
# Attach each conference's threshold, keep the rows whose possessions exceed
# it, then drop the helper column again. The join key is spelled out, and no
# join back onto stats_df is needed: the filtered rows already carry every
# stats_df column, and re-joining on all columns would duplicate rows whenever
# stats_df contains identical rows.
stats_df %>%
  inner_join(poss_quantiles, by = "conferenceId") %>%
  filter(possessions > values) %>%
  select(-values)
# conferenceId possessions otherCol oneMoreCol
#1 s 119.63695 -1.2519859 1.3853352
#2 d 82.68660 -0.4968500 0.1954866
#3 b 103.58936 -1.0149620 0.9405918
#4 o 139.69607 -0.1623095 0.4832004
#5 q 76.06736 0.5630558 0.1319336
#6 x 86.19777 -0.7733534 2.3939706
#7 p 135.80127 -1.1578085 0.2037951
#8 t 136.05944 1.7770844 0.5145781
Data creation code.
# Fix the RNG seed so the made-up data is reproducible. Every call below
# consumes random numbers, so reordering the statements changes the data.
set.seed(1234)
# Threshold table: 20 distinct lower-case letters as conference ids, each with
# a threshold drawn uniformly between 50 and 100.
poss_quantiles <- data.frame(conferenceId = letters[sample(26, 20)],
values = runif(20, 50, 100),
stringsAsFactors = FALSE)
# Main table: 20 distinct letters as conference ids, possessions drawn
# uniformly between 10 and 150, plus two further random columns.
stats_df <- data.frame(conferenceId = letters[sample(26, 20)],
possessions = runif(20, 10, 150),
otherCol = rnorm(20),
oneMoreCol = rexp(20),
stringsAsFactors = FALSE)

How to group sales data using R?

I am trying to find a way to match new products with the products for which I have historical data. Then I will use the historical data from the previous years' products to make some predictions for the new products.
Please consider the following subset of the data:
# A tibble: 13 x 11
prdct_id prdct_grp_1 prdct_grp_2 prdct_grp_3 prdct_grp_4 Start_season January February March April sales_total
<dbl> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1.00 WUW SW BH B21 2017 2.00 10.0 5.00 4.00 21.0
2 2.00 WUW SW BK R21 2017 7.00 9.00 4.00 5.00 25.0
3 3.00 MUW NW UW P1 2018 6.00 8.00 10.0 6.00 32.0
4 4.00 LNG KW LW L1 2016 8.00 9.00 12.0 7.00 36.0
5 5.00 QKQ MZ KA AQ 2013 10.0 8.67 16.7 8.00 43.3
6 6.00 MUW NW UW P1 2019 0 0 0 0 0
7 7.00 WUW SW BK R21 2019 0 0 0 0 0
8 8.00 LNG NW UW P2 2014 15.1 8.67 28.7 11.0 63.4
9 9.00 QKQ KW LW L2 2016 16.8 8.67 32.7 12.0 70.1
10 10.0 WUW MZ KA AQ 2017 18.5 8.67 36.7 13.0 76.8
11 11.0 QKQ MZ KA AQ 2019 0 0 0 0 0
12 12.0 WUW MZ KA AQ 2019 0 0 0 0 0
13 13.0 MUW NW UW P1 2019 0 0 0 0 0
prdct_grp stands for a product group (for example, prdct_grp_1=WUW means the product is in "women underwear", prdct_grp_2=SW specifies that it is in the "swimwear" group, and so on). If two products are in the same product group at all four levels (prdct_grp_1 to prdct_grp_4), then I will assume that they have very similar sales figures.
I would like to have the following outcome
# A tibble: 3 x 11
new_prdct_id prdct_grp_1 prdct_grp_2 prdct_grp_3 prdct_grp_4 Start_s January February March April sales_total
<chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 6~3 MUW NW UW P1 2019 6.00 8.00 10.0 6.00 32.0
2 7~2 WUW SW BK R21 2019 7.00 9.00 4.00 5.00 25.0
3 11~5 QKQ MZ KA AQ 2019 10.0 9.00 17.0 8.00 43.0
I used tidyverse to have the outcome I wanted to have but the result was not very good.
A further problem arises when a product matches more than one product, or matches another product that also has start season 2019. How could I handle this?
Thank you for your help.
Best
A
Below is a possible dplyr solution along with detailed comments. Please always make sure that your problem is reproducible by providing dput() output or at least a code snippet for creating your dataset.
# import required package
library(dplyr)
# reproduce your data frame (or at least something similar to it)
# please give more details next time
# tibble() replaces the deprecated data_frame(). The February and March values
# follow the table in the question (product 8: February 8.67; products 1 and 2:
# March 5 and 4), so that sales_total agrees with the monthly figures.
prdct_df <- tibble(
  prdct_id = 1:13,
  prdct_grp_1 = c("WUW", "WUW", "MUW", "LNG", "QKQ", "MUW", "WUW", "LNG", "QKQ", "WUW", "QKQ", "WUW", "MUW"),
  prdct_grp_2 = c("SW", "SW", "NW", "KW", "MZ", "NW", "SW", "NW", "KW", "MZ", "MZ", "MZ", "NW"),
  prdct_grp_3 = c("BH", "BK", "UW", "LW", "KA", "UW", "BK", "UW", "LW", "KA", "KA", "KA", "UW"),
  prdct_grp_4 = c("B21", "R21", "P1", "L1", "AQ", "P1", "R21", "P2", "L2", "AQ", "AQ", "AQ", "P1"),
  Start_season = c(2017, 2017, 2018, 2016, 2013, 2019, 2019, 2014, 2016, 2017, 2019, 2019, 2019),
  January = c(2, 7, 6, 8, 10, 0, 0, 15.1, 16.8, 18.5, 0, 0, 0),
  February = c(10, 9, 8, 9, 8.67, 0, 0, 8.67, 8.67, 8.67, 0, 0, 0),
  March = c(5, 4, 10, 12, 16.7, 0, 0, 28.7, 32.7, 36.7, 0, 0, 0),
  April = c(4, 5, 6, 7, 8, 0, 0, 11, 12, 13, 0, 0, 0),
  sales_total = c(21, 25, 32, 36, 43.3, 0, 0, 63.4, 70.1, 76.8, 0, 0, 0)
)
# seasons whose products count as "new"; extend the vector when further
# seasons are added in the future, e.g. c(2019, 2020, 2021)
new_prdct_seasons <- 2019
# keep the new and the historical products in separate data frames
# (optional but clean): rows starting in a new season ...
new_prdct_df <- filter(prdct_df, Start_season %in% new_prdct_seasons)
# ... and all remaining rows
old_prdct_df <- filter(prdct_df, !Start_season %in% new_prdct_seasons)
# match the new and old products to get the data frame you want
# Columns are addressed by name rather than by position, so the result does
# not silently change if a column is added to or reordered in prdct_df.
final_df <- old_prdct_df %>%
  # the old product keeps its sales figures but not its start season
  rename(old_prdct_id = prdct_id) %>%
  select(-Start_season) %>%
  inner_join(
    # only the id, the product groups and the season are needed from the
    # new products
    new_prdct_df %>%
      select(new_prdct_id = prdct_id, starts_with("prdct_grp_"), Start_season),
    # inner join by product group features
    by = c("prdct_grp_1", "prdct_grp_2", "prdct_grp_3", "prdct_grp_4")
  ) %>%
  # reorder the columns
  select(
    new_prdct_id,
    old_prdct_id,
    starts_with("prdct_grp_"),
    Start_season,
    January:sales_total
  )
# we obtained the data frame you asked for
# note that we avoided matches among new products by keeping new and old products in distinct data frames
final_df
# # A tibble: 5 x 12
# new_prdct_id old_prdct_id prdct_grp_1 prdct_grp_2 prdct_grp_3 prdct_grp_4 Start_season January
# <int> <int> <chr> <chr> <chr> <chr> <dbl> <dbl>
# 1 7 2 WUW SW BK R21 2019 7
# 2 6 3 MUW NW UW P1 2019 6
# 3 13 3 MUW NW UW P1 2019 6
# 4 11 5 QKQ MZ KA AQ 2019 10
# 5 12 10 WUW MZ KA AQ 2019 18.5
# # ... with 4 more variables: February <dbl>, March <dbl>, April <dbl>, sales_total <dbl>
# you can also keep only one match per old product if needed: distinct() keeps
# the first new product matched to each old product, which here removes the
# match 13-3 because 6-3 is already present (no hard-coded row number needed)
final_df %>% distinct(old_prdct_id, .keep_all = TRUE)

Resources