Getting cutoff values for each ntile group - r

I have some data which looks like:
# A tibble: 52 × 3
provincia mean_price number_properties
<chr> <dbl> <int>
1 A Coruña 179833. 2508
2 Albacete 148584. 1311
3 Alicante 418148. 22676
4 Almería 142338. 3902
5 Araba - Álava 243705. 786
I am trying to compute the ntiles of the data which I can do.
df %>%
mutate(
bins = ntile(mean_price, 5)
)
So, I want to bin the data into 5 groups. However, I also would like to extract the cutoff points that were used to create each bin. I know the quantiles function would get me what I wanted but I have to pass it 52 as an input parameter (number of observations I have)- However, I would like to have just 5 groups.
Code:
x = df %>%
mutate(
bins = ntile(mean_price, 52),
bins_cutpoints = quantile(mean_price, probs = seq(0, 1, length.out = 52), na.rm = TRUE)
)
How can I get the values for the cutoffs from the mean_price column for each ntile?
Data:
df <- structure(list(provincia = c("A Coruña", "Albacete", "Alicante",
"Almería", "Araba - Álava", "Asturias", "Badajoz", "Barcelona",
"Bizkaia", "Burgos", "Cantabria", "Castellón", "Ceuta", "Ciudad Real",
"Cuenca", "Cáceres", "Cádiz", "Córdoba", "Gipuzkoa", "Girona",
"Granada", "Guadalajara", "Huelva", "Huesca", "Illes Balears",
"Jaén", "La Rioja", "Las Palmas", "León", "Lleida", "Lugo",
"Madrid", "Melilla", "Murcia", "Málaga", "Navarra", "Ourense",
"Palencia", "Pontevedra", "Salamanca", "Santa Cruz de Tenerife",
"Segovia", "Sevilla", "Soria", "Tarragona", "Teruel", "Toledo",
"Valencia", "Valladolid", "Zamora", "Zaragoza", "Ávila"), mean_price = c(179833.167862839,
148583.87109077, 418148.151437643, 142337.792926704, 243704.750636132,
149179.732438607, 147802.894486692, 462813.775190776, 286461.604484305,
134635.540239044, 201214.437926878, 139819.15323646, 204318.181818182,
113856.401715511, 116992.998609179, 143137.441025641, 251270.65520481,
166586.746650426, 340663.440746753, 365733.360551724, 219325.194605466,
173221.569037657, 142430.992912371, 141754.603535354, 749037.540922619,
131396.292428198, 160838.235963581, 656553.777123633, 140312.919506463,
160485.155614973, 129787.667711599, 433665.652781242, 234306.12244898,
188635.237652749, 687127.622056842, 197388.050991501, 137829.640394089,
123450.853813559, 230344.604904632, 159999.046077894, 328912.419004007,
184829.536144578, 180692.511036468, 155759.235955056, 193651.956693536,
128909.006756757, 132623.629553967, 177825.529404212, 165240.297002725,
116820.125531915, 164934.932635983, 124977.724215247), number_properties = c(2508L,
1311L, 22676L, 3902L, 786L, 3502L, 2104L, 35906L, 3345L, 1255L,
2489L, 4542L, 44L, 1399L, 719L, 1170L, 5322L, 5747L, 1232L, 7250L,
16906L, 956L, 1552L, 396L, 5376L, 1915L, 1318L, 2378L, 1702L,
1870L, 638L, 16036L, 49L, 7856L, 15587L, 706L, 1421L, 472L, 2569L,
1823L, 3494L, 498L, 6252L, 89L, 6327L, 296L, 2937L, 15576L, 1468L,
470L, 2390L, 446L)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-52L))

We can use quantile output within cut as mentioned in the comments
library(dplyr)
df %>%
mutate(bins = ntile(mean_price, 52),
bins_cutpoints = quantile(mean_price,
probs = seq(0, 1, length.out = 52), na.rm = TRUE),
cut_bins = cut(mean_price, breaks = bins_cutpoints) )
-output
# A tibble: 52 × 6
provincia mean_price number_properties bins bins_cutpoints cut_bins
<chr> <dbl> <int> <int> <dbl> <fct>
1 A Coruña 179833. 2508 30 113856. (1.778e+05,1.798e+05]
2 Albacete 148584. 1311 19 116820. (1.478e+05,1.486e+05]
3 Alicante 418148. 22676 47 116993. (3.657e+05,4.181e+05]
4 Almería 142338. 3902 15 123451. (1.418e+05,1.423e+05]
5 Araba - Álava 243705. 786 41 124978. (2.343e+05,2.437e+05]
6 Asturias 149180. 3502 20 128909. (1.486e+05,1.492e+05]
7 Badajoz 147803. 2104 18 129788. (1.431e+05,1.478e+05]
8 Barcelona 462814. 35906 49 131396. (4.337e+05,4.628e+05]
9 Bizkaia 286462. 3345 43 132624. (2.513e+05,2.865e+05]
10 Burgos 134636. 1255 10 134636. (1.326e+05,1.346e+05]
# … with 42 more rows

Compute the quintiles then use cut:
x <- quantile(df$mean_price, seq(0, 1, 0.2))
# 0% 20% 40% 60% 80% 100%
# 113856.4 138227.5 157455.2 183174.7 249757.5 749037.5
df <- df %>%
mutate(
bins = ntile(mean_price, 5),
grp = cut(mean_price, breaks = x))
table(df$bins)
# 1 2 3 4 5
# 11 11 10 10 10
table(df$grp)
# (1.14e+05,1.38e+05] (1.38e+05,1.57e+05] (1.57e+05,1.83e+05] (1.83e+05,2.5e+05]
# 10 10 10 10
# (2.5e+05,7.49e+05]
# 11

Related

Leaflet choropleth plot not plotting as expected -

I have some geographical data that I would like to plot in a leaflet plot.
# A tibble: 52 × 2
provincia mean_price
<chr> <dbl>
1 A Coruña 179833.
2 Albacete 148584.
3 Alicante 418148.
4 Almería 142338.
I can get the geographic coordinates from the mapSpain package.
library(mapSpain)
provincias = mapSpain::esp_get_prov() %>%
select(c("cpro", "ine.prov.name", "iso2.prov.name.es", "iso2.prov.name.ca", "iso2.prov.name.ga", "iso2.prov.name.eu"))
This contains a few columns for the names of the regions plus the geographical coordinates.
I want to join my mean_price up with the geographical data - (since the data is not able to 100% match/join I loop over the columns to find the region which "fits". I run the following to join the two data sets up:
main_df <- map(names(provincias)[-1],
~ regex_left_join(df, provincias, by = c("provincia" = .x))) %>%
reduce(power_left_join, by = c("provincia", "mean_price"),
conflict = coalesce_xy)
This gets me my "main" dataframe which looks like:
provincia mean_price cpro ine.prov.name iso2.prov.name.es iso2.prov.name.ca iso2.pr…¹ iso2.…² geometry
<chr> <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <GEOMETRY [°]>
1 A Coruña 179833. 15 Coruña, A La Coruña NA A Coruña NA GEOMETRYCOLLECTION EMPTY
2 Albacete 148584. 02 Albacete Albacete NA NA NA POLYGON ((-0.928884 38.7…
3 Alicante 418148. 03 Alicante/Alacant Alicante Alacant NA NA GEOMETRYCOLLECTION EMPTY
4 Almería 142338. 04 Almería Almería NA NA NA MULTIPOLYGON (((-1.63003…
5 Araba - Álava 243705. 01 Araba/Álava Álava NA NA Araba GEOMETRYCOLLECTION EMPTY
6 Asturias 149180. 33 Asturias Asturias NA NA NA MULTIPOLYGON (((-4.51230…
7 Badajoz 147803. 06 Badajoz Badajoz NA NA NA POLYGON ((-5.046996 38.7…
8 Barcelona 462814. 08 Barcelona Barcelona NA NA NA MULTIPOLYGON (((2.778513…
9 Bizkaia 286462. 48 Bizkaia NA NA NA Bizkaia MULTIPOLYGON (((-2.41284…
10 Burgos 134636. 09 Burgos Burgos NA NA NA MULTIPOLYGON (((-3.14139…
Now, I have my variable of interest mean_price matched with the geographical coordinates. I now want to plot a leaflet map but I am having difficultly.
I run the folllowing code:
main_df = main_df %>%
mutate(
bins = ntile(mean_price, 52) # I cretae bins for the choropleth colour pallet
) %>%
data.frame() %>%
sf::st_as_sf()
pal = colorBin("RdYlBu", domain = main_df$mean_price, bins = main_df$bins)
leaflet() %>%
addProviderTiles(providers$Stamen.Toner) %>%
setView(lng = 2.154007, lat = 41.390205, zoom = 12) %>%
addPolygons(data = main_df,
color = "darkblue",
weight = 1,
smoothFactor = 0.5,
fillOpacity = 0.8,
fillColor = pal(main_df$mean_price),
highlight = highlightOptions(
weight = 5,
color = "grey",
dashArray = "", # keep straightlines
fillOpacity = 0.7,
bringToFront = TRUE
)
)
Error in to_ring.default(x) : Don't know how to get polygon data
from object of class XY,GEOMETRYCOLLECTION,sfg In addition: Warning
messages: 1: In pal(main_df$mean_price) : Some values were outside
the color scale and will be treated as NA 2: sf layer has inconsistent
datum (+proj=longlat +ellps=GRS80 +towgs84=0,0,0,0,0,0,0 +no_defs).
Need '+proj=longlat +datum=WGS84'
Question: How can I correctly plot the cloropleth plot in leaflet? and how can I adjust it to have say 10 bins?
When I adjust it to 10 bins I get this error:
Error in cut.default(x, binsToUse, labels = FALSE, include.lowest =
TRUE, : 'breaks' are not unique
Libraries:
library(leaflet)
library(dplyr)
library(ggmap)
library(rgdal)
library(htmltools)
library(mapview)
library(htmlwidgets)
library(powerjoin)
library(purrr)
library(fuzzyjoin)
Data:
df = structure(list(provincia = c("A Coruña", "Albacete", "Alicante",
"Almería", "Araba - Álava", "Asturias", "Badajoz", "Barcelona",
"Bizkaia", "Burgos", "Cantabria", "Castellón", "Ceuta", "Ciudad Real",
"Cuenca", "Cáceres", "Cádiz", "Córdoba", "Gipuzkoa", "Girona",
"Granada", "Guadalajara", "Huelva", "Huesca", "Illes Balears",
"Jaén", "La Rioja", "Las Palmas", "León", "Lleida", "Lugo",
"Madrid", "Melilla", "Murcia", "Málaga", "Navarra", "Ourense",
"Palencia", "Pontevedra", "Salamanca", "Santa Cruz de Tenerife",
"Segovia", "Sevilla", "Soria", "Tarragona", "Teruel", "Toledo",
"Valencia", "Valladolid", "Zamora", "Zaragoza", "Ávila"), mean_price = c(179833.167862839,
148583.87109077, 418148.151437643, 142337.792926704, 243704.750636132,
149179.732438607, 147802.894486692, 462813.775190776, 286461.604484305,
134635.540239044, 201214.437926878, 139819.15323646, 204318.181818182,
113856.401715511, 116992.998609179, 143137.441025641, 251270.65520481,
166586.746650426, 340663.440746753, 365733.360551724, 219325.194605466,
173221.569037657, 142430.992912371, 141754.603535354, 749037.540922619,
131396.292428198, 160838.235963581, 656553.777123633, 140312.919506463,
160485.155614973, 129787.667711599, 433665.652781242, 234306.12244898,
188635.237652749, 687127.622056842, 197388.050991501, 137829.640394089,
123450.853813559, 230344.604904632, 159999.046077894, 328912.419004007,
184829.536144578, 180692.511036468, 155759.235955056, 193651.956693536,
128909.006756757, 132623.629553967, 177825.529404212, 165240.297002725,
116820.125531915, 164934.932635983, 124977.724215247)), row.names = c(NA,
-52L), class = c("tbl_df", "tbl", "data.frame"))
EDIT:
provincias <- sf::st_transform(provincias, crs = 4326)
n_colors <- 52
my_palette <- colorRampPalette(brewer.pal(n_colors, "RdYlBu"))
pal = colorBin(my_palette(n_colors), domain = spain_summary$mean_price, bins = spain_summary$bins, na.color = "transparent", alpha = 1)
leaflet() %>%
addProviderTiles(providers$Stamen.Toner) %>%
addPolygons(data = provincias,
weight = 1,
smoothFactor = 0.5,
color = "white",
fillOpacity = 0.5,
fillColor = ~pal(spain_summary$mean_price),
stroke = TRUE,
highlight = highlightOptions(
color = "black",
weight = 2,
#dashArray = "",
fillOpacity = 0.8,
bringToFront = TRUE
)
)

Is it possible to pivot_longer multiple columns with one name column and two values columns?

This is how my data looks like with columns matching a gen ID and a timepoint (_01 or _03)
id sample_name placa_ppard_01 placa_pparg_01 placa_nr1h3_01
1 50109018 A2535500018000006001 17.17521 24.10186 22.05157
2 50109019 A2535500019000006001 17.17521 24.10186 22.05157
3 50109025 A2535500025000006001 17.17521 24.10186 22.05157
4 50109026 A2535500026000006001 17.17521 24.10186 22.05157
5 50109027 A2535500027000006001 17.33383 24.38065 21.98426
6 50118001 A2536300001000006001 17.17521 24.10186 22.05157
placa_nr1h2_01 placa_ldlr_03 placa_pcsk9_03 placa_olr1_03 placa_msr1_03
1 18.41557 18.89107 NA 23.93779 20.58086
2 18.41557 18.89107 NA 23.93779 20.58086
3 18.41557 18.89107 NA 23.93779 20.58086
4 18.41557 18.89107 NA 23.93779 20.58086
5 18.37935 19.12857 NA 23.22024 20.36100
6 18.41557 18.89107 NA 23.93779 20.58086
### a small piece of data
df <- structure(list(id = structure(c(130102009, 50203017, 50508027,
140102087, 70201047, 50109027, 130102012, 120715030, 130106034,
120715037, 60901020, 60901023, 140102092, 50203030, 50203004), format.spss = "F10.0", display_width = 10L),
placa_rxrb_01 = structure(c(17.3644166666667, 17.4306458333333,
17.2125, 17.4306458333333, 16.8459791666667, 17.3644166666667,
17.4306458333333, 17.3336875, 17.4306458333333, 17.4459166666667,
17.3644166666667, 17.3644166666667, 17.3336875, 17.2125,
17.4306458333333), label = "lab_exp_genica: ct_cal_placa_RXRB Hs00232774_m1 Ct basal", format.spss = "F10.7", display_width = 10L),
placa_cyp27a1_01 = structure(c(18.1048043478261, 18.0184893617021,
17.9057708333333, 18.0184893617021, 17.6525625, 18.1048043478261,
18.0184893617021, 17.7276875, 18.0184893617021, 17.5819565217391,
18.1048043478261, 18.1048043478261, 17.7276875, 17.9057708333333,
18.0184893617021), label = "lab_exp_genica: ct_cal_placa_CYP27A1 Hs01017992_g1 Ct basal", format.spss = "F11.8", display_width = 11L),
placa_abca1_01 = structure(c(18.0373958333333, 17.7479791666667,
17.9839791666667, 17.7479791666667, 17.7181041666667, 18.0373958333333,
17.7479791666667, 17.8329583333333, 17.7479791666667, 17.9318260869565,
18.0373958333333, 18.0373958333333, 17.8329583333333, 17.9839791666667,
17.7479791666667), label = "lab_exp_genica: ct_cal_placa_ABCA1 Hs01059101_m1 Ct basal", format.spss = "F11.8", display_width = 11L),
placa_scarb1_01 = structure(c(NA, 20.2697659574468, 20.5369583333333,
20.2697659574468, 20.1755208333333, 20.6629111111111, 20.2697659574468,
20.471829787234, 20.2697659574468, 20.2979166666667, 20.6629111111111,
20.6629111111111, 20.471829787234, 20.5369583333333, 20.2697659574468
), label = "lab_exp_genica: ct_cal_placa_SCARB1 Hs00969821_m1 Ct basal", format.spss = "F10.7", display_width = 10L),
placa_cav1_01 = structure(c(23.9575208333333, 23.5285625,
24.01425, 23.5285625, 23.1588541666667, 23.9575208333333,
23.5285625, 23.5898125, 23.5285625, 23.513875, 23.9575208333333,
23.9575208333333, 23.5898125, 24.01425, 23.5285625), label = "lab_exp_genica: ct_cal_placa_CAV1 Hs00971716_m1 Ct basal", format.spss = "F10.7", display_width = 10L),
placa_nfkb1_01 = structure(c(16.8749583333333, 16.6195, 16.7741041666667,
16.6195, 16.1903125, 16.8749583333333, 16.6195, 16.608375,
16.6195, NA, 16.8749583333333, 16.8749583333333, 16.608375,
16.7741041666667, 16.6195), label = "lab_exp_genica: ct_cal_placa_NFKB1 Hs00765730_m1 Ct basal", format.spss = "F10.7", display_width = 10L),
placa_tgfb2_03 = structure(c(24.0563043478261, 23.6882553191489,
24.1700416666667, 23.6882553191489, 23.4935416666667, 24.0563043478261,
23.6882553191489, 24.02575, 23.6882553191489, 23.7865208333333,
24.0563043478261, 24.0563043478261, 24.02575, 24.1700416666667,
23.6882553191489), label = "lab_exp_genica: ct_cal_placa_TGFB2 Hs00234244_m1 Ct 1 year", format.spss = "F11.8", display_width = 11L),
placa_ido_03 = structure(c(19.5457916666667, 19.4714375,
19.6080212765957, 19.4714375, 18.3972708333333, 19.5457916666667,
19.4714375, 19.4884166666667, 19.4714375, 19.070447368421,
19.5457916666667, 19.5457916666667, 19.4884166666667, 19.6080212765957,
19.4714375), label = "lab_exp_genica: ct_cal_placa_IDO Hs00984148_m1 Ct 1 year", format.spss = "F11.8", display_width = 11L),
placa_cdkn2a_03 = structure(c(21.1584583333333, 21.0697291666667,
21.1403541666667, 21.0697291666667, 20.7684680851064, 21.1584583333333,
21.0697291666667, 21.1548125, 21.0697291666667, 21.0899189189189,
21.1584583333333, 21.1584583333333, 21.1548125, 21.1403541666667,
21.0697291666667), label = "lab_exp_genica: ct_cal_placa_CDKN2A Hs00923894_m1 Ct 1 year", format.spss = "F11.8", display_width = 11L),
placa_abcg1_03 = structure(c(18.7376808510638, 18.3815957446809,
18.6998333333333, 18.3815957446809, 18.1750833333333, 18.7376808510638,
18.3815957446809, 18.5114893617021, 18.3815957446809, 18.3424375,
18.7376808510638, 18.7376808510638, 18.5114893617021, 18.6998333333333,
18.3815957446809), label = "lab_exp_genica: ct_cal_placa_ABCG1 Hs00245154_m1 Ct 1 year", format.spss = "F10.7", display_width = 10L),
placa_abcg4_03 = structure(c(NA, NA, NA, NA, NA, NA, NA,
NA, NA, 27.677, NA, NA, NA, NA, NA), label = "lab_exp_genica: ct_cal_placa_ABCG4 Hs00223446_m1 Ct 1 year", format.spss = "F6.3"),
placa_nfe2l2_03 = structure(c(17.2223404255319, 16.8399787234043,
17.1038541666667, 16.8399787234043, 16.5560208333333, 17.2223404255319,
16.8399787234043, 16.9331063829787, 16.8399787234043, 17.1034166666667,
17.2223404255319, 17.2223404255319, 16.9331063829787, 17.1038541666667,
16.8399787234043), label = "lab_exp_genica: ct_cal_placa_NFE2L2 Hs00975961_g1 Ct 1 year", format.spss = "F10.7", display_width = 10L)), row.names = c(NA,
-15L), class = c("tbl_df", "tbl", "data.frame"))
As you can see the pattern is placa_gen_time. I want something like this:
gen_cal cal01 cal03
ppard 17.17521 NA
pparg 24.10186 NA
olr1 NA 23.93779
msr1 NA 20.58086
I guess it could be done in more than one step with sthg like this (genes2 my database). But I am pretty sure that pivot_longer function allows to transform the colnames into 2 columns for values and one column for the name (=genes)
genes2 <- genes2 %>% pivot_longer(cols = matches("placa_"), names_to = c("gen_cal", "time_cal"), names_sep = "_0",values_to = "Ct_cal")
> head
gen_cal time_cal Ct_cal
<chr> <chr> <dbl>
1 ppara 1 19.7
2 ppard 1 17.2
3 pparg 1 24.1
4 nr1h3 1 22.1
5 nr1h2 1 18.4
6 rxra 1 14.3
# And then pivot_wider the column time and Ct_cal into cal01 and cal03
We need to reshape twice, wide-to-long, split string, then reshape it again long-to-wide, using data.table:
library(data.table)
setDT(df)
dcast(
melt(df, id.vars = "id")[, c("gene", "time") := tstrsplit(variable, split = "_", fixed = TRUE, keep = 2:3)],
id + gene ~ time)
# id gene 01 03
# 1: 50109027 abca1 18.03740 NA
# 2: 50109027 abcg1 NA 18.73768
# 3: 50109027 abcg4 NA NA
# 4: 50109027 cav1 23.95752 NA
# 5: 50109027 cdkn2a NA 21.15846
# ---
# 176: 140102092 nfe2l2 NA 16.93311
# 177: 140102092 nfkb1 16.60837 NA
# 178: 140102092 rxrb 17.33369 NA
# 179: 140102092 scarb1 20.47183 NA
# 180: 140102092 tgfb2 NA 24.02575
Seems like you are looking for .value in the names_to argument.
library(tidyverse)
df %>%
pivot_longer(starts_with("placa"), names_to = c("gene", ".value"),
names_pattern = "placa_(.*)_(.*)") %>%
setNames(c("id", "gene", "cal01", "cal03"))
Update
If we do not use setNames after the above pivot_longer, the pivoted column names would only contain numbers, which is not allowed in base R data frame (no problem for tibble).
# A tibble: 180 × 4
id gene `01` `03`
<dbl> <chr> <dbl> <dbl>
1 130102009 rxrb 17.4 NA
A workaround for the column names is to rename them prior to pivot_longer to attach the string "cal" before the "0".
df %>%
rename_with(.cols = contains("_0"), ~sub("_0", "_cal0", .x)) %>%
pivot_longer(starts_with("placa"), names_to = c("gene", ".value"),
names_pattern = "placa_(.*)_(.*)")
Output
# A tibble: 180 × 4
id gene cal01 cal03
<dbl> <chr> <dbl> <dbl>
1 130102009 rxrb 17.4 NA
2 130102009 cyp27a1 18.1 NA
3 130102009 abca1 18.0 NA
4 130102009 scarb1 NA NA
5 130102009 cav1 24.0 NA
6 130102009 nfkb1 16.9 NA
7 130102009 tgfb2 NA 24.1
8 130102009 ido NA 19.5
9 130102009 cdkn2a NA 21.2
10 130102009 abcg1 NA 18.7
# … with 170 more rows
# ℹ Use `print(n = ...)` to see more rows

Is there a way to create a correlation matrix for showing correlations among the average store sales for all product categories?

I'm trying to create a correlation matrix for showing correlations among the average store sales for all product categories. The Product Categories are columns 10-18.
Here is my head(df2):
> head(df2)
store city region province size revenue units cost gross_profit promo_units energy_units regularBars_units
1 105 BROCKVILLE ONTARIO ON 496 984.70 470.46 590.73 393.97 210.23 72.13 38.63
2 117 BURLINGTON ONTARIO ON 875 2629.32 1131.38 1621.58 1007.74 401.46 192.77 75.04
3 122 BURLINGTON ONTARIO ON 691 2786.73 1229.46 1709.45 1077.27 450.04 240.48 93.73
4 123 BURLINGTON ONTARIO ON 763 2834.49 1257.63 1719.61 1114.88 476.83 194.21 99.44
5 182 DON MILLS ONTARIO ON 784 4118.36 1949.50 2485.83 1632.53 664.71 199.73 175.48
7 186 NORTH YORK ONTARIO ON 966 8195.26 3695.46 5069.99 3125.27 1143.33 419.19 271.58
gum_units bagpegCandy_units isotonics_units singleServePotato_units takeHomePotato_units kingBars_units flatWater_units
1 29.29 13.38 20.69 18.60 7.71 17.87 56.54
2 55.85 42.15 87.62 36.44 33.46 47.44 98.42
3 64.27 29.85 105.65 47.96 19.90 45.21 130.27
4 73.25 54.15 118.19 39.67 22.10 45.33 132.77
5 145.81 68.06 109.35 85.71 42.33 79.81 204.06
7 212.42 153.90 166.37 130.79 136.79 114.50 328.63
psd591Ml_units
1 39.71
2 38.73
3 47.31
4 39.87
5 50.29
7 112.38
Here is my dput(df2):
> dput(head(df2,4))
structure(list(store = c(105L, 117L, 122L, 123L), city = c("BROCKVILLE",
"BURLINGTON", "BURLINGTON", "BURLINGTON"), region = c("ONTARIO",
"ONTARIO", "ONTARIO", "ONTARIO"), province = c("ON", "ON", "ON",
"ON"), size = c(496L, 875L, 691L, 763L), revenue = c(984.7, 2629.32,
2786.73, 2834.49), units = c(470.46, 1131.38, 1229.46, 1257.63
), cost = c(590.73, 1621.58, 1709.45, 1719.61), gross_profit = c(393.97,
1007.74, 1077.27, 1114.88), promo_units = c(210.23, 401.46, 450.04,
476.83), energy_units = c(72.13, 192.77, 240.48, 194.21), regularBars_units = c(38.63,
75.04, 93.73, 99.44), gum_units = c(29.29, 55.85, 64.27, 73.25
), bagpegCandy_units = c(13.38, 42.15, 29.85, 54.15), isotonics_units = c(20.69,
87.62, 105.65, 118.19), singleServePotato_units = c(18.6, 36.44,
47.96, 39.67), takeHomePotato_units = c(7.71, 33.46, 19.9, 22.1
), kingBars_units = c(17.87, 47.44, 45.21, 45.33), flatWater_units = c(56.54,
98.42, 130.27, 132.77), psd591Ml_units = c(39.71, 38.73, 47.31,
39.87)), na.action = structure(c(`6` = 6L, `169` = 169L, `173` = 173L,
`177` = 177L, `182` = 182L, `191` = 191L, `193` = 193L, `195` = 195L,
`196` = 196L, `198` = 198L, `204` = 204L, `277` = 277L, `385` = 385L,
`452` = 452L, `527` = 527L, `601` = 601L), class = "omit"), row.names = c(NA,
4L), class = "data.frame")
you can try with:
cor(df2[,10:18])
which will give you the following output :
promo_units energy_units regularBars_units gum_units bagpegCandy_units isotonics_units
promo_units 1.0000000 0.9341821 0.9910344 0.9909434 0.8449146 0.9993738
energy_units 0.9341821 1.0000000 0.9174646 0.8830945 0.6324464 0.9223899
regularBars_units 0.9910344 0.9174646 1.0000000 0.9929161 0.8075351 0.9932908
gum_units 0.9909434 0.8830945 0.9929161 1.0000000 0.8711067 0.9950740
bagpegCandy_units 0.8449146 0.6324464 0.8075351 0.8711067 1.0000000 0.8532315
isotonics_units 0.9993738 0.9223899 0.9932908 0.9950740 0.8532315 1.0000000
singleServePotato_units 0.9317931 0.9922160 0.9317428 0.8901737 0.5995540 0.9225072
takeHomePotato_units 0.6708652 0.6744549 0.5657719 0.6040891 0.7302509 0.6543614
kingBars_units 0.9459992 0.9363726 0.8960974 0.9021115 0.8223868 0.9360713
singleServePotato_units takeHomePotato_units kingBars_units
promo_units 0.9317931 0.6708652 0.9459992
energy_units 0.9922160 0.6744549 0.9363726
regularBars_units 0.9317428 0.5657719 0.8960974
gum_units 0.8901737 0.6040891 0.9021115
bagpegCandy_units 0.5995540 0.7302509 0.8223868
isotonics_units 0.9225072 0.6543614 0.9360713
singleServePotato_units 1.0000000 0.5804878 0.8960888
takeHomePotato_units 0.5804878 1.0000000 0.8649255
kingBars_units 0.8960888 0.8649255 1.0000000
Explanation:
You are using cor() function which calculate the correlation between the elements its receive as inputs. In this case, the input is df2[,10:18], which are the columns 10 to 18 of your df2 dataframe.

R Combinig two For Loops (Nested For Loop)

I've been trying to combine the two For Loops into a single loop.
Loop 1:
Unique.Order.Comment <- unique(df2$Rebuilt.Order.Comment)
length(Unique.Order.Comment)
#loop for the calculations
for (i in 1:length(Unique.Order.Comment)) {
#a <- i-11
#c[i] <- print(sum(n.Cases.per.month$nCases[a:i]))
a <- subset.data.frame(Rebuilt.Data, Rebuilt.Order.Comment == Unique.Order.Comment[i])
assign(Unique.Order.Comment[i],a)
}
Loop 2:
#loop for the calculations
c <- rep(0, nrow(BR))
for (ii in 1:nrow(BR)) {
if (ii < 12){
print(0)
}else {
a <- ii-11
c[ii] <- print(sum(BR$Number.Cases.Authorised[a:ii]))
}
}
c <- data.frame(c)
c <- c %>%
rename(
n.Seen.Cum = c
)
#View(c)
BR <- cbind(BR,c)
The BR need to be Unique.Order.Comment[i] in Loop 2.
What I believe/hope it would look like should be the below.
But I get the error message Error in rep(0, nrow(Unique.Order.Comment[i])) : invalid 'times' argument
(What I think it should look like)
Unique.Order.Comment <- unique(df2$Rebuilt.Order.Comment)
length(Unique.Order.Comment)
#loop for the calculations
for (i in 1:length(Unique.Order.Comment)) {
#a <- i-11
#c[i] <- print(sum(n.Cases.per.month$nCases[a:i]))
a <- subset.data.frame(Rebuilt.Data, Rebuilt.Order.Comment == Unique.Order.Comment[i])
assign(Unique.Order.Comment[i],a)
#loop for the calculations
c <- rep(0, nrow(Unique.Order.Comment[i]))
for (ii in 1:nrow(Unique.Order.Comment[i])) {
if (ii < 12){
print(0)
}else {
a <- ii-11
c[ii] <- print(sum(Unique.Order.Comment[i]$Number.Cases.Authorised[a:ii]))
}
}
c <- data.frame(c)
c <- c %>%
rename(
n.Seen.Cum = c
)
#View(c)
Unique.Order.Comment[i] <- cbind(Unique.Order.Comment[i],c)
}
Edit example data:
dput(Unique.Order.Comment)
c("CN", "DM", "DR", "FF", "PG", "HN", "SK", "GI", "GYN", "BR",
"UR", "LYMPH", "HPB", "BST", "ENDOC", "PAEDGI", "CT", "PERI",
"NEURO", "MOHS", "ICC", "RE", "PAED", "MN", "EMR", "PR", "LBX",
"HAEM", "CTT", "UGI", "NEUR", "URGI", "GYNAE")
dput(head(Rebuilt.Data))
structure(list(Rebuilt.Order.Comment = c("BR", "BR", "BR", "BR",
"BR", "BR"), Period.Received = c("2019-01", "2019-02", "2019-03",
"2019-04", "2019-05", "2019-06"), Number.Cases.Received = c(838L,
730L, 778L, 832L, 574L, 626L), Number.Cases.Authorised = c(680L,
587L, 896L, 715L, 761L, 554L), Number.Cases.Authorised.Less7Days = c(550L,
343L, 520L, 389L, 393L, 374L), Number.Cases.Authorised.Less10.Days = c(628L,
475L, 723L, 595L, 555L, 474L), Percentage.Authorsied.Less7Days = c(0.808823529411765,
0.584327086882453, 0.580357142857143, 0.544055944055944, 0.516425755584757,
0.675090252707581), Percentage.Authorsied.Less10Days = c(0.923529411764706,
0.809199318568995, 0.806919642857143, 0.832167832167832, 0.729303547963206,
0.855595667870036), Avg.TaT.for.Authorised.Cases = structure(c(5.26470588235294,
8.74616695059625, 8.34709821428571, 8.09370629370629, 12.826544021025,
6.22021660649819), class = "difftime", units = "days"), MDM.Received = c(2L,
13L, 2L, NA, NA, 5L), MDM.Received.Avg.TAT = structure(c(5, 29.2307692307692,
0.5, NA, NA, 5.4), class = "difftime", units = "days"), So.Received = c(NA,
1L, NA, 1L, NA, 2L), So.Received.Avg.TAT = structure(c(NA, 14,
NA, 9, NA, 54), class = "difftime", units = "days")), row.names = c(NA,
6L), class = "data.frame")
if I place print(Unique.Order.Comment[i]) before the second seperate loop I get:
"CN"
In theory the first loop subsets data based upon a unique list of Order.Comment (which it can do).
Then it does a cumlative sum and this gets cbind onto the subsetted data.
First, it is easier to help if you provide a small example along with the your expected output. You can share your original data removing the columns which are not necessary to the question or create a fake dataset which is similar to your original data.
Second, I think you are overcomplicating this. It is never a good idea to create multiple datasets in your global environment. They are very difficult to manage and unnecessary pollute the global environment. You can use lists instead.
In this case I don't think we need to split the datasets in different lists as we have different packages that can perform rolling calculations. For example, below I have used zoo package which has rollsumr function.
library(dplyr)
library(zoo)
df <- df %>%
group_by(Rebuilt.Order.Comment) %>%
mutate(n.Seen.Cum = rollsumr(Number.Cases.Authorised, 12, fill = 0)) %>%
ungroup
df
# Rebuilt.Order.Comment Period.Received Number.Cases.Authorised n.Seen.Cum
# <chr> <chr> <int> <int>
# 1 BR 2019-01 680 0
# 2 BR 2019-02 587 0
# 3 BR 2019-03 896 0
# 4 BR 2019-04 715 0
# 5 BR 2019-05 761 0
# 6 BR 2019-06 554 0
# 7 BR 2019-07 843 0
# 8 BR 2019-08 815 0
# 9 BR 2019-09 704 0
#10 BR 2019-10 939 0
#11 BR 2019-11 834 0
#12 BR 2019-12 880 9208
#13 BR 2020-01 801 9329
#14 BR 2020-02 610 9352
#15 BR 2020-03 853 9309
I think I see what you are aiming for, but I may have missed something. Let me know, and I can edit.
From what I can tell, you only need one loop, and instead of assign()ing a bunch of dataframes, you can iteratively build a summary table.
edit
The other answer here is quite elegant! I'm updating my answer based on your new comments just for fun. Not sure why we have different n.Seen.Cum values...
df2 <- structure(list(
Rebuilt.Order.Comment = c("BR", "BR", "BR", "BR", "BR", "BR", "BR", "BR", "BR", "BR", "BR", "BR", "BR", "BR", "BR" ),
Period.Received = c("2019-01", "2019-02", "2019-03", "2019-04", "2019-05", "2019-06", "2019-07", "2019-08", "2019-09", "2019-10", "2019-11", "2019-12", "2020-01", "2020-02", "2020-03"),
Number.Cases.Authorised = c(680L, 587L, 896L, 715L, 761L, 554L, 843L, 815L, 704L, 939L, 834L, 880L, 801L, 610L, 853L),
n.Seen.Cum = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9208, 9329, 9352, 9309)),
row.names = c(NA, 15L), class = "data.frame")
# This will hold results
output <- list()
# Loop over this vector
Unique.Order.Comment <- unique(df2$Rebuilt.Order.Comment)
for(comment in Unique.Order.Comment){
# Temporary dataframe that is subset of 'df2'
temp <- df2[df2$Rebuilt.Order.Comment == comment,]
# We can do arithmetic with dates that have days
temp$Period.Received2 <- as.Date(paste(temp$Period.Received, "-01", sep=""))
# Calculate cumsum after 333 days have passed
temp$n.Seen.cum2 <- ifelse(
test = temp$Period.Received2 - min(temp$Period.Received2) > 333,
yes = cumsum(temp$Number.Cases.Authorised),
no = NA)
# better
output[[comment]] <- temp
# quick and dirty
# assign(x = comment, value = temp)
}
output[[1]]
#> Rebuilt.Order.Comment Period.Received Number.Cases.Authorised n.Seen.Cum
#> 1 BR 2019-01 680 0
#> 2 BR 2019-02 587 0
#> 3 BR 2019-03 896 0
#> 4 BR 2019-04 715 0
#> 5 BR 2019-05 761 0
#> 6 BR 2019-06 554 0
#> 7 BR 2019-07 843 0
#> 8 BR 2019-08 815 0
#> 9 BR 2019-09 704 0
#> 10 BR 2019-10 939 0
#> 11 BR 2019-11 834 0
#> 12 BR 2019-12 880 9208
#> 13 BR 2020-01 801 9329
#> 14 BR 2020-02 610 9352
#> 15 BR 2020-03 853 9309
#> Period.Received2 n.Seen.cum2
#> 1 2019-01-01 NA
#> 2 2019-02-01 NA
#> 3 2019-03-01 NA
#> 4 2019-04-01 NA
#> 5 2019-05-01 NA
#> 6 2019-06-01 NA
#> 7 2019-07-01 NA
#> 8 2019-08-01 NA
#> 9 2019-09-01 NA
#> 10 2019-10-01 NA
#> 11 2019-11-01 NA
#> 12 2019-12-01 9208
#> 13 2020-01-01 10009
#> 14 2020-02-01 10619
#> 15 2020-03-01 11472
If you have multiple years and want the cumulative sum to reset, update the test parameter in ifelse() to include some max number of days.

adding half of the value of one row by different groups in a dataframe

I have the following dataframe :
structure(list(trial = c("ES1-7", "ES1-7", "ES1-7", "ES14-25",
"ES14-25", "ES14-25", "ES26-38", "ES26-38", "ES26-38", "ES8-13",
"ES8-13", "ES8-13", "SA1-13", "SA1-13", "SA1-13", "SA14-25",
"SA14-25", "SA14-25"), marker = c("0", "1", "2", "0", "1", "2",
"0", "1", "2", "0", "1", "2", "0", "1", "2", "0", "1", "2"),
n = c(873L, 269L, 2114L, 2300L, 673L, 5959L, 3410L, 1200L,
7930L, 599L, 173L, 1538L, 2834L, 509L, 6953L, 3082L, 757L,
6875L), prop = c(0.268120393120393, 0.0826167076167076, 0.649262899262899,
0.257501119570085, 0.0753470667263771, 0.667151813703538,
0.271929824561404, 0.0956937799043062, 0.63237639553429,
0.259307359307359, 0.0748917748917749, 0.665800865800866,
0.275252525252525, 0.0494366744366744, 0.6753108003108, 0.287661004293448,
0.0706552174724659, 0.641683778234086)), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -18L), vars = "trial",
labels = structure(list(
trial = c("ES1-7", "ES14-25", "ES26-38", "ES8-13", "SA1-13",
"SA14-25")), row.names = c(NA, -6L), class = "data.frame", vars = "trial",
drop = TRUE), indices = list(
0:2, 3:5, 6:8, 9:11, 12:14, 15:17), drop = TRUE, group_sizes = c(3L,
3L, 3L, 3L, 3L, 3L), biggest_group_size = 3L)
It looks like that :
# A tibble: 6 x 4
# Groups: trial [2]
trial marker n prop
<chr> <chr> <int> <dbl>
1 ES1-7 0 873 0.268
2 ES1-7 1 269 0.0826
3 ES1-7 2 2114 0.649
4 ES14-25 0 2300 0.258
5 ES14-25 1 673 0.0753
6 ES14-25 2 5959 0.667
I want to group_by per trial and add half of the prop value when marker equals 1 to the other rows when marker equals 0 or 2.
For example when grouping by ES1-7 I would have the following prop value: 0.268+(0.0826/2) when marker equals 0.
An expected output for the groups ES1-7 and ES14-25 would be :
# A tibble: 6 x 4
# Groups: trial [2]
trial marker n prop
<chr> <chr> <int> <dbl>
1 ES1-7 0 873 0.268+(0.0826/2)
2 ES1-7 1 269 0.0826
3 ES1-7 2 2114 0.649+(0.0826/2)
4 ES14-25 0 2300 0.258+(0.0753/2)
5 ES14-25 1 673 0.0753
6 ES14-25 2 5959 0.667+(0.0753/2)
Also a dplyr possibility:
df %>%
group_by(trial) %>%
mutate(prop = ifelse(marker != 1, prop + prop[marker == 1]/2, prop))
trial marker n prop
<chr> <chr> <int> <dbl>
1 ES1-7 0 873 0.309
2 ES1-7 1 269 0.0826
3 ES1-7 2 2114 0.691
4 ES14-25 0 2300 0.295
5 ES14-25 1 673 0.0753
6 ES14-25 2 5959 0.705
7 ES26-38 0 3410 0.320
8 ES26-38 1 1200 0.0957
9 ES26-38 2 7930 0.680
10 ES8-13 0 599 0.297
Did it in two steps.
First grouped by trial and filtered on marker ==1, calculated 1/2 prop.
Then joined this with original data.frame using trial as key and
calculated prop.new = prop +prop.half.
This can be simplified to one step process, but figured this would be good start for you to experiment more with the code
esl, of course is your data.frame
esl %>% group_by(trial) %>%
filter(marker ==1) %>%
mutate(prop.half = prop/2) %>% select(trial,marker,prop.half) -> esl.half
left_join(esl, esl.half, by="trial") %>%
mutate(prop.new = prop+prop.half)

Resources