I'm trying to find the R procedure that is equivalent to Google Sheets' Pivot Group Rule. That is, I want to summarize that data by discretizing a numerical column with a fixed interval size that I decide on.
I am almost getting the desired output, but am having a trouble with the "(a,b]" interval notation.
Example
df <-
data.frame(
num_col = c(1400,9000,15000,17350,20000,22000,
25000,40000,42000,45000,50000,60000,65000,70000,75000,
1e+05,120000,125000,150000,168000,180000,2e+05,225000,
250000,270000,290000,3e+05,350000,4e+05,427000,450000,5e+05,
550000,560000,6e+05,625000,650000,7e+05,750000,8e+05,
850000,9e+05,913000,930000,950000,990000,1e+06,1066167,
1100000,1200000,1250000,1300000,1400000,1420000,1500000,
1700000,1750000,1800000,1900000,1950000,2e+06,2100000,
2300000,2400000,2450000,2500000,3e+06,3150000,3200000,
3300000,3400000,3440000,3500000,3660000,3800000,3850000,
4e+06,4400000,4500000,4600000,4700000,4800000,4900000,5e+06,
5500000,6e+06,6400000,6500000,6600000,6800000,6900000,
7e+06,7200000,7217600,7400000,7500000,7700000,8e+06,
8200000,8495000,8500000,8700000,8900000,9e+06,9200000,9500000,
9600000,1e+07,10500000,10818775,1.1e+07,11500000,
1.2e+07,12500000,12620000,1.3e+07,13200000,13400000,13500000,
1.4e+07,14500000,14800000,1.5e+07,1.6e+07,1.7e+07,17500000,
1.8e+07,18026148,18500000,1.9e+07,19500000,19800000,
19900000,2e+07,2.1e+07,2.2e+07,22500000,2.3e+07,2.4e+07,
2.5e+07,25500000,2.6e+07,2.7e+07,27220000,2.8e+07,2.9e+07,
3e+07,30300000,3.1e+07,31500000,3.2e+07,32500000,3.3e+07,
3.4e+07,3.5e+07,3.6e+07,3.7e+07,3.8e+07,38600000,3.9e+07,
39200000,4e+07,4.1e+07,4.2e+07,4.3e+07,4.4e+07,44500000,
4.5e+07,4.6e+07,4.7e+07,4.8e+07,4.9e+07,49900000,5e+07,
50100000,50200000,5.2e+07,5.3e+07,5.5e+07,5.6e+07,5.7e+07,
5.8e+07,58800000,6e+07,6.1e+07,6.3e+07,6.5e+07,6.6e+07,
6.8e+07,68005000,6.9e+07,7e+07,7.3e+07,7.4e+07,7.5e+07,
7.6e+07,7.8e+07,7.9e+07,8e+07,81200000,8.2e+07,8.4e+07,
8.5e+07,8.8e+07,9e+07,9.2e+07,9.3e+07,9.4e+07,9.5e+07,
9.9e+07,1e+08,1.02e+08,1.03e+08,1.05e+08,1.08e+08,1.1e+08,
1.12e+08,1.15e+08,1.17e+08,1.2e+08,1.25e+08,1.27e+08,
1.3e+08,1.32e+08,1.35e+08,1.4e+08,1.44e+08,1.45e+08,1.5e+08,
1.55e+08,1.6e+08,1.65e+08,1.7e+08,1.75e+08,1.76e+08,
1.78e+08,1.8e+08,1.85e+08,1.9e+08,1.95e+08,2e+08,2.09e+08,
2.1e+08,2.15e+08,2.2e+08,2.25e+08,2.3e+08,2.45e+08,2.5e+08,
2.6e+08,263700000,6e+08),
val = c(1,1,1,1,2,1,1,1,1,1,4,3,1,2,2,
8,1,4,4,1,1,7,1,11,1,1,6,2,2,1,3,21,1,1,3,
1,3,1,3,1,1,3,1,1,2,1,24,1,6,8,1,3,2,1,13,
1,1,4,1,1,22,3,1,1,1,13,27,1,2,3,2,1,12,1,1,
1,20,2,3,1,2,1,1,44,2,12,1,4,1,1,1,21,1,1,1,
3,1,15,1,1,5,1,1,8,1,2,1,43,1,1,11,1,24,2,
1,15,1,1,2,8,1,1,34,9,16,1,15,1,1,6,1,1,1,55,
3,11,1,4,5,40,1,9,3,1,14,3,38,1,3,1,7,1,2,
3,34,5,6,6,1,1,1,38,1,6,1,3,1,8,1,1,1,1,1,
25,1,1,3,1,11,1,1,5,1,18,4,1,12,2,4,1,2,11,1,
2,9,1,2,2,14,1,1,1,5,1,9,2,1,1,5,1,16,1,1,
3,1,8,1,2,1,8,7,1,8,1,8,4,1,6,14,2,4,6,8,4,
1,2,3,2,5,2,12,1,1,2,1,3,1,2,6,1,1,1)
)
look at the data
tibble::as_tibble(df)
#> # A tibble: 252 x 2
#> num_col val
#> <dbl> <dbl>
#> 1 1400 1
#> 2 9000 1
#> 3 15000 1
#> 4 17350 1
#> 5 20000 2
#> 6 22000 1
#> 7 25000 1
#> 8 40000 1
#> 9 42000 1
#> 10 45000 1
#> # ... with 242 more rows
desired output
desired_output <-
tibble::tribble(
~num_col_interval, ~val_sum,
"0 - 49999999", 962L,
"50000000 - 99999999", 164L,
"100000000 - 149999999", 78L,
"150000000 - 199999999", 53L,
"200000000 - 249999999", 23L,
"250000000 - 299999999", 8L,
"600000000 - 649999999", 1L
)
My attempt
library(dplyr)
library(ggplot2)
df |>
group_by(num_col_interval = ggplot2::cut_interval(num_col, length = 50000000 - 1, dig.lab = 10)) |>
summarise(across(val, sum))
#> # A tibble: 7 x 2
#> num_col_interval val
#> <fct> <dbl>
#> 1 [0,49999999] 962
#> 2 (49999999,99999998] 164
#> 3 (99999998,149999997] 78
#> 4 (149999997,199999996] 53
#> 5 (199999996,249999995] 23
#> 6 (249999995,299999994] 8
#> 7 (599999988,649999987] 1
You can see that the interval boundaries overlap. In the first row, it ranges 0 to 49999999, and in the second row, it ranges 49999999 to 99999998. I do understand the difference between ] and ( in the breaks notation. Nevertheless, I wish the ranges in the num_col_interval column to be as in desired_output.
How can I programatically format the num_col_interval values to be as in desired_output?
I'm mostly looking for a straightforward dplyr solution.
Here's how I would do it with Google Sheets, getting the desired output:
Several SO posts are relevant, but none of them answered my question:
How does cut with breaks work in R
Cut function in R - exclusive or am I double counting?
Cut by Defined Interval
Try this:
library(tidyverse)
df <-
data.frame(
num_col = c(1400,9000,15000,17350,20000,22000,
25000,40000,42000,45000,50000,60000,65000,70000,75000,
1e+05,120000,125000,150000,168000,180000,2e+05,225000,
250000,270000,290000,3e+05,350000,4e+05,427000,450000,5e+05,
550000,560000,6e+05,625000,650000,7e+05,750000,8e+05,
850000,9e+05,913000,930000,950000,990000,1e+06,1066167,
1100000,1200000,1250000,1300000,1400000,1420000,1500000,
1700000,1750000,1800000,1900000,1950000,2e+06,2100000,
2300000,2400000,2450000,2500000,3e+06,3150000,3200000,
3300000,3400000,3440000,3500000,3660000,3800000,3850000,
4e+06,4400000,4500000,4600000,4700000,4800000,4900000,5e+06,
5500000,6e+06,6400000,6500000,6600000,6800000,6900000,
7e+06,7200000,7217600,7400000,7500000,7700000,8e+06,
8200000,8495000,8500000,8700000,8900000,9e+06,9200000,9500000,
9600000,1e+07,10500000,10818775,1.1e+07,11500000,
1.2e+07,12500000,12620000,1.3e+07,13200000,13400000,13500000,
1.4e+07,14500000,14800000,1.5e+07,1.6e+07,1.7e+07,17500000,
1.8e+07,18026148,18500000,1.9e+07,19500000,19800000,
19900000,2e+07,2.1e+07,2.2e+07,22500000,2.3e+07,2.4e+07,
2.5e+07,25500000,2.6e+07,2.7e+07,27220000,2.8e+07,2.9e+07,
3e+07,30300000,3.1e+07,31500000,3.2e+07,32500000,3.3e+07,
3.4e+07,3.5e+07,3.6e+07,3.7e+07,3.8e+07,38600000,3.9e+07,
39200000,4e+07,4.1e+07,4.2e+07,4.3e+07,4.4e+07,44500000,
4.5e+07,4.6e+07,4.7e+07,4.8e+07,4.9e+07,49900000,5e+07,
50100000,50200000,5.2e+07,5.3e+07,5.5e+07,5.6e+07,5.7e+07,
5.8e+07,58800000,6e+07,6.1e+07,6.3e+07,6.5e+07,6.6e+07,
6.8e+07,68005000,6.9e+07,7e+07,7.3e+07,7.4e+07,7.5e+07,
7.6e+07,7.8e+07,7.9e+07,8e+07,81200000,8.2e+07,8.4e+07,
8.5e+07,8.8e+07,9e+07,9.2e+07,9.3e+07,9.4e+07,9.5e+07,
9.9e+07,1e+08,1.02e+08,1.03e+08,1.05e+08,1.08e+08,1.1e+08,
1.12e+08,1.15e+08,1.17e+08,1.2e+08,1.25e+08,1.27e+08,
1.3e+08,1.32e+08,1.35e+08,1.4e+08,1.44e+08,1.45e+08,1.5e+08,
1.55e+08,1.6e+08,1.65e+08,1.7e+08,1.75e+08,1.76e+08,
1.78e+08,1.8e+08,1.85e+08,1.9e+08,1.95e+08,2e+08,2.09e+08,
2.1e+08,2.15e+08,2.2e+08,2.25e+08,2.3e+08,2.45e+08,2.5e+08,
2.6e+08,263700000,6e+08),
val = c(1,1,1,1,2,1,1,1,1,1,4,3,1,2,2,
8,1,4,4,1,1,7,1,11,1,1,6,2,2,1,3,21,1,1,3,
1,3,1,3,1,1,3,1,1,2,1,24,1,6,8,1,3,2,1,13,
1,1,4,1,1,22,3,1,1,1,13,27,1,2,3,2,1,12,1,1,
1,20,2,3,1,2,1,1,44,2,12,1,4,1,1,1,21,1,1,1,
3,1,15,1,1,5,1,1,8,1,2,1,43,1,1,11,1,24,2,
1,15,1,1,2,8,1,1,34,9,16,1,15,1,1,6,1,1,1,55,
3,11,1,4,5,40,1,9,3,1,14,3,38,1,3,1,7,1,2,
3,34,5,6,6,1,1,1,38,1,6,1,3,1,8,1,1,1,1,1,
25,1,1,3,1,11,1,1,5,1,18,4,1,12,2,4,1,2,11,1,
2,9,1,2,2,14,1,1,1,5,1,9,2,1,1,5,1,16,1,1,
3,1,8,1,2,1,8,7,1,8,1,8,4,1,6,14,2,4,6,8,4,
1,2,3,2,5,2,12,1,1,2,1,3,1,2,6,1,1,1)
)
df |>
group_by(num_col_interval = cut_width(num_col, width = 50000000,
dig.lab = 10, closed = "left", boundary = 0)) |>
summarise(across(val, sum)) |>
separate(num_col_interval, into = c("left", "right"), sep = ",") |>
mutate(across(-val, parse_number),
right = if_else(right < max(right), right - 1L, right),
across(-val, ~ format(., scientific = FALSE)),
val = as.integer(val)) |>
unite(num_col_interval, left:right, sep = " - ")
#> # A tibble: 7 × 2
#> num_col_interval val
#> <chr> <int>
#> 1 " 0 - 49999999" 962
#> 2 " 50000000 - 99999999" 164
#> 3 "100000000 - 149999999" 78
#> 4 "150000000 - 199999999" 53
#> 5 "200000000 - 249999999" 23
#> 6 "250000000 - 299999999" 8
#> 7 "550000000 - 600000000" 1
Created on 2022-12-18 with reprex v2.0.2
I have a list of data frames -results1- where the data frames look like this (but with more rows)
names coefficients
1 ..a15.pdf 1.27679608
2 ..a17.pdf 1.05090176
I want to remove the dots before the variables in column 'names', i.e. change "..a15.pdf" to "a15.pdf".
I tried, with no success, different variations of
results1<-lapply(results1, function(x) {gsub("^.{0,2}", "", lapply(x, "[", "names"));x})
First two data frames from the list:
dput(results1[c(1,2)])
list(structure(list(names = c("..a15.pdf", "..a17.pdf", "..a18.pdf",
"..a21.pdf", "..a2TTT.pdf", "..a5.pdf", "..B11.pdf", "..B12.pdf",
"..B13.pdf", "..B22.pdf", "..B24.pdf", "..B4.pdf", "..B7.pdf",
"..B8.pdf", "..cw10-1.pdf", "..cw15-1TTT.pdf", "..cw17-1.pdf",
"..cw18.pdf", "..cw3.pdf", "..cw4.pdf", "..cw7_1TTT.pdf", "..cw13-1.pdf"
), coefficients = c(1.27679607834331, 1.05090175857491, 1.51820192474905,
2.30296037386815, 1.48568731934637, 0.493713103224402, 1.02705905465749,
0.999747360884078, 2.40828101927852, 0.695152132033603, 2.1436001615064,
2.25444037842867, 0.909773940025014, 1.14837173756827, -1.36323271003293,
0.341428535787024, -0.786878348480425, 0.793720472787986, -1.57831038567642,
0.277733503122777, -0.0364645818969112, -18.336668416705)), class = "data.frame", row.names = c(NA,
-22L)), structure(list(names = c("..a15.pdf", "..a17.pdf", "..a18.pdf",
"..a21.pdf", "..a2TTT.pdf", "..a5.pdf", "..B11.pdf", "..B12.pdf",
"..B13.pdf", "..B22.pdf", "..B24.pdf", "..B4.pdf", "..B7.pdf",
"..B8.pdf", "..cw10-1.pdf", "..cw15-1TTT.pdf", "..cw17-1.pdf",
"..cw18.pdf", "..cw3.pdf", "..cw4.pdf", "..cw7_1TTT.pdf", "..cw13-1.pdf"
), coefficients = c(2.096687569578, 2.19826038300833, 1.91814204277357,
0.801448541154512, 2.16169560949165, 1.48585130705963, 0.95126061691997,
1.93116618236938, 1.92555316191766, 1.00560861920225, 2.91129684208931,
2.75687804718002, 1.31164431967781, 2.22449059765255, -1.22629519335285,
1.31168579553008, -17.5786422399896, 1.25323523754693, -0.754445550651364,
0.555577381430987, 0.577850999404076, -34.2662973287062)), class = "data.frame", row.names = c(NA,
-22L)))
You have to escape . in gsub by using \\ two backslashes. This will replace all (any number of) dots preceeding the actual name.
results <- list(structure(list(names = c("..a15.pdf", "..a17.pdf", "..a18.pdf",
"..a21.pdf", "..a2TTT.pdf", "..a5.pdf", "..B11.pdf", "..B12.pdf",
"..B13.pdf", "..B22.pdf", "..B24.pdf", "..B4.pdf", "..B7.pdf",
"..B8.pdf", "..cw10-1.pdf", "..cw15-1TTT.pdf", "..cw17-1.pdf",
"..cw18.pdf", "..cw3.pdf", "..cw4.pdf", "..cw7_1TTT.pdf", "..cw13-1.pdf"
), coefficients = c(1.27679607834331, 1.05090175857491, 1.51820192474905,
2.30296037386815, 1.48568731934637, 0.493713103224402, 1.02705905465749,
0.999747360884078, 2.40828101927852, 0.695152132033603, 2.1436001615064,
2.25444037842867, 0.909773940025014, 1.14837173756827, -1.36323271003293,
0.341428535787024, -0.786878348480425, 0.793720472787986, -1.57831038567642,
0.277733503122777, -0.0364645818969112, -18.336668416705)), class = "data.frame", row.names = c(NA,
-22L)), structure(list(names = c("..a15.pdf", "..a17.pdf", "..a18.pdf",
"..a21.pdf", "..a2TTT.pdf", "..a5.pdf", "..B11.pdf", "..B12.pdf",
"..B13.pdf", "..B22.pdf", "..B24.pdf", "..B4.pdf", "..B7.pdf",
"..B8.pdf", "..cw10-1.pdf", "..cw15-1TTT.pdf", "..cw17-1.pdf",
"..cw18.pdf", "..cw3.pdf", "..cw4.pdf", "..cw7_1TTT.pdf", "..cw13-1.pdf"
), coefficients = c(2.096687569578, 2.19826038300833, 1.91814204277357,
0.801448541154512, 2.16169560949165, 1.48585130705963, 0.95126061691997,
1.93116618236938, 1.92555316191766, 1.00560861920225, 2.91129684208931,
2.75687804718002, 1.31164431967781, 2.22449059765255, -1.22629519335285,
1.31168579553008, -17.5786422399896, 1.25323523754693, -0.754445550651364,
0.555577381430987, 0.577850999404076, -34.2662973287062)), class = "data.frame", row.names = c(NA,
-22L)))
library(tidyverse)
results %>% map(~ .x %>% mutate(names = gsub('^\\.*(.*)$', '\\1', names)))
#> [[1]]
#> names coefficients
#> 1 a15.pdf 1.27679608
#> 2 a17.pdf 1.05090176
#> 3 a18.pdf 1.51820192
#> 4 a21.pdf 2.30296037
#> 5 a2TTT.pdf 1.48568732
#> 6 a5.pdf 0.49371310
#> 7 B11.pdf 1.02705905
#> 8 B12.pdf 0.99974736
#> 9 B13.pdf 2.40828102
#> 10 B22.pdf 0.69515213
#> 11 B24.pdf 2.14360016
#> 12 B4.pdf 2.25444038
#> 13 B7.pdf 0.90977394
#> 14 B8.pdf 1.14837174
#> 15 cw10-1.pdf -1.36323271
#> 16 cw15-1TTT.pdf 0.34142854
#> 17 cw17-1.pdf -0.78687835
#> 18 cw18.pdf 0.79372047
#> 19 cw3.pdf -1.57831039
#> 20 cw4.pdf 0.27773350
#> 21 cw7_1TTT.pdf -0.03646458
#> 22 cw13-1.pdf -18.33666842
#>
#> [[2]]
#> names coefficients
#> 1 a15.pdf 2.0966876
#> 2 a17.pdf 2.1982604
#> 3 a18.pdf 1.9181420
#> 4 a21.pdf 0.8014485
#> 5 a2TTT.pdf 2.1616956
#> 6 a5.pdf 1.4858513
#> 7 B11.pdf 0.9512606
#> 8 B12.pdf 1.9311662
#> 9 B13.pdf 1.9255532
#> 10 B22.pdf 1.0056086
#> 11 B24.pdf 2.9112968
#> 12 B4.pdf 2.7568780
#> 13 B7.pdf 1.3116443
#> 14 B8.pdf 2.2244906
#> 15 cw10-1.pdf -1.2262952
#> 16 cw15-1TTT.pdf 1.3116858
#> 17 cw17-1.pdf -17.5786422
#> 18 cw18.pdf 1.2532352
#> 19 cw3.pdf -0.7544456
#> 20 cw4.pdf 0.5555774
#> 21 cw7_1TTT.pdf 0.5778510
#> 22 cw13-1.pdf -34.2662973
Created on 2021-06-04 by the reprex package (v2.0.0)
Solution with tidyverse
library(purrr)
library(dplyr)
library(stringr)
map(results1, ~.x[]%>%
mutate(names = str_replace_all(names,"\\.\\.", "")))
[[1]]
names coefficients
1 a15.pdf 1.27679608
2 a17.pdf 1.05090176
3 a18.pdf 1.51820192
4 a21.pdf 2.30296037
5 a2TTT.pdf 1.48568732
6 a5.pdf 0.49371310
7 B11.pdf 1.02705905
8 B12.pdf 0.99974736
9 B13.pdf 2.40828102
10 B22.pdf 0.69515213
This worked for me:
lapply(your_list, function(df) dplyr::mutate(df, column = gsub(x = column, pattern = "pattern", replacement = "replacement")))
your_list- the list containing dataframes
column- the variable inside the dataframes where you want to do the gsub
In this way I found that the code did not change the class of the other variables in my dataframes