Dynamic creation of percentages for each column in R

I have the following data:
library(dplyr)
group_1 <- c(1,1,2,2,1,1,2,2)
group_2 <- c("A","A","A","A","B","B","B","B")
val <- c(sample(8))
xyz <- c(sample(8))
abc <- c(sample(8))
def <- c(sample(8))
ab23 <- c(sample(8))
df <- data.frame(group_1,group_2,val,xyz,abc,def,ab23)
df <- df %>% group_by(group_1,group_2) %>%
mutate(val_per = val/sum(val,na.rm = TRUE),
xyz_per = xyz/sum(xyz,na.rm = TRUE),
abc_per = abc/sum(abc,na.rm = TRUE),
def_per = def/sum(def,na.rm = TRUE),
ab23_per = ab23/sum(ab23,na.rm = TRUE))
I don't want to mutate new columns individually to create percentages for each column. Is there a way in which new columns are created which have the percentage for each column?

We can also do this:
library(dplyr)
library(purrr)
df %>%
bind_cols(df %>%
select(!starts_with("group")) %>%
map_dfc(~ .x / sum(.x)) %>%
set_names(paste(names(.), "_per", sep = "")))
group_1 group_2 val xyz abc def ab23 val_per xyz_per abc_per def_per ab23_per
1 1 A 3 4 1 4 5 0.08333333 0.11111111 0.02777778 0.11111111 0.13888889
2 1 A 2 2 6 8 2 0.05555556 0.05555556 0.16666667 0.22222222 0.05555556
3 2 A 8 8 7 3 3 0.22222222 0.22222222 0.19444444 0.08333333 0.08333333
4 2 A 5 7 8 5 6 0.13888889 0.19444444 0.22222222 0.13888889 0.16666667
5 1 B 6 5 4 2 4 0.16666667 0.13888889 0.11111111 0.05555556 0.11111111
6 1 B 4 1 5 7 8 0.11111111 0.02777778 0.13888889 0.19444444 0.22222222
7 2 B 7 6 2 6 7 0.19444444 0.16666667 0.05555556 0.16666667 0.19444444
8 2 B 1 3 3 1 1 0.02777778 0.08333333 0.08333333 0.02777778 0.02777778

You can do this with across -
library(dplyr)
df %>%
group_by(group_1,group_2) %>%
mutate(across(.fns = prop.table, .names = '{col}_per')) %>%
ungroup
# group_1 group_2 val xyz abc def ab23 val_per xyz_per abc_per
# <dbl> <chr> <int> <int> <int> <int> <int> <dbl> <dbl> <dbl>
#1 1 A 4 5 2 3 1 0.667 0.714 0.222
#2 1 A 2 2 7 6 3 0.333 0.286 0.778
#3 2 A 8 4 3 7 7 0.889 0.364 0.429
#4 2 A 1 7 4 1 5 0.111 0.636 0.571
#5 1 B 5 6 5 2 8 0.455 0.857 0.455
#6 1 B 6 1 6 5 6 0.545 0.143 0.545
#7 2 B 7 8 8 4 2 0.7 0.727 0.889
#8 2 B 3 3 1 8 4 0.3 0.273 0.111
# … with 2 more variables: def_per <dbl>, ab23_per <dbl>
prop.table(x) is the same as x/sum(x).

Using proportions
library(dplyr)
df %>%
group_by(across(starts_with('group'))) %>%
mutate(across(everything(), proportions, .names = "{col}_per")) %>%
ungroup
Output:
# A tibble: 8 x 12
group_1 group_2 val xyz abc def ab23 val_per xyz_per abc_per def_per ab23_per
<dbl> <chr> <int> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 A 2 2 3 2 7 0.286 0.2 0.273 0.286 0.7
2 1 A 5 8 8 5 3 0.714 0.8 0.727 0.714 0.3
3 2 A 4 1 2 8 1 0.364 0.2 0.667 0.667 0.333
4 2 A 7 4 1 4 2 0.636 0.8 0.333 0.333 0.667
5 1 B 6 3 5 7 8 0.857 0.3 0.556 0.875 0.571
6 1 B 1 7 4 1 6 0.143 0.7 0.444 0.125 0.429
7 2 B 8 6 7 3 4 0.727 0.545 0.538 0.333 0.444
8 2 B 3 5 6 6 5 0.273 0.455 0.462 0.667 0.556

Related

Percentage of values on Mode per group in R

I have the following dataframe called df (dput below):
group value
1 A 4
2 A 2
3 A 4
4 A 3
5 A 1
6 A 5
7 B 3
8 B 2
9 B 1
10 B 2
11 B 2
12 B 2
I would like to calculate the percentage of values on the mode value per group. Here is the code to calculate the mode per group:
# Mode function
mode <- function(codes){
which.max(tabulate(codes))
}
library(dplyr)
# Calculate mode per group
df %>%
group_by(group) %>%
mutate(mode_value = mode(value))
#> # A tibble: 12 × 3
#> # Groups: group [2]
#> group value mode_value
#> <chr> <dbl> <int>
#> 1 A 4 4
#> 2 A 2 4
#> 3 A 4 4
#> 4 A 3 4
#> 5 A 1 4
#> 6 A 5 4
#> 7 B 3 2
#> 8 B 2 2
#> 9 B 1 2
#> 10 B 2 2
#> 11 B 2 2
#> 12 B 2 2
Created on 2022-11-28 with reprex v2.0.2
But I am not sure how to calculate the percentage of values on the mode per group which should look like this:
group value mode_value perc_on_mode
1 A 4 4 0.33
2 A 2 4 0.33
3 A 4 4 0.33
4 A 3 4 0.33
5 A 1 4 0.33
6 A 5 4 0.33
7 B 3 2 0.67
8 B 2 2 0.67
9 B 1 2 0.67
10 B 2 2 0.67
11 B 2 2 0.67
12 B 2 2 0.67
So I was wondering if anyone knows how to calculate the percentage of values on the mode value per group?
dput of df:
df <- structure(list(group = c("A", "A", "A", "A", "A", "A", "B", "B",
"B", "B", "B", "B"), value = c(4, 2, 4, 3, 1, 5, 3, 2, 1, 2,
2, 2)), class = "data.frame", row.names = c(NA, -12L))
You could try:
df %>%
group_by(group) %>%
mutate(mode_value = mode(value),
perc_on_mode = mean(value == mode_value))
Output:
# A tibble: 12 x 4
# Groups: group [2]
group value mode_value perc_on_mode
<chr> <dbl> <int> <dbl>
1 A 4 4 0.333
2 A 2 4 0.333
3 A 4 4 0.333
4 A 3 4 0.333
5 A 1 4 0.333
6 A 5 4 0.333
7 B 3 2 0.667
8 B 2 2 0.667
9 B 1 2 0.667
10 B 2 2 0.667
11 B 2 2 0.667
12 B 2 2 0.667
By modifying the mode function:
mode <- function(codes){
tab <- tabulate(codes)
mode_value <- which.max(tab)
data.frame(value = codes, mode_value, perc_on_mode = tab[mode_value]/length(codes))
}
# Calculate mode per group
df %>%
group_by(group) %>%
do(mode(.$value))
#> # A tibble: 12 x 4
#> # Groups: group [2]
#> group value mode_value perc_on_mode
#> <chr> <dbl> <int> <dbl>
#> 1 A 4 4 0.333
#> 2 A 2 4 0.333
#> 3 A 4 4 0.333
#> 4 A 3 4 0.333
#> 5 A 1 4 0.333
#> 6 A 5 4 0.333
#> 7 B 3 2 0.667
#> 8 B 2 2 0.667
#> 9 B 1 2 0.667
#> 10 B 2 2 0.667
#> 11 B 2 2 0.667
#> 12 B 2 2 0.667
Or with data.table:
library(data.table)
mode <- function(codes){
tab <- tabulate(codes)
mode_value <- which.max(tab)
list(mode_value, tab[mode_value]/length(codes))
}
setDT(df)[, c("mode_value", "perc_on_mode") := mode(value), group][]
#> group value mode_value perc_on_mode
#> 1: A 4 4 0.3333333
#> 2: A 2 4 0.3333333
#> 3: A 4 4 0.3333333
#> 4: A 3 4 0.3333333
#> 5: A 1 4 0.3333333
#> 6: A 5 4 0.3333333
#> 7: B 3 2 0.6666667
#> 8: B 2 2 0.6666667
#> 9: B 1 2 0.6666667
#> 10: B 2 2 0.6666667
#> 11: B 2 2 0.6666667
#> 12: B 2 2 0.6666667

Flag run-length of grouped intervals

I have a dataframe grouped by grp:
df <- data.frame(
v = rnorm(25),
grp = c(rep("A",10), rep("B",15)),
size = 2)
I want to flag the run-length of intervals determined by size. For example, for grp == "A", size is 2, and the number of rows is 10. So the interval should have length 10/2 = 5. This code, however, creates intervals with length 2:
df %>%
group_by(grp) %>%
mutate(
interval = (row_number() -1) %/% size)
# A tibble: 25 × 4
# Groups: grp [2]
v grp size interval
<dbl> <chr> <dbl> <dbl>
1 -0.166 A 2 0
2 -1.12 A 2 0
3 0.941 A 2 1
4 -0.913 A 2 1
5 0.486 A 2 2
6 -1.80 A 2 2
7 -0.370 A 2 3
8 -0.209 A 2 3
9 -0.661 A 2 4
10 -0.177 A 2 4
# … with 15 more rows
How can I flag the correct run-length of the size-determined intervals? The desired output is this:
# A tibble: 25 × 4
# Groups: grp [2]
v grp size interval
<dbl> <chr> <dbl> <dbl>
1 -0.166 A 2 0
2 -1.12 A 2 0
3 0.941 A 2 0
4 -0.913 A 2 0
5 0.486 A 2 0
6 -1.80 A 2 1
7 -0.370 A 2 1
8 -0.209 A 2 1
9 -0.661 A 2 1
10 -0.177 A 2 1
# … with 15 more rows
If I interpreted your question correctly, this small change should do the trick?
df %>%
group_by(grp) %>%
mutate(
interval = (row_number() -1) %/% (n()/size))
You can use gl:
df %>%
group_by(grp) %>%
mutate(interval = gl(first(size), ceiling(n() / first(size)))[1:n()])
output
# A tibble: 26 × 4
# Groups: grp [2]
v grp size interval
<dbl> <chr> <dbl> <fct>
1 -1.12 A 2 1
2 3.04 A 2 1
3 0.235 A 2 1
4 -0.0333 A 2 1
5 -2.73 A 2 1
6 -0.0998 A 2 1
7 0.976 A 2 2
8 0.414 A 2 2
9 0.912 A 2 2
10 1.98 A 2 2
11 1.17 A 2 2
12 -0.509 B 2 1
13 0.704 B 2 1
14 -0.198 B 2 1
15 -0.538 B 2 1
16 -2.86 B 2 1
17 -0.790 B 2 1
18 0.488 B 2 1
19 2.17 B 2 1
20 0.501 B 2 2
21 0.620 B 2 2
22 -0.966 B 2 2
23 0.163 B 2 2
24 -2.08 B 2 2
25 0.485 B 2 2
26 0.697 B 2 2

Modify value of last element of grouping variable in another vector using dplyr

I want to modify the following dataframe such that all x at ping == 3 are NA:
Data
d <- tibble(id = rep(c(1001, 1002), each = 6),
day = rep(c(1, 1, 1, 2, 2, 2), 2),
ping = rep(1:3, 4),
x = rnorm(12, 5, 4),
y = x*0.3 + rnorm(12))
# A tibble: 12 x 5
id day ping x y
<dbl> <dbl> <int> <dbl> <dbl>
1 1001 1 1 5.63 0.783
2 1001 1 2 7.02 3.41
3 1001 1 3 1.72 1.29
4 1001 2 1 -3.00 0.154
5 1001 2 2 3.08 -0.485
6 1001 2 3 5.34 2.60
7 1002 1 1 1.42 -1.27
8 1002 1 2 1.31 -0.139
9 1002 1 3 6.32 0.524
10 1002 2 1 4.43 -0.878
11 1002 2 2 6.74 3.84
12 1002 2 3 4.79 0.782
Desired Output
# A tibble: 12 x 5
id day ping x y
<dbl> <dbl> <int> <dbl> <dbl>
1 1001 1 1 5.63 0.783
2 1001 1 2 7.02 3.41
3 1001 1 3 NA 1.29
4 1001 2 1 -3.00 0.154
5 1001 2 2 3.08 -0.485
6 1001 2 3 NA 2.60
7 1002 1 1 1.42 -1.27
8 1002 1 2 1.31 -0.139
9 1002 1 3 NA 0.524
10 1002 2 1 4.43 -0.878
11 1002 2 2 6.74 3.84
12 1002 2 3 NA 0.782
How can I do this with dplyr?
d %>%
group_by(day) %>%
mutate(...)
You can do this without grouping. I think based on your new requirements this might help:
library(dplyr)
d %>%
group_by(id, day) %>%
arrange(ping) %>%
mutate(x = ifelse(row_number() == n(), NA, x)) %>%
ungroup() %>%
arrange(id, day)
# A tibble: 12 x 5
id day ping x y
<dbl> <dbl> <int> <dbl> <dbl>
1 1001 1 1 5.19 2.54
2 1001 1 2 0.582 1.06
3 1001 1 3 NA 2.63
4 1001 2 1 7.32 3.16
5 1001 2 2 2.37 -0.104
6 1001 2 3 NA 3.65
7 1002 1 1 0.249 -0.0869
8 1002 1 2 5.61 3.62
9 1002 1 3 NA 1.92
10 1002 2 1 11.5 3.79
11 1002 2 2 5.14 1.85
12 1002 2 3 NA 2.68
For each id value, you can replace the x value with NA where ping has the max value.
library(dplyr)
d %>%
group_by(id) %>%
#group by day as well if you want to consider each day within id differently.
#group_by(id, day) %>%
mutate(x = replace(x, ping == max(ping), NA)) %>%
ungroup
# id day ping x y
# <dbl> <dbl> <int> <dbl> <dbl>
# 1 1001 1 1 9.84 2.41
# 2 1001 1 2 4.37 1.54
# 3 1001 1 3 NA 2.37
# 4 1001 2 1 0.305 -0.537
# 5 1001 2 2 6.96 1.92
# 6 1001 2 3 NA 2.38
# 7 1002 1 1 5.25 1.25
# 8 1002 1 2 13.4 3.51
# 9 1002 1 3 NA 2.75
#10 1002 2 1 7.62 0.896
#11 1002 2 2 5.01 2.00
#12 1002 2 3 NA 2.96

Creating a new column using the previous value of a different column and the previous value of itself

How can I create a new column whose starting value is 1 and whose following values are the product of the previous value of another column (b) and the previous value of itself (d)?
these data are only made up, but have the structure of my data:
> a <- rep(1:10, 3)
> b <- runif(30)
> c <- tibble(a,b)
> c
# A tibble: 30 x 2
a b
<int> <dbl>
1 1 0.945
2 2 0.280
3 3 0.464
4 4 0.245
5 5 0.917
6 6 0.913
7 7 0.144
8 8 0.481
9 9 0.873
10 10 0.754
# ... with 20 more rows
Then I try to calculate column d:
> c <- c %>%
+ group_by(a) %>%
+ mutate(d = accumulate(lag(b, k = 1), `*`, .init = 1))
and it should look like this
# A tibble: 30 x 3
# Groups: a [10]
a b d
<int> <dbl> <dbl>
1 1 0.945 1 <--- b[1] * d[1] = d[2]
2 2 0.280 0.945
3 3 0.464 0.265
4 4 0.245 0.123
5 5 0.917 0.03
#...
But instead I am getting this error message.
Fehler: Column `d` must be length 3 (the group size) or one, not 4
The problem is that when you initialize accumulate with .init = that adds an extra first element of the vector.
You could try this:
library(dplyr)
library(purrr)
c %>%
group_by(a) %>%
mutate(d = accumulate(b[(2:length(b))-1], `*`,.init=1)) %>%
arrange(a)
# a b d
# <int> <dbl> <dbl>
# 1 1 0.266 1
# 2 1 0.206 0.266
# 3 1 0.935 0.0547
# 4 2 0.372 1
# 5 2 0.177 0.372
# … with 25 more rows
Data
library(tibble)
set.seed(1)
a <- rep(1:10, 3)
b <- runif(30)
c <- tibble(a,b)
Using dplyr, I would do this:
c %>%
mutate(d = 1*accumulate(.x = b[-length(b)],
.init = 1,
.f = `*`))
# # A tibble: 30 x 3
# a b d
# <int> <dbl> <dbl>
# 1 1 0.562 1
# 2 2 0.668 0.562
# 3 3 0.100 0.375
# 4 4 0.242 0.0376
# 5 5 0.0646 0.00907
# 6 6 0.373 0.000586
# 7 7 0.664 0.000219
# 8 8 0.915 0.000145
# 9 9 0.848 0.000133
# 10 10 0.952 0.000113
# # ... with 20 more rows

Cumulative percentages in R

I have the following data frame
d2
# A tibble: 10 x 2
ID Count
<int> <dbl>
1 1
2 1
3 1
4 1
5 1
6 2
7 2
8 2
9 3
10 3
Which states how many counts each person (ID) had.
I would like to calculate the cumulative percentage of each count: 1 - 50%, up to 2: 80%, up to 3: 100%.
I tried
> d2 %>% mutate(cum = cumsum(Count)/sum(Count))
# A tibble: 10 x 3
ID Count cum
<int> <dbl> <dbl>
1 1 0.05882353
2 1 0.11764706
3 1 0.17647059
4 1 0.23529412
5 1 0.29411765
6 2 0.41176471
7 2 0.52941176
8 2 0.64705882
9 3 0.82352941
10 3 1.00000000
but this result is obviously incorrect because I would expect that the count of 1 would correspond to 50% rather than 29.4%.
What is wrong here? How do I get the correct answer?
We get the count of 'Count', create the 'Cum' by taking the cumulative sum of 'n' and divide it by the sum of 'n', then right_join with the original data
d2 %>%
count(Count) %>%
mutate(Cum = cumsum(n)/sum(n)) %>%
select(-n) %>%
right_join(d2) %>%
select(names(d2), everything())
# A tibble: 10 x 3
# ID Count Cum
# <int> <int> <dbl>
# 1 1 1 0.500
# 2 2 1 0.500
# 3 3 1 0.500
# 4 4 1 0.500
# 5 5 1 0.500
# 6 6 2 0.800
# 7 7 2 0.800
# 8 8 2 0.800
# 9 9 3 1.00
#10 10 3 1.00
If we need the output as #LAP mentioned
d2 %>%
mutate(Cum = row_number()/n())
# ID Count Cum
#1 1 1 0.1
#2 2 1 0.2
#3 3 1 0.3
#4 4 1 0.4
#5 5 1 0.5
#6 6 2 0.6
#7 7 2 0.7
#8 8 2 0.8
#9 9 3 0.9
#10 10 3 1.0
This works:
d2 %>%
mutate(cum = cumsum(rep(1/n(), n())))
ID Count cum
1 1 1 0.1
2 2 1 0.2
3 3 1 0.3
4 4 1 0.4
5 5 1 0.5
6 6 2 0.6
7 7 2 0.7
8 8 2 0.8
9 9 3 0.9
10 10 3 1.0
One option could be as:
library(dplyr)
d2 %>%
group_by(Count) %>%
summarise(proportion = n()) %>%
mutate(Perc = cumsum(100*proportion/sum(proportion))) %>%
select(-proportion)
# # A tibble: 3 x 2
# Count Perc
# <int> <dbl>
# 1 1 50.0
# 2 2 80.0
# 3 3 100.0

Resources