I have a data frame that I group by grp:
df <- data.frame(
  v = rnorm(25),
  grp = c(rep("A", 10), rep("B", 15)),
  size = 2)
I want to label each row with the interval it belongs to, where size determines the number of intervals within each group. For example, for grp == "A", size is 2 and the number of rows is 10, so each interval should have length 10 / 2 = 5. This code, however, creates intervals of length 2:
df %>%
  group_by(grp) %>%
  mutate(interval = (row_number() - 1) %/% size)
# A tibble: 25 × 4
# Groups: grp [2]
v grp size interval
<dbl> <chr> <dbl> <dbl>
1 -0.166 A 2 0
2 -1.12 A 2 0
3 0.941 A 2 1
4 -0.913 A 2 1
5 0.486 A 2 2
6 -1.80 A 2 2
7 -0.370 A 2 3
8 -0.209 A 2 3
9 -0.661 A 2 4
10 -0.177 A 2 4
# … with 15 more rows
How can I flag the correct run-length of the size-determined intervals? The desired output is this:
# A tibble: 25 × 4
# Groups: grp [2]
v grp size interval
<dbl> <chr> <dbl> <dbl>
1 -0.166 A 2 0
2 -1.12 A 2 0
3 0.941 A 2 0
4 -0.913 A 2 0
5 0.486 A 2 0
6 -1.80 A 2 1
7 -0.370 A 2 1
8 -0.209 A 2 1
9 -0.661 A 2 1
10 -0.177 A 2 1
# … with 15 more rows
If I have interpreted your question correctly, this small change should do the trick:
df %>%
  group_by(grp) %>%
  mutate(interval = (row_number() - 1) %/% (n() / size))
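As a quick sanity check of the arithmetic for group A (n() = 10 rows, size = 2): integer division by n()/size = 5 splits the row indices into two blocks of five.
(0:9) %/% (10 / 2)
#> [1] 0 0 0 0 0 1 1 1 1 1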
You can use gl:
df %>%
  group_by(grp) %>%
  mutate(interval = gl(first(size), ceiling(n() / first(size)))[1:n()])
Output:
# A tibble: 26 × 4
# Groups: grp [2]
v grp size interval
<dbl> <chr> <dbl> <fct>
1 -1.12 A 2 1
2 3.04 A 2 1
3 0.235 A 2 1
4 -0.0333 A 2 1
5 -2.73 A 2 1
6 -0.0998 A 2 1
7 0.976 A 2 2
8 0.414 A 2 2
9 0.912 A 2 2
10 1.98 A 2 2
11 1.17 A 2 2
12 -0.509 B 2 1
13 0.704 B 2 1
14 -0.198 B 2 1
15 -0.538 B 2 1
16 -2.86 B 2 1
17 -0.790 B 2 1
18 0.488 B 2 1
19 2.17 B 2 1
20 0.501 B 2 2
21 0.620 B 2 2
22 -0.966 B 2 2
23 0.163 B 2 2
24 -2.08 B 2 2
25 0.485 B 2 2
26 0.697 B 2 2
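For reference, gl(n, k) builds a factor with n levels, each repeated k times in order, which is why indexing it with [1:n()] yields the interval labels; a minimal illustration with the group-A numbers from the question (size = 2, ceiling(10 / 2) = 5):
gl(2, 5)
#> [1] 1 1 1 1 1 2 2 2 2 2
#> Levels: 1 2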
I have the following data:
library(dplyr)
group_1 <- c(1,1,2,2,1,1,2,2)
group_2 <- c("A","A","A","A","B","B","B","B")
val <- c(sample(8))
xyz <- c(sample(8))
abc <- c(sample(8))
def <- c(sample(8))
ab23 <- c(sample(8))
df <- data.frame(group_1,group_2,val,xyz,abc,def,ab23)
df <- df %>%
  group_by(group_1, group_2) %>%
  mutate(val_per = val / sum(val, na.rm = TRUE),
         xyz_per = xyz / sum(xyz, na.rm = TRUE),
         abc_per = abc / sum(abc, na.rm = TRUE),
         def_per = def / sum(def, na.rm = TRUE),
         ab23_per = ab23 / sum(ab23, na.rm = TRUE))
I don't want to write a separate mutate line to create the percentage for each column. Is there a way to create these new percentage columns automatically for every column?
We can also do this:
library(dplyr)
library(purrr)
df %>%
  bind_cols(df %>%
              select(!starts_with("group")) %>%
              map_dfc(~ .x / sum(.x)) %>%
              set_names(paste(names(.), "_per", sep = "")))
group_1 group_2 val xyz abc def ab23 val_per xyz_per abc_per def_per ab23_per
1 1 A 3 4 1 4 5 0.08333333 0.11111111 0.02777778 0.11111111 0.13888889
2 1 A 2 2 6 8 2 0.05555556 0.05555556 0.16666667 0.22222222 0.05555556
3 2 A 8 8 7 3 3 0.22222222 0.22222222 0.19444444 0.08333333 0.08333333
4 2 A 5 7 8 5 6 0.13888889 0.19444444 0.22222222 0.13888889 0.16666667
5 1 B 6 5 4 2 4 0.16666667 0.13888889 0.11111111 0.05555556 0.11111111
6 1 B 4 1 5 7 8 0.11111111 0.02777778 0.13888889 0.19444444 0.22222222
7 2 B 7 6 2 6 7 0.19444444 0.16666667 0.05555556 0.16666667 0.19444444
8 2 B 1 3 3 1 1 0.02777778 0.08333333 0.08333333 0.02777778 0.02777778
You can do this with across():
library(dplyr)
df %>%
  group_by(group_1, group_2) %>%
  mutate(across(.fns = prop.table, .names = '{col}_per')) %>%
  ungroup
# group_1 group_2 val xyz abc def ab23 val_per xyz_per abc_per
# <dbl> <chr> <int> <int> <int> <int> <int> <dbl> <dbl> <dbl>
#1 1 A 4 5 2 3 1 0.667 0.714 0.222
#2 1 A 2 2 7 6 3 0.333 0.286 0.778
#3 2 A 8 4 3 7 7 0.889 0.364 0.429
#4 2 A 1 7 4 1 5 0.111 0.636 0.571
#5 1 B 5 6 5 2 8 0.455 0.857 0.455
#6 1 B 6 1 6 5 6 0.545 0.143 0.545
#7 2 B 7 8 8 4 2 0.7 0.727 0.889
#8 2 B 3 3 1 8 4 0.3 0.273 0.111
# … with 2 more variables: def_per <dbl>, ab23_per <dbl>
prop.table(x) is the same as x/sum(x).
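A quick illustration of that equivalence:
prop.table(c(1, 3))
#> [1] 0.25 0.75
c(1, 3) / sum(c(1, 3))
#> [1] 0.25 0.75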
Using proportions
library(dplyr)
df %>%
  group_by(across(starts_with('group'))) %>%
  mutate(across(everything(), proportions, .names = "{col}_per")) %>%
  ungroup
Output:
# A tibble: 8 x 12
group_1 group_2 val xyz abc def ab23 val_per xyz_per abc_per def_per ab23_per
<dbl> <chr> <int> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 A 2 2 3 2 7 0.286 0.2 0.273 0.286 0.7
2 1 A 5 8 8 5 3 0.714 0.8 0.727 0.714 0.3
3 2 A 4 1 2 8 1 0.364 0.2 0.667 0.667 0.333
4 2 A 7 4 1 4 2 0.636 0.8 0.333 0.333 0.667
5 1 B 6 3 5 7 8 0.857 0.3 0.556 0.875 0.571
6 1 B 1 7 4 1 6 0.143 0.7 0.444 0.125 0.429
7 2 B 8 6 7 3 4 0.727 0.545 0.538 0.333 0.444
8 2 B 3 5 6 6 5 0.273 0.455 0.462 0.667 0.556
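If I recall correctly, proportions() is just the newer base-R name for prop.table() (added around R 4.0.1), so within each group it returns the same values as the prop.table() answer above:
x <- c(2, 5, 3)
identical(proportions(x), prop.table(x))  # same function under the hood
#> [1] TRUE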
I observe 12 responses from each of 2 survey participants.
data = data.frame(
  id = c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2),
  response = c(2,2,3,3,6,3,6,7,3,1,4,3,3,3,6,4,2,6,7,3,2,1,5,6))
data
id response
1 1 2
2 1 2
3 1 3
4 1 3
5 1 6
6 1 3
7 1 6
8 1 7
9 1 3
10 1 1
11 1 4
12 1 3
13 2 3
14 2 3
15 2 6
16 2 4
17 2 2
18 2 6
19 2 7
20 2 3
21 2 2
22 2 1
23 2 5
24 2 6
Now I want to add 2 things to the data of each survey participant:
a) The most frequent value of this survey participant
b) the relative frequency of the most frequent value
How can I add these things using dplyr:
data %>%
  group_by(id) %>%
  mutate(most_frequent_value = ?,
         relative_frequency_of_most_frequent_value = ?)
I'd probably use a two-step solution: first, create a data frame of frequencies/relative frequencies, then join to it. We use slice(which.max()) because it always returns a single row, whereas slice_max() may return multiple rows in case of ties.
library(tidyverse)
# count by id, response, calculate rel frequency
# rename columns to make inner_join easier
freq_table <- data %>%
  count(id, response) %>%
  group_by(id) %>%
  mutate(rel_freq = n / sum(n)) %>%
  select(id, most_frequent_response = response, rel_freq)
# inner join to sliced freq_table (grouping by id is preserved)
data %>%
  inner_join(freq_table %>% slice(which.max(rel_freq)))
# id response most_frequent_response rel_freq
# 1 1 2 3 0.4166667
# 2 1 2 3 0.4166667
# 3 1 3 3 0.4166667
# 4 1 3 3 0.4166667
# 5 1 6 3 0.4166667
# ...
You could try:
table(data$id, data$response) %>%
  as.data.frame() %>%
  setNames(c("id", "response", "n")) %>%
  group_by(id) %>%
  slice_max(n, n = 1) %>%
  group_by(response) %>%
  filter(n() > 1) %>%
  mutate(ratio = c(n[1] / sum(n), n[2] / sum(n)))
#> # A tibble: 2 x 4
#> # Groups: response [1]
#> id response n ratio
#> <fct> <fct> <int> <dbl>
#> 1 1 3 5 0.625
#> 2 2 3 3 0.375
Does this work:
data %>%
  group_by(id, response) %>%
  mutate(n = n()) %>%
  ungroup() %>%
  group_by(id) %>%
  mutate(most_frequent_value = response[n == max(n)][1],
         relative_frequency_of_most_frequent_value = max(n) / n())
# A tibble: 24 x 5
# Groups: id [2]
id response n most_frequent_value relative_frequency_of_most_frequent_value
<dbl> <dbl> <int> <dbl> <dbl>
1 1 2 2 3 0.417
2 1 2 2 3 0.417
3 1 3 5 3 0.417
4 1 3 5 3 0.417
5 1 6 2 3 0.417
6 1 3 5 3 0.417
7 1 6 2 3 0.417
8 1 7 1 3 0.417
9 1 3 5 3 0.417
10 1 1 1 3 0.417
11 1 4 1 3 0.417
12 1 3 5 3 0.417
13 2 3 3 3 0.25
14 2 3 3 3 0.25
15 2 6 3 3 0.25
16 2 4 1 3 0.25
17 2 2 2 3 0.25
18 2 6 3 3 0.25
19 2 7 1 3 0.25
20 2 3 3 3 0.25
21 2 2 2 3 0.25
22 2 1 1 3 0.25
23 2 5 1 3 0.25
24 2 6 3 3 0.25
I want to automatically add one or more rows to an existing tibble depending on the values present in one of the tibble columns.
Data
A B C D E
1 1 1 1 5 7.81
2 1 1 1 4 13.12
3 1 1 1 5 3.39
4 1 1 1 4 3.28
5 1 1 1 5 2.69
6 1 1 1 2 5.70
7 1 1 1 1 8.22
Expected Output if '3' is missing from 'D'
A B C D E
1 1 1 1 5 7.81
2 1 1 1 4 13.12
3 1 1 1 5 3.39
4 1 1 1 4 3.28
5 1 1 1 5 2.69
6 1 1 1 2 5.70
7 1 1 1 1 8.22
8 1 1 1 3 0.00
In column D the values should range from 1 to 5, depending on the data set.
I want to identify whether one or more numbers from this range are missing from column D. If one value (e.g. 3) is missing, I want to add a new row that copies the data in columns A:C and enters 3 in column D and 0 in column E. If two or more values are missing (e.g. 3 and 4), I want to add two rows, and so on.
You can use tidyr::complete:
library(tidyverse)
Data %>%
  complete(nesting(A, B, C), D = seq(min(D), max(D), 1L))
#> # A tibble: 8 x 5
#> A B C D E
#> <int> <int> <int> <int> <dbl>
#> 1 1 1 1 1 8.22
#> 2 1 1 1 2 5.7
#> 3 1 1 1 3 NA
#> 4 1 1 1 4 13.1
#> 5 1 1 1 4 3.28
#> 6 1 1 1 5 7.81
#> 7 1 1 1 5 3.39
#> 8 1 1 1 5 2.69
I'd recommend keeping NA rather than 0, but if you want to add the new rows at the bottom and have them as 0, then this works:
Data %>%
  complete(nesting(A, B, C), D = seq(min(D), max(D), 1L)) %>%
  arrange(is.na(E)) %>%
  mutate(E = replace_na(E, 0))
#> # A tibble: 8 x 5
#> A B C D E
#> <int> <int> <int> <int> <dbl>
#> 1 1 1 1 1 8.22
#> 2 1 1 1 2 5.7
#> 3 1 1 1 4 13.1
#> 4 1 1 1 4 3.28
#> 5 1 1 1 5 7.81
#> 6 1 1 1 5 3.39
#> 7 1 1 1 5 2.69
#> 8 1 1 1 3 0
Created on 2019-06-20 by the reprex package (v0.3.0)
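If I'm reading the tidyr docs correctly, complete() also takes a fill argument, so the NA-to-0 replacement can be folded into the same call (the new row then appears in sorted D order rather than at the bottom):
Data %>%
  complete(nesting(A, B, C), D = seq(min(D), max(D), 1L),
           fill = list(E = 0))  # use 0 instead of NA in E for the added rows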
I came across a problem that forced me to use a loop instead of my preferred dplyr pipe flow.
I want to group rows based on consecutive observations of the same value.
For example, if the first four observations of type equal a, those four observations should be assigned to the same group. Order matters, so I can't simply use dplyr::group_by and dplyr::summarize.
The code below should explain the problem fairly well. I was wondering if anyone could propose a less verbose way to do this, preferably using tidyverse packages, and not data.tables.
library(tidyverse)
# Create some test data
df <- tibble(
  id = 1:20,
  type = c(rep("a", 5), rep("b", 5), rep("a", 5), rep("b", 5)),
  val = runif(20)
)
df
#> # A tibble: 20 x 3
#> id type val
#> <int> <chr> <dbl>
#> 1 1 a 0.0606
#> 2 2 a 0.501
#> 3 3 a 0.974
#> 4 4 a 0.0833
#> 5 5 a 0.752
#> 6 6 b 0.0450
#> 7 7 b 0.367
#> 8 8 b 0.649
#> 9 9 b 0.846
#> 10 10 b 0.896
#> 11 11 a 0.178
#> 12 12 a 0.295
#> 13 13 a 0.206
#> 14 14 a 0.233
#> 15 15 a 0.851
#> 16 16 b 0.179
#> 17 17 b 0.801
#> 18 18 b 0.326
#> 19 19 b 0.269
#> 20 20 b 0.584
# Solve problem with a loop
count <- 1
df$consec_group <- NA
for (i in 1:nrow(df)) {
  current <- df$type[i]
  lag <- ifelse(i == 1, NA, df$type[i - 1])
  lead <- ifelse(i == nrow(df), NA, df$type[i + 1])
  if (lead %>% is.na) {
    df$consec_group[i] <- ifelse(current == lag, count, count + 1)
  } else {
    df$consec_group[i] <- count
    if (current != lead) count <- count + 1
  }
}
df
#> # A tibble: 20 x 4
#> id type val consec_group
#> <int> <chr> <dbl> <dbl>
#> 1 1 a 0.0606 1
#> 2 2 a 0.501 1
#> 3 3 a 0.974 1
#> 4 4 a 0.0833 1
#> 5 5 a 0.752 1
#> 6 6 b 0.0450 2
#> 7 7 b 0.367 2
#> 8 8 b 0.649 2
#> 9 9 b 0.846 2
#> 10 10 b 0.896 2
#> 11 11 a 0.178 3
#> 12 12 a 0.295 3
#> 13 13 a 0.206 3
#> 14 14 a 0.233 3
#> 15 15 a 0.851 3
#> 16 16 b 0.179 4
#> 17 17 b 0.801 4
#> 18 18 b 0.326 4
#> 19 19 b 0.269 4
#> 20 20 b 0.584 4
Created on 2019-03-14 by the reprex package (v0.2.1)
This grouping of consecutive type occurrences is really just an intermediate step. My endgame is to manipulate val for a given consec_group, based on the values of val that occurred within the previous consec_group. Advice on relevant packages would be appreciated.
You say "no data.tables", but are you sure? It's so *** fast and easy (in this case)...
library(data.table)
setDT(df)[, groupid := rleid(type)][]
# id type val groupid
# 1: 1 a 0.624078793 1
# 2: 2 a 0.687361541 1
# 3: 3 a 0.817702740 1
# 4: 4 a 0.669857208 1
# 5: 5 a 0.100977936 1
# 6: 6 b 0.418275823 2
# 7: 7 b 0.660119857 2
# 8: 8 b 0.876015209 2
# 9: 9 b 0.473562143 2
# 10: 10 b 0.284474633 2
# 11: 11 a 0.034154862 3
# 12: 12 a 0.391760387 3
# 13: 13 a 0.383107868 3
# 14: 14 a 0.729583433 3
# 15: 15 a 0.006288375 3
# 16: 16 b 0.530179235 4
# 17: 17 b 0.802643704 4
# 18: 18 b 0.409618633 4
# 19: 19 b 0.309363642 4
# 20: 20 b 0.021918512 4
If you insist on using the tidyverse/dplyr, you can (of course) still use the rleid() function as follows:
df %>% mutate( groupid = data.table::rleid(type) )
Benchmarks on a larger sample:
library(tidyverse)
library(data.table)
# Create some large test data
df <- tibble(
  id = 1:200000,
  type = sample(letters[1:26], 200000, replace = TRUE),
  val = runif(200000)
)
dt <- as.data.table(df)
microbenchmark::microbenchmark(
  dplyr.rleid = df %>% mutate(groupid = data.table::rleid(type)),
  data.table.rleid = dt[, groupid := rleid(type)][],
  rle = df %>% mutate(ID_rleid = {ID_rleid = rle(type); rep(seq_along(ID_rleid$lengths), ID_rleid$lengths)}),
  rle2 = df %>% mutate(ID_rleid = with(rle(type), rep(seq_along(lengths), lengths))),
  transform = transform(df, ID = with(rle(df$type), rep(seq_along(lengths), lengths))),
  times = 10)
# Unit: milliseconds
# expr min lq mean median uq max neval
# dplyr.rleid 3.153626 3.278049 3.410363 3.444949 3.502792 3.582626 10
# data.table.rleid 2.965639 3.065959 3.173992 3.145643 3.259672 3.507009 10
# rle 13.059774 14.042797 24.364176 26.126176 29.460561 36.874054 10
# rle2 12.641319 13.553846 30.951152 24.698338 34.139786 102.791719 10
# transform 12.330717 22.419128 22.725242 25.532084 26.187634 26.702794 10
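For completeness (and not benchmarked above), consecutive runs can also be numbered with dplyr alone via a lag()/cumsum() idiom; this is a common pattern rather than part of either answer:
df %>%
  mutate(groupid = cumsum(type != lag(type, default = first(type))) + 1)
# a new run starts whenever type differs from the previous row; + 1 makes the ids start at 1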
You can mimic rleid() with base rle() like this:
df %>%
  mutate(ID_rleid = {ID_rleid = rle(type); rep(seq_along(ID_rleid$lengths), ID_rleid$lengths)})
id type val ID_rleid
<int> <chr> <dbl> <int>
1 1 a 0.0430 1
2 2 a 0.858 1
3 3 a 0.504 1
4 4 a 0.318 1
5 5 a 0.469 1
6 6 b 0.144 2
7 7 b 0.173 2
8 8 b 0.0706 2
9 9 b 0.958 2
10 10 b 0.557 2
11 11 a 0.358 3
12 12 a 0.973 3
13 13 a 0.982 3
14 14 a 0.177 3
15 15 a 0.599 3
16 16 b 0.627 4
17 17 b 0.454 4
18 18 b 0.682 4
19 19 b 0.690 4
20 20 b 0.713 4
Or a modification (originally proposed by @d.b) that makes it a bit more compact:
df %>%
  mutate(ID_rleid = with(rle(type), rep(seq_along(lengths), lengths)))
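To see what rle() contributes here: it returns the run lengths and values, and repeating seq_along(lengths) by lengths numbers each run.
r <- rle(c("a", "a", "a", "b", "b", "a"))
r
#> Run Length Encoding
#>   lengths: int [1:3] 3 2 1
#>   values : chr [1:3] "a" "b" "a"
rep(seq_along(r$lengths), r$lengths)
#> [1] 1 1 1 2 2 3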
I have the following data frame
d2
# A tibble: 10 x 2
ID Count
<int> <dbl>
1 1
2 1
3 1
4 1
5 1
6 2
7 2
8 2
9 3
10 3
It states how many counts each person (ID) had.
I would like to calculate the cumulative percentage of each count: up to 1: 50%, up to 2: 80%, up to 3: 100%.
I tried
> d2 %>% mutate(cum = cumsum(Count)/sum(Count))
# A tibble: 10 x 3
ID Count cum
<int> <dbl> <dbl>
1 1 0.05882353
2 1 0.11764706
3 1 0.17647059
4 1 0.23529412
5 1 0.29411765
6 2 0.41176471
7 2 0.52941176
8 2 0.64705882
9 3 0.82352941
10 3 1.00000000
but this result is obviously incorrect because I would expect that the count of 1 would correspond to 50% rather than 29.4%.
What is wrong here? How do I get the correct answer?
We get the count of 'Count', create 'Cum' by taking the cumulative sum of 'n' and dividing it by the sum of 'n', then right_join with the original data:
d2 %>%
  count(Count) %>%
  mutate(Cum = cumsum(n) / sum(n)) %>%
  select(-n) %>%
  right_join(d2) %>%
  select(names(d2), everything())
# A tibble: 10 x 3
# ID Count Cum
# <int> <int> <dbl>
# 1 1 1 0.500
# 2 2 1 0.500
# 3 3 1 0.500
# 4 4 1 0.500
# 5 5 1 0.500
# 6 6 2 0.800
# 7 7 2 0.800
# 8 8 2 0.800
# 9 9 3 1.00
#10 10 3 1.00
If we need the output as @LAP mentioned:
d2 %>%
  mutate(Cum = row_number() / n())
# ID Count Cum
#1 1 1 0.1
#2 2 1 0.2
#3 3 1 0.3
#4 4 1 0.4
#5 5 1 0.5
#6 6 2 0.6
#7 7 2 0.7
#8 8 2 0.8
#9 9 3 0.9
#10 10 3 1.0
This works:
d2 %>%
  mutate(cum = cumsum(rep(1 / n(), n())))
ID Count cum
1 1 1 0.1
2 2 1 0.2
3 3 1 0.3
4 4 1 0.4
5 5 1 0.5
6 6 2 0.6
7 7 2 0.7
8 8 2 0.8
9 9 3 0.9
10 10 3 1.0
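This is equivalent to the row_number()/n() approach above, since cumulatively summing n() equal weights of 1/n() gives the row position divided by n(); a quick check:
all.equal(cumsum(rep(1 / 10, 10)), (1:10) / 10)
#> [1] TRUE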
One option could be:
library(dplyr)
d2 %>%
  group_by(Count) %>%
  summarise(proportion = n()) %>%
  mutate(Perc = cumsum(100 * proportion / sum(proportion))) %>%
  select(-proportion)
# # A tibble: 3 x 2
# Count Perc
# <int> <dbl>
# 1 1 50.0
# 2 2 80.0
# 3 3 100.0