Get most frequently occurring factor level in dplyr piping structure - r

I'd like to be able to find the most frequently occurring level in a factor in a dataset while using dplyr's piping structure. I'm trying to create a new variable that contains the 'modal' factor level when being grouped by another variable.
This is an example of what I'm looking for:
df <- data.frame(cat = stringi::stri_rand_strings(100, 1, '[A-Z]'), num = floor(runif(100, min=0, max=500)))
df <- df %>%
dplyr::group_by(cat) %>%
dplyr::mutate(cat_mode = Mode(num))
Where "Mode" is a function that I'm looking for

Use table to count the items and then use which.max to find out the most frequent one:
df %>%
group_by(cat) %>%
mutate(cat_mode = names(which.max(table(num)))) %>%
head()
# A tibble: 6 x 3
# Groups: cat [4]
# cat num cat_mode
# <fctr> <dbl> <chr>
#1 Q 305 138
#2 W 34.0 212
#3 R 53.0 53
#4 D 395 5
#5 W 212 212
#6 Q 417 138
# ...

similar question to Is there a built-in function for finding the mode?
Mode <- function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
df %>%
group_by(cat) %>%
mutate(cat_mode = Mode(num))
# A tibble: 100 x 3
# Groups: cat [26]
cat num cat_mode
<fct> <dbl> <dbl>
1 S 25 25
2 V 86 478
3 R 335 335
4 S 288 25
5 S 330 25
6 Q 384 384
7 C 313 313
8 H 275 275
9 K 274 274
10 J 75 75
# ... with 90 more rows
To see for each factor
df %>%
group_by(cat) %>%
summarise(cat_mode = Mode(num))
A tibble: 26 x 2
cat cat_mode
<fct> <dbl>
1 A 480
2 B 380
3 C 313
4 D 253
5 E 202
6 F 52
7 G 182
8 H 275
9 I 356
10 J 75
# ... with 16 more rows

Related

Splitting panel data rows

I have a dataset that has rows I would like to split. Is there a simple way to do this?
data = data.frame(id = 111, t1 = 277,t2 = 385, meds = 1)
I am trying to use a conditional to allow me to split rows and create an output similar to this data
data = data.frame(id = 111, t1 = c(277,366),t2 = c(365,385), meds = 1)
I think you can just do a little row-wise summary using dplyr
library(dplyr)
data %>%
rowwise() %>%
summarize(id,
t1 = if(t1 < 365 & t2 > 365) c(t1, 366) else t1,
t2 = if(t1 < 365 & t2 > 365) c(365, t2) else t2,
meds)
#> # A tibble: 2 x 4
#> id t1 t2 meds
#> <dbl> <dbl> <dbl> <dbl>
#> 1 111 277 365 1
#> 2 111 366 385 1
I used group_split function from dplyr:
## Loading the required libraries
library(dplyr)
library(tidyverse)
## Creating the dataframe
df <- data.frame(
t1= c(1:600),
t2= c(200:799)
)
## Conditional Column
df1 = df %>%
mutate(DataframeNo = ifelse(t1<365 & t2>365, "2 dfs","1 df" )) %>%
group_by(DataframeNo)
## Get the first Dataframe
group_split(df1)[[1]]
## Get the second Dataframe
group_split(df1)[[2]]
Output
> group_split(df1)[[1]]
# A tibble: 402 x 3
t1 t2 DataframeNo
<int> <int> <chr>
1 1 200 1 df
2 2 201 1 df
3 3 202 1 df
4 4 203 1 df
5 5 204 1 df
6 6 205 1 df
7 7 206 1 df
8 8 207 1 df
9 9 208 1 df
10 10 209 1 df
# ... with 392 more rows
> ## Get the second Dataframe
> group_split(df1)[[2]]
# A tibble: 198 x 3
t1 t2 DataframeNo
<int> <int> <chr>
1 167 366 2 dfs
2 168 367 2 dfs
3 169 368 2 dfs
4 170 369 2 dfs
5 171 370 2 dfs
6 172 371 2 dfs
7 173 372 2 dfs
8 174 373 2 dfs
9 175 374 2 dfs
10 176 375 2 dfs
# ... with 188 more rows

R Tibble: Arrange by two columns

Let's say I have the following data:
set.seed(123)
test <- tibble(
ID = sample(rep(1:100, rpois(100,4))),
vals = abs(round(rnorm(length(ID), 10000, 5000)))
)
I would like to sort test first by vals and then by ID with the desired output looking like this:
# A tibble: 409 x 2
ID vals
<int> <dbl>
1 48 26522
2 48 14427
3 48 7570
4 48 5922
5 92 25286
6 92 10436
7 92 5705
8 92 4036
9 92 3399
10 64 22190
# ... with 399 more rows
i.e. it should group the sorting by ID and then in decreasing order vals.
What I tried:
test %>% arrange(ID, desc(vals))
test %>% arrange(desc(vals), ID)
test %>% arrange(ID) %>% arrange(desc(vals))
I think you're missing some clarity:
sort first by each ID's maximum value, descending;
Try this:
library(dplyr)
test %>%
mutate(valrank = dense_rank(-vals)) %>%
group_by(ID) %>%
mutate(valrank = min(valrank)) %>%
ungroup() %>%
arrange(valrank, ID, desc(vals))
# # A tibble: 409 x 3
# ID vals valrank
# <int> <dbl> <int>
# 1 48 26522 1
# 2 48 14427 1
# 3 48 7570 1
# 4 48 5922 1
# 5 92 25286 2
# 6 92 10436 2
# 7 92 5705 2
# 8 92 4036 2
# 9 92 3399 2
# 10 64 22190 3
# # ... with 399 more rows
(I kept valrank just for demonstration.)

How to split a dataframe into a list of dataframes based on distinct value ranges

I want to split a dataframe into a list of dataframes based on distinct ranges of a numeric variable.
ILLUSTRATIVE DATA:
set.seed(123)
df <- data.frame(
subject = LETTERS[1:10],
weight = sample(1:1000, 10)
)
df
subject weight
1 A 288
2 B 788
3 C 409
4 D 881
5 E 937
6 F 46
7 G 525
8 H 887
9 I 548
10 J 453
I'd like to have a list of 4 smaller dataframes based on these limits of the variable weight:
limits <- c(250, 500, 750, 1000)
That is, what I'm after, in the list of dataframes, is one dataframe where weight is in the range of 0-250, another where weight ranges between 251-500, another where the range is from 501-750, and so on--in other words, the ranges are distinct.
What I've tried so far is this dyplr solution, which outputs a list of 5 dataframes but with cumulative ranges:
limits <- c(250, 500, 750, 1000)
lapply(limits, function(x) {df %>% filter(weight <= x)})
[[1]]
[1] subject weight
<0 rows> (or 0-length row.names)
[[2]]
subject weight
1 F 46
[[3]]
subject weight
1 A 288
2 C 409
3 F 46
4 J 453
[[4]]
subject weight
1 A 288
2 C 409
3 F 46
4 G 525
5 I 548
6 J 453
[[5]]
subject weight
1 A 288
2 B 788
3 C 409
4 D 881
5 E 937
6 F 46
7 G 525
8 H 887
9 I 548
10 J 453
How could this code be fixed, or which other code can be used, so that a list of dataframes is obtained based on distinct weight ranges?
Perhaps:
library(dplyr)
df %>%
group_split(group = findInterval(weight, limits))
Output:
[4]>
[[1]]
# A tibble: 4 x 3
subject weight group
<fct> <int> <int>
1 C 179 0
2 E 195 0
3 H 118 0
4 J 229 0
[[2]]
# A tibble: 3 x 3
subject weight group
<fct> <int> <int>
1 A 415 1
2 B 463 1
3 I 299 1
[[3]]
# A tibble: 1 x 3
subject weight group
<fct> <int> <int>
1 D 526 2
[[4]]
# A tibble: 2 x 3
subject weight group
<fct> <int> <int>
1 F 938 3
2 G 818 3
Just use keep = FALSE as additional argument to group_split if you want to remove the group column in your output.
A base R one-liner can split the data by limits.
split(df, findInterval(df$weight, limits))
#$`0`
# subject weight
#3 C 179
#5 E 195
#8 H 118
#10 J 229
#
#$`1`
# subject weight
#1 A 415
#2 B 463
#9 I 299
#
#$`2`
# subject weight
#4 D 526
#
#$`3`
# subject weight
#6 F 938
#7 G 818

Create "metadata" field in R

I have a data frame set up similar to this:
id <- c(123,234,123,234)
task <- c(54,23,12,58)
a <- c(23,67,45,89)
b <- c(78,45,65,45)
df <- data.frame(id,task,a,b)
> df
id task a b
1 123 54 23 78
2 234 23 67 45
3 123 12 45 65
4 234 58 89 45
where I score a and b for each ID:
df$score <- rowMeans(subset(df, select = c(3:4)), na.rm = TRUE)
> df
id task a b score
1 123 54 23 78 50.5
2 234 23 67 45 56.0
3 123 12 45 65 55.0
4 234 58 89 45 67.0
for each id I got an aggregate score like such:
out <- ddply(df, 1, summarise,
overall = mean(score, na.rm = TRUE))
> out
id overall
1 123 52.75
2 234 61.50
but what I want my final output to have is a new column that has the scores that went into the overall and their task id like this:
id overall meta
1 123 52.75 "task_scores":[{"54":50.5,"12":55}]
2 234 61.50 "task_scores":[{"23":56,"58":67}]
how would I go about doing that using R?
We could make use of jsonlite to create the structure
library(jsonlite)
library(plyr)
ddply(df, "id", summarise, overall = mean(score, na.rm = TRUE),
meta = paste0('"task_scores":',
toJSON(setNames(as.data.frame.list(score), task))))
# id overall meta
#1 123 52.75 "task_scores":[{"54":50.5,"12":55}]
#2 234 61.50 "task_scores":[{"23":56,"58":67}]
I don't know how to make that metadata dictionary offhand, but you could do something like this:
library(dplyr)
library(magrittr)
out <- df %>% group_by(id) %>% mutate(overall = mean(score))
> out
# A tibble: 4 x 6
# Groups: id [2]
id task a b score overall
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 123 54 23 78 50.5 52.8
2 234 23 67 45 56 61.5
3 123 12 45 65 55 52.8
4 234 58 89 45 67 61.5
So the df would have both the aggregated scores and preserve the data in the original rows.
You can do it with a few mutates. Paste your tallies, get your row average, then your group average.
library(dplyr)
df %>%
mutate(score = rowMeans(subset(., select = c(3:4)), na.rm = TRUE)) %>%
group_by(id) %>%
mutate(overall = mean(score)) %>%
mutate(tally = paste(task, score, sep = ":", collapse = ","))
# A tibble: 4 x 7
# Groups: id [2]
id task a b score overall tally
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 123 54 23 78 50.5 52.8 54:50.5,12:55
2 234 23 67 45 56 61.5 23:56,58:67
3 123 12 45 65 55 52.8 54:50.5,12:55
4 234 58 89 45 67 61.5 23:56,58:67
And to get your desired final output, just select and slice.
df %>%
mutate(score = rowMeans(subset(., select = c(3:4)), na.rm = TRUE)) %>%
group_by(id) %>%
mutate(overall = mean(score)) %>%
mutate(tally = paste(task, score, sep = ":", collapse = ",")) %>%
select(id, overall, tally) %>%
slice(1)
# A tibble: 1 x 3
id overall tally
<dbl> <dbl> <chr>
1 123 52.8 54:50.5,12:55
2 234 61.5 23:56,58:67

Differences between all possible pairs of rows for all columns within each level of factor

I want to build all possible pairs of rows in a dataframe within each level of a categorical variable name and then make the differences of these rows within each level of name for all non-factor variables: row 1 - row 2, row 1 - row 3, …
set.seed(9)
df <- data.frame(
ID = 1:10,
name = as.factor(rep(LETTERS, each = 4)[1:10]),
X1 = sample(1001, 10),
X2 = sample(1001, 10),
bool = sample(c(TRUE, FALSE), 10, replace = TRUE),
fruit = as.factor(sample(c("Apple", "Orange", "Kiwi"), 10, replace = TRUE))
)
This is what the sample looks like:
ID name X1 X2 bool fruit
1 1 A 222 118 FALSE Apple
2 2 A 25 9 TRUE Kiwi
3 3 A 207 883 TRUE Orange
4 4 A 216 301 TRUE Kiwi
5 5 B 443 492 FALSE Apple
6 6 B 134 499 FALSE Kiwi
7 7 B 389 401 TRUE Kiwi
8 8 B 368 972 TRUE Kiwi
9 9 C 665 356 FALSE Apple
10 10 C 985 488 FALSE Kiwi
I want to get a dataframe of 13 rows which looks like :
ID name X1 X2 bool fruit
1 1-2 A 197 109 -1 Apple
2 1-3 A 15 -765 -1 Kiwi
…
Note that the factor fruit should be unchanged. But it is a bonus, I want above all the X1 and X2 to be changed and the factor name to be kept.
I know I may use combn function but I do not see how to do it. I would prefer a solution with the dplyr package and the group_by function.
I've managed to create all differences for consecutives rows with dplyr using
varnotfac <- names(df)[!sapply(df, is.factor )] # remove factorial variable
# but not logical variable
library(dplyr)
diff <- df%>%
group_by(name) %>%
mutate_at(varnotfac, funs(. - lead(.))) %>% #
na.omit()
I could not find out how to keep all variables using filter_if / filter_at so I used select_at. So from #Axeman's answer
set.seed(9)
varnotfac <- names(df)[!sapply(df, is.factor )] # names of non-factorial variables
diff1<- df %>%
group_by(name) %>%
select_at(vars(varnotfac)) %>%
nest() %>%
mutate(data = purrr::map(data, ~as.data.frame(map(.x, ~combn(., 2, base::diff))))) %>%
unnest()
Or with the outer function, it's way faster than combn
set.seed(9)
varnotfac <- names(df)[!sapply(df, is.factor )] # names of non-factorial variables
allpairs <- function(v){
y <- outer(v,v,'-')
z <- y[lower.tri(y)]
return(z)
}
diff2<- df %>%
group_by(name) %>%
select_at(vars(varnotfac)) %>%
nest() %>%
mutate(data = purrr::map(data, ~as.data.frame(map(.x, ~allpairs(.))))) %>%
unnest()
)
One can check that the data.frame obtained are the same with
all.equal(diff1,diff2)
[1] TRUE
My sample looks different...
ID name X1 X2 bool
1 1 A 222 118 FALSE
2 2 A 25 9 TRUE
3 3 A 207 883 TRUE
4 4 A 216 301 TRUE
5 5 B 443 492 FALSE
6 6 B 134 499 FALSE
7 7 B 389 401 TRUE
8 8 B 368 972 TRUE
9 9 C 665 356 FALSE
10 10 C 985 488 FALSE
Using this, and looking here, we can do:
library(dplyr)
library(tidyr)
library(purrr)
df %>%
group_by(name) %>%
nest() %>%
mutate(data = map(data, ~as.data.frame(map(.x, ~as.numeric(dist(.)))))) %>%
unnest()
# A tibble: 13 x 5
name ID X1 X2 bool
<fct> <dbl> <dbl> <dbl> <dbl>
1 A 1 197 109 1
2 A 2 15 765 1
3 A 3 6 183 1
4 A 1 182 874 0
5 A 2 191 292 0
6 A 1 9 582 0
7 B 1 309 7 0
8 B 2 54 91 1
9 B 3 75 480 1
10 B 1 255 98 1
11 B 2 234 473 1
12 B 1 21 571 0
13 C 1 320 132 0
This is unsigned though. Alternatively:
df %>%
group_by(name) %>%
nest() %>%
mutate(data = map(data, ~as.data.frame(map(.x, ~combn(., 2, diff))))) %>%
unnest()
# A tibble: 13 x 5
name ID X1 X2 bool
<fct> <int> <int> <int> <int>
1 A 1 -197 -109 1
2 A 2 -15 765 1
3 A 3 -6 183 1
4 A 1 182 874 0
5 A 2 191 292 0
6 A 1 9 -582 0
7 B 1 -309 7 0
8 B 2 -54 -91 1
9 B 3 -75 480 1
10 B 1 255 -98 1
11 B 2 234 473 1
12 B 1 -21 571 0
13 C 1 320 132 0

Resources