Add grouped means to dataframe with grouped frequencies and proportions

Add grouped means to dataframe with grouped frequencies and proportions - r

I have this sort of data:
df <- data.frame(
id = sample(1:5, 100, replace = TRUE),
dur = sample(c(NA, rnorm(10)), 100, replace = TRUE),
char = sample(LETTERS, 100, replace = TRUE)
)
From this I can compute counts and proportions of the variable char:
library(dplyr)
df %>%
filter(!is.na(dur) & id != lag(id)) %>%
count(char, name = 'freq', sort = TRUE) %>%
mutate(prop = prop.table(freq) * 100)
char freq prop
1 C 6 8.571429
2 M 6 8.571429
3 X 5 7.142857
4 Y 5 7.142857
5 Z 5 7.142857
6 E 4 5.714286
7 I 4 5.714286
8 K 4 5.714286
9 J 3 4.285714
10 Q 3 4.285714
... clipped
Now, in df, the char values also have duration values. So I want to add another column, say mean_dur, with the mean dur values grouped by char in df. Adding on something like group_by(char) etc. to the above code doesn't work as the variable char is no longer recognized. How can that be achieved?
EDIT:
It can be done in steps, like this:
# Step 1 -- make df with counts and proportions:
df1 <- df %>%
filter(!is.na(dur) & id != lag(id)) %>%
count(char, name = 'freq', sort = TRUE) %>%
mutate(prop = prop.table(freq) * 100)
# Step 2 -- make another df with mean dur values:
df2 <- df %>%
filter(!is.na(dur) & id != lag(id)) %>%
group_by(char) %>%
summarise(mean_dur = mean(dur, na.rm = TRUE))
# Step 3 -- transfer mean dur values by matching `char`in `df1`and `df2`
df1$mean_dur <- df2$mean_dur[match(df1$char, df2$char)]
But is there a cleaner and tidyer dplyr way?
EDIT 2:
Thanks to #Anoushiravan R's solution, from which I picked the left_join idea, this seems like a clean and tidy solution (and it does not require the package janitor):
df %>%
filter(!is.na(dur) & id != lag(id)) %>%
count(char, name = 'freq', sort = TRUE) %>%
mutate(prop = prop.table(freq) * 100) %>%
left_join(df %>%
filter(!is.na(dur) & id != lag(id)) %>%
group_by(char) %>%
summarise(mean_dur = mean(dur)), by = "char")

I hope this is what you are looking for:
library(dplyr)
library(janitor)
df %>%
filter(!is.na(dur) & !id == lag(id)) %>%
tabyl(char) %>%
rename(freq = percent) %>%
mutate(freq = freq * 100) %>%
select(-n) %>%
arrange(desc(freq)) %>%
left_join(df %>%
filter(!is.na(dur) & id != lag(id)) %>%
group_by(char) %>%
summarise(mean_dur = mean(dur)), by = "char")
char freq mean_dur
T 7.894737 -0.4861708
Z 7.894737 -0.2867046
A 6.578947 -0.5056797
B 5.263158 0.3513478
E 5.263158 0.5113139
K 5.263158 -1.4560764
L 5.263158 0.8235192
N 5.263158 0.9037481
X 5.263158 -1.4669529
C 3.947368 -0.4064762
I 3.947368 -0.7722133
P 3.947368 -0.1076928
U 3.947368 0.5573875
Y 3.947368 0.2404896
D 2.631579 0.5942473
F 2.631579 1.2381883
G 2.631579 -0.2155605
J 2.631579 1.0528329
M 2.631579 -1.5482806
O 2.631579 0.2813264
S 2.631579 1.2132490
V 2.631579 0.6157874
H 1.315789 -1.2664754
Q 1.315789 1.1027114
R 1.315789 0.1288634
W 1.315789 1.0528329

If you're prepared to give up prop.table, then I think this gives you what you want...
df %>%
filter(!is.na(dur) & id != lag(id)) %>%
group_by(char) %>%
summarise(
n=n(),
prop = 100*n/nrow(.),
mean_dur=mean(dur, na.rm=TRUE),
.groups="drop"
)
# A tibble: 25 x 4
char n prop mean_dur
* <fct> <int> <dbl> <dbl>
1 A 6 8.82 0.158
2 B 5 7.35 -0.144
3 C 2 2.94 0.951
4 D 2 2.94 0.518
5 E 5 7.35 0.211
6 F 3 4.41 0.333
7 G 2 2.94 0.951
8 H 3 4.41 0.624
9 I 2 2.94 -0.422
10 J 2 2.94 -0.347
# … with 15 more rows
[It took me a while to notice you were working with random data. set.seed() would have been helpful! ;=) ]
Edited in line with comment below

Another option:
mean_dur <- df %>% group_by(char) %>% summarise(mean_dur=mean(dur,na.rm=T))
tab <- df %>%
filter(!is.na(dur) & id != lag(id)) %>%
count(char, name = 'freq') %>%
mutate(prop = prop.table(freq) * 100)
tab <- merge.data.frame(tab,mean_dur)
tab <- tab[order(tab$freq,decreasing = T),]
char freq prop mean_dur
17 R 6 8.108108 -0.75610907
3 D 5 6.756757 -0.61657511
5 F 5 6.756757 -0.34153689
10 K 5 6.756757 -0.90688768
19 T 5 6.756757 0.33628707
6 G 4 5.405405 -0.93390134
9 J 4 5.405405 0.27471673
11 L 4 5.405405 0.87029782
13 N 4 5.405405 0.17163797
16 Q 4 5.405405 -0.67554378
22 X 4 5.405405 -0.42108346
7 H 3 4.054054 0.36290234
14 O 3 4.054054 -0.56712470
15 P 3 4.054054 0.08316665
2 C 2 2.702703 -1.15398142
4 E 2 2.702703 -0.31271923
12 M 2 2.702703 -0.96001502
18 S 2 2.702703 -0.88921047
20 U 2 2.702703 0.24299241
21 W 2 2.702703 -1.32772406
1 A 1 1.351351 0.24299241
8 I 1 1.351351 -1.07336407
23 Z 1 1.351351 -1.07336407

Related

Mutate a dynamic subset of variable

a = tibble(x = runif(1000,0,10),
t = rpois(1000,4)
) %>% arrange(t)
I want a column l that averages the subset of x for the values associated to a t < t(x).
Expected result:
for x[t=0], l = NaN
for x[t=1], l = mean(x[t<1])
for x[t=2], l = mean(x[t<2])
etc.
A code that does not work:
a %>%
mutate(
l = mean(x[a$t < .$t])
) -> a
Now this could would work:
for (i in c(1:1000)) {
a$l[i] = mean(a$x[a$t < a$t[i]])
}
But is not a mutate. I'd like a mutate so I can apply it to groups etc.
To understand better the issue: imagine that you have to average all the x before a date. Now: this, dynamically, in a mutate.
I think that purrr may be necessary but I hate it.

You can use map with mutate:
library(tidyverse)
f <- function(lim) mean(a$x[a$t < lim])
a %>% mutate(l = map_dbl(t, f))
Testing against OP solution:
res <- a %>% mutate(l = map_dbl(t, f))
l <- vector(mode = "numeric", length = 1000)
for (i in c(1:1000)) l[i] = mean(a$x[a$t < a$t[i]])
assertthat::are_equal(res$l, l) # TRUE

For each t value you can calculate average value of x and then calculate lag value of cumulative mean.
library(dplyr)
a %>%
group_by(t) %>%
summarise(l = mean(x)) %>%
mutate(l = lag(cummean(l)))
# t l
# <int> <dbl>
# 1 0 NA
# 2 1 5.33
# 3 2 5.45
# 4 3 5.36
# 5 4 5.26
# 6 5 5.16
# 7 6 5.10
# 8 7 5.07
# 9 8 5.12
#10 9 4.96
#11 10 4.98
#12 11 5.15
#13 12 4.93
If you want to maintain number of rows in the dataframe add %>% left_join(a, by = 't') to the above answer.
data
set.seed(123)
a = tibble(x = runif(1000,0,10),
t = rpois(1000,4)
) %>% arrange(t)

Set collapse for paste function in aggregate

I want to aggregate a data.frame with two columns: in one column I have "num", which is an identifier number and in the other I have text. It is important that the aggregated text has a space between the individual parts. My code is this:
data_aggr <- aggregate(
x = data_aggr,
FUN = paste,
by = list(data_aggr$num)
)
I have tried the obvious with FUN = paste(collapse = " ") and
FUN = paste,
collapse = " ",
but that doesn't work. How do I need to do this?

Aggregate can be used to paste together the rows with the same value of num as follows:
data_aggr <- data.frame(num=c(1,1,1,2,2), letters=letters[1:5])
aggregate(data_aggr$letters, list(data_aggr$num), FUN=paste, collapse= " ")
# Group.1 x
# 1 1 a b c
# 2 2 d e

A dplyr solution, the idea is to create a new column with row number to be able to conduct the operation on each row.
> library(dplyr)
> df.ask <- data.frame('Num' = 1:10,
+ 'Text' = letters[1:10])
>
> df.ask %>%
+ mutate(row_num = row_number()) %>%
+ group_by(row_num) %>%
+ mutate(together = paste(Num, Text, collapse = ' ')) %>%
+ ungroup() %>%
+ select(-row_num)
# A tibble: 10 x 3
Num Text together
<int> <fct> <chr>
1 1 a 1 a
2 2 b 2 b
3 3 c 3 c
4 4 d 4 d
5 5 e 5 e
6 6 f 6 f
7 7 g 7 g
8 8 h 8 h
9 9 i 9 i
10 10 j 10 j

How to multiply by a constant and then sum across rows

I am having an issue multiplying 3 columns by 3 different constants (i.e, 2,3,4, respectively) and then summing each row after applying the conversion.
I am using dplyr
variable <- df %>% transmute(df, sum(col1, col2*2, col3*3, col4*4))

We could do
library(dplyr)
df %>%
mutate(a = a * 2,
b = b * 3,
c = c * 4,
total = a + b + c)
# a b c total
#1 2 18 44 64
#2 4 21 48 73
#3 6 24 52 82
#4 8 27 56 91
#5 10 30 60 100
Using rowSums
df %>%
mutate(a = a * 2,
b = b * 3,
c = c * 4) %>%
mutate(total = rowSums(.))
Important to note that if we are using rowSums, we need to include it in the new mutate call and not the same one otherwise it would sum the original df and not the changed one.
Or in base R
df1 <- transform(df, a = a*2, b = b * 3, c = c *4)
df1$total <- rowSums(df1)
data
df <- data.frame(a = 1:5, b = 6:10, c = 11:15)

In base R, we can do this more compactly with %*%
df$total <- c(as.matrix(df) %*% 2:4)
df
# a b c total
#1 1 6 11 64
#2 2 7 12 73
#3 3 8 13 82
#4 4 9 14 91
#5 5 10 15 100
Or with crossprod
df$total <- c(crossprod(t(df), 2:4))
--
Or with tidyverse
library(tidyverse)
map2(df, 2:4, ~ .x * .y) %>%
reduce(`+`) %>%
bind_cols(df, total = .)
data
df <- data.frame(a = 1:5, b = 6:10, c = 11:15)

variable <- df %>%
rowwise() %>%
mutate(new_var = sum(col1, col2*2, col3*3, col4*4))
Try that instead.
add rowwise() to have data analyzed at each row
use mutate() to get the new calculation

Creating Nodes and Edges Dataframes from Tidy Dataframes

I have a data frame that's of this structure:
df <- data.frame(var1 = c(1,1,1,2,2,3,3,3,3),
cat1 = c("A","B","D","B","C","D","E","B","A"))`
> df
var1 cat1
1 1 A
2 1 B
3 1 D
4 2 B
5 2 C
6 3 D
7 3 E
8 3 B
9 3 A
And I am looking to create both nodes and edges data frames from it, so that I can draw a network graph, using VisNetwork. This network will show the number/strength of connections between the different cat1 values, as grouped by the var1 value.
I have the nodes data frame sorted:
nodes <- data.frame(id = unique(df$cat1))
> nodes
id
1 A
2 B
3 D
4 C
5 E
What I'd like help with is how to process df in the following manner:
for each distinct value of var1 in df, tally up the group of nodes that are common to that value of var1 to give an edges dataframe that ultimately looks like the one below. Note that I'm not bothered about the direction of flow along the edges. Just that they are connected is all I need.
> edges
from to value
1 A B 2
2 A D 2
3 A E 1
4 B C 1
5 B D 2
6 B E 1
7 D E 1
With thanks in anticipation,
Nevil
Update: I found here a similar problem, and have adapted that code to give, which is getting close to what I want, but not quite there...
> df %>% group_by(var1) %>%
filter(n()>=2) %>% group_by(var1) %>%
do(data.frame(t(combn(.$cat1, 2,function(x) sort(x))),
stringsAsFactors=FALSE))
# A tibble: 10 x 3
# Groups: var1 [3]
var1 X1 X2
<dbl> <chr> <chr>
1 1. A B
2 1. A D
3 1. B D
4 2. B C
5 3. D E
6 3. B D
7 3. A D
8 3. B E
9 3. A E
10 3. A B

I don't know if there is already a suitable function to achieve this task. Here is a detailed procedure to do it. Whith this, you should be able to define you own function. Hope it helps!
# create an adjacency matrix
mat <- table(df)
mat <- t(mat) %*% mat
as.table(mat) # look at your adjacency matrix
# since the network is not directed, we can consider only the (strictly) upper triangular matrix
mat[lower.tri(mat, diag = TRUE)] <- 0
as.table(mat) # look at the new adjacency matrix
library(dplyr)
edges <- as.data.frame(as.table(mat))
edges <- filter(edges, Freq != 0)
colnames(edges) <- c("from", "to", "value")
edges <- arrange(edges, from)
edges # output
# from to value
#1 A B 2
#2 A D 2
#3 A E 1
#4 B C 1
#5 B D 2
#6 B E 1
#7 D E 1

here's a couple other ways...
in base R...
values <- unique(df$var1[duplicated(df$var1)])
do.call(rbind,
lapply(values, function(i) {
nodes <- as.character(df$cat1[df$var1 == i])
edges <- combn(nodes, 2)
data.frame(from = edges[1, ],
to = edges[2, ],
value = i,
stringsAsFactors = F)
})
)
in tidyverse...
library(dplyr)
library(tidyr)
df %>%
group_by(var1) %>%
filter(n() >= 2) %>%
mutate(cat1 = as.character(cat1)) %>%
summarise(edges = list(data.frame(t(combn(cat1, 2)), stringsAsFactors = F))) %>%
unnest(edges) %>%
select(from = X1, to = X2, value = var1)
in tidyverse using tidyr::complete...
library(dplyr)
library(tidyr)
df %>%
group_by(var1) %>%
mutate(cat1 = as.character(cat1)) %>%
mutate(i.cat1 = cat1) %>%
complete(cat1, i.cat1) %>%
filter(cat1 < i.cat1) %>%
select(from = cat1, to = i.cat1, value = var1)
in tidyverse using tidyr::expand...
library(dplyr)
library(tidyr)
df %>%
group_by(var1) %>%
mutate(cat1 = as.character(cat1)) %>%
expand(cat1, to = cat1) %>%
filter(cat1 < to) %>%
select(from = cat1, to, value = var1)

How can I replace a factor levels with the top n levels (by number of occurances)

This question is related to How can I replace a factor levels with the top n levels (by some metric), plus [other]?. As a metric I want to use the number of occurrences of the factor. I know I can do it by making a list of the occurrences, but I was wondering if there is a prettier way.
Example:
library(data.table);
library(plyr);
fac <- data.table(score = as.factor(c(3,4,5,3,3,3,5)));
ocCnt <- data.table(lapply(fac,count)$score);
fac$occurrence <- 0;
for(i in 1:length(fac$score)){fac$occurrence[i]<-ocCnt[x==fac$score[i]]$freq};
Then I could use the function described in the referenced question/answer:
hotfactor= function(fac,by,n=10,o="other") {
levels(fac)[rank(-xtabs(by~fac))[levels(fac)]>n] <- o
fac
}
To continue the example, if we want only to see the most popular factor we do:
hotfactor(fac$score,fac$occurrence,1);
To get the answer:
[1] 3 other other 3 3 3 other
Levels: 3 other
So my question is, can I do this without having to add a list which counts the occurrences?
Note that I want to do this for the n most popular factors (not just for the most popular factor).

Use table and which.max:
score <- factor(c(3,4,5,3,3,3,5))
levels(score)[- which.max(table(score))] <- "other"
#[1] 3 other other 3 3 3 other
#Levels: 3 other
Obviously this breaks ties by taking the first maximum value.
If you want to keep the top two levels:
score <- factor(c(3, 4,5,3,3,3,5), levels =c(4,3,5))
levels(score)[!levels(score) %in% names(sort(table(score), decreasing = TRUE)[1:2])] <- "other"
#[1] 3 other 5 3 3 3 5
#Levels: other 3 5

If you don't know how many levels you need to group say, 90% of your data and are willing to use dplyr, you could do something along the following lines:
library(dplyr)
df <- data.frame(
f = factor(mapply(rep, letters[1:5], 2^(1:5)) %>% unlist(use.names = F))
)
df %>%
count(f, sort = T) %>%
mutate(p = cumsum(n) / nrow(df))
# A tibble: 5 x 3
# f n p
# <fctr> <int> <dbl>
# 1 e 32 0.5161290
# 2 d 16 0.7741935
# 3 c 8 0.9032258
# 4 b 4 0.9677419
# 5 a 2 1.0000000
(top <- df %>%
count(f, sort = T) %>%
mutate(p = cumsum(n) / nrow(df)) %>%
filter(cumall(p < .91)) %>%
select(f) %>%
unlist(use.names = F))
# [1] e d c
# Levels: a b c d e
levels(df$f) <- factor(c(levels(df$f), 'z'))
df$f[!df$f %in% top] <- 'z'
df %>%
count(f, sort = T) %>%
mutate(p = cumsum(n) / nrow(df))
# A tibble: 4 x 3
# f n p
# <fctr> <int> <dbl>
# 1 e 32 0.5161290
# 2 d 16 0.7741935
# 3 c 8 0.9032258
# 4 z 6 1.0000000

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Add grouped means to dataframe with grouped frequencies and proportions - r

Related

Mutate a dynamic subset of variable

Set collapse for paste function in aggregate

How to multiply by a constant and then sum across rows

Creating Nodes and Edges Dataframes from Tidy Dataframes

How can I replace a factor levels with the top n levels (by number of occurances)

Categories

Resources