I am trying to incorporate .drop = F into the following dplyr function:
dspreadN = function(data, ...) {
  data %>%
    group_by_(.dots = lazyeval::lazy_dots(...), .drop = F) %>%
    summarise(n = n()*100) %>%
    spread(value, n, fill = 0)
}
Basically, the function transforms this
id x
1 1 A
2 1 A
3 1 A
4 1 A
5 2 A
6 2 A
7 2 B
8 2 B
9 3 A
10 3 A
11 3 B
12 3 A
into that
id drop A B
<dbl> <lgl> <dbl> <dbl>
1 1 FALSE 400 0
2 2 FALSE 200 200
3 3 FALSE 300 100
I use the function in this way: dff %>% dspreadN(id, value = x)
(my real example is much more complicated, which is why I need the dplyr function).
What I would like is to keep all the levels of the x variable; here the C level is missing.
id A B C
<dbl> <dbl> <dbl> <dbl>
1 1 400 0 0
2 2 200 200 0
3 3 300 100 0
Why is .drop = F not working?
library(tidyverse)
# data
dff = data.frame(id = c(1,1,1,1, 2,2,2,2, 3,3,3,3, 4,4,4,4),
x = c('A','A','A','A', 'A','A','B','B', 'A','A','B','A', 'C', 'C', 'C', 'C'))
# remove the case to keep the C level
dff = dff[dff$id != 4, ]
You can use the .drop = FALSE argument in count instead of group_by.
group_by + summarise with n() is equivalent to count.
spread has been superseded by pivot_wider.
Thanks to #Edo for useful tips on improving the post.
library(dplyr)
library(tidyr)
dspreadN = function(data, ...) {
  data %>%
    count(id, x, .drop = FALSE, wt = n() * 100) %>%
    pivot_wider(names_from = x, values_from = n, values_fill = 0)
}
dspreadN(dff, id, x)
# id A B C
# <dbl> <dbl> <dbl> <dbl>
#1 1 400 0 0
#2 2 200 200 0
#3 3 300 100 0
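For reference, the same idea can be written without hardcoding id and x inside the function body. This is only a sketch (dspreadN2 and spread_col are names made up here, and it assumes x is a factor so that .drop = FALSE can retain the unused C level):
dspreadN2 = function(data, ..., spread_col) {
  data %>%
    # ... are the grouping columns, spread_col is the column to spread into names
    count(..., {{ spread_col }}, .drop = FALSE, wt = n() * 100) %>%
    pivot_wider(names_from = {{ spread_col }}, values_from = n, values_fill = 0)
}
# dspreadN2(dff, id, spread_col = x) should give the same result as above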
Related
I want a function where I can enter different numbers of column names and have them grouped. The first piece of code here works:
df <- data.frame(col_a = sample(1:10, 100, replace = T),
col_b = sample(letters, 100, replace = T),
col_c = sample(LETTERS, 100, replace = T))
my_fun = function(df, ...) {
df %>% group_by_(...) %>% summarise(n = n())
}
my_fun(df , 'col_a')
my_fun(df , 'col_a', 'col_b')
my_fun(df , 'col_a', 'col_b', 'col_c')
What I now want is to apply the complete function, so all possible values in each grouped variable are present. I've manually typed col_a and col_b into the complete() function below. I'd want to pass the possible values as a function argument though, as I'm not always going to be grouping by col_a & col_b.
my_fun = function(df, ...) {
  df %>%
    group_by_(...) %>%
    summarise(count = n()) %>%
    ungroup() %>%
    complete(col_a = 1:10, col_b = letters, fill = list(count = 0))
}
my_fun(df , 'col_a', 'col_b')
You can capture the arguments as a named list. group_by + summarise with n() can be replaced by count.
library(tidyverse)
my_fun = function(df, ...) {
  args <- list(...)
  df %>%
    count(across(all_of(names(args))), name = 'count') %>%
    complete(!!!args, fill = list(count = 0))
}
This can be run as:
my_fun(df , 'col_a' = 1:12)
# col_a count
# <int> <dbl>
# 1 1 9
# 2 2 15
# 3 3 4
# 4 4 11
# 5 5 7
# 6 6 12
# 7 7 12
# 8 8 10
# 9 9 5
#10 10 15
#11 11 0
#12 12 0
my_fun(df , 'col_a' = 1:10, 'col_b' = letters)
# col_a col_b count
# <int> <chr> <dbl>
# 1 1 a 1
# 2 1 b 0
# 3 1 c 0
# 4 1 d 0
# 5 1 e 0
# 6 1 f 1
# 7 1 g 0
# 8 1 h 0
# 9 1 i 0
#10 1 j 0
# … with 250 more rows
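The key step is complete(!!!args, ...): splicing the named list back in with !!! is the same as writing complete(col_a = 1:10, col_b = letters, ...) by hand, so every supplied value gets a row even if it never occurs in the data. A minimal sketch with hypothetical toy data:
toy <- tibble(col_a = c(1, 1, 3)) %>% count(col_a, name = 'count')
args <- list(col_a = 1:3)
# equivalent to complete(toy, col_a = 1:3, fill = list(count = 0))
complete(toy, !!!args, fill = list(count = 0))
# the missing col_a = 2 row is filled in with count = 0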
When summarizing data, some groups may have observations that are not present in another group. In the example below, group 2 has no males. How can I, in a tidy way, insert these observations into a summary table?
data example:
a <- data.frame(gender=factor(c("m", "m", "m", "f", "f", "f", "f")), group=c(1,1,1,1,1,2,2))
gender group
1 m 1
2 m 1
3 m 1
4 f 1
5 f 1
6 f 2
7 f 2
data summary:
a %>% group_by(gender, group) %>% summarise(n=n())
gender group n
<fct> <dbl> <int>
1 f 1 2
2 f 2 2
3 m 1 3
Desired output:
gender group n
<fct> <dbl> <int>
1 f 1 2
2 f 2 2
3 m 1 3
4 m 2 0
At the end, we can use complete
library(dplyr)
library(tidyr)
a %>%
group_by(gender, group) %>%
summarise(n=n(), .groups = 'drop') %>%
complete(gender, group, fill = list(n = 0))
Output:
# A tibble: 4 x 3
# gender group n
# <fct> <dbl> <dbl>
#1 f 1 2
#2 f 2 2
#3 m 1 3
#4 m 2 0
Or another option is to reshape to wide and then back to long format:
a %>%
pivot_wider(names_from = group, values_from = group,
values_fn = length, values_fill = 0) %>%
pivot_longer(cols = -gender, names_to = 'group', values_to = 'n')
It is even easier in base R:
as.data.frame(table(a))
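table(a) tabulates every gender/group combination, including the empty m/2 cell, so converting it back to a data frame gives the zero row for free. A short sketch (the rename and ordering are only cosmetic):
out <- as.data.frame(table(a))
names(out)[3] <- "n"
out[order(out$gender, out$group), ]
# gender group n
#      f     1 2
#      f     2 2
#      m     1 3
#      m     2 0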
I have a dataframe...
df <- tibble(
id = 1:7,
family = c("a","a","b","b","c", "d", "e")
)
Families will only contain 2 members at most (so they're either individuals or pairs).
I need a new column 'random' that assigns the number 1 to families where there is only one member (e.g. c, d and e) and randomly assigns 0 or 1 to families containing 2 members (a and b in the example).
By the end the data should look like the following (depending on the random assignment of 0/1)...
df <- tibble(
id = 1:7,
family = c("a","a","b","b","c", "d", "e"),
random = c(1, 0, 0, 1, 1, 1, 1)
)
I would like to be able to do this with a combination of group_by and mutate since I am mostly using Tidyverse.
I tried the following (but this didn't randomly assign 0/1 within families)...
df %>%
  group_by(family) %>%
  mutate(
    random = if_else(
      condition = n() == 1,
      true = 1,
      false = as.double(sample(0:1, 1, replace = T))
    )
  )
You could sample along the sequence length of the family group and take the answer modulo 2:
df %>%
group_by(family) %>%
mutate(random = sample(seq(n())) %% 2)
#> # A tibble: 7 x 3
#> # Groups: family [5]
#> id family random
#> <int> <chr> <dbl>
#> 1 1 a 0
#> 2 2 a 1
#> 3 3 b 0
#> 4 4 b 1
#> 5 5 c 1
#> 6 6 d 1
#> 7 7 e 1
We can use if/else
library(dplyr)
df %>%
group_by(family) %>%
mutate(random = if(n() == 1) 1 else sample(rep(0:1, length.out = n())))
# A tibble: 7 x 3
# Groups: family [5]
# id family random
# <int> <chr> <dbl>
#1 1 a 0
#2 2 a 1
#3 3 b 1
#4 4 b 0
#5 5 c 1
#6 6 d 1
#7 7 e 1
Another option: for a single-member family 2 - sample(1:1) is always 1, and within a pair 2 - sample(1:2) gives exactly one 1 and one 0.
df %>%
group_by(family) %>%
mutate(random = 2 - sample(1:n()))
# A tibble: 7 x 3
# Groups: family [5]
id family random
# <int> <chr> <dbl>
# 1 1 a 1
# 2 2 a 0
# 3 3 b 1
# 4 4 b 0
# 5 5 c 1
# 6 6 d 1
# 7 7 e 1
Hi all, I have the two datasets below. From these two datasets (dataset1 is formed from dataset2; that is, dataset1 is the count of users from dataset2), can we build the third dataset (expected output)?
dataset1
Apps # user Entries
A 3
B 4
C 6
dataset2
Apps Users
A X
A Y
A Z
B Y
B Y
B Z
B A
C X
C X
C X
C X
C X
C X
Expected output
Apps Entries X Y Z A
A 3 1 1 1
B 4 2 1 1
C 6 6
We can first count for Apps and Users, get the data in wide format and then join with the count of Apps.
library(dplyr)
df %>%
count(Apps, Users) %>%
tidyr::pivot_wider(names_from = Users, values_from = n,
values_fill = list(n = 0)) %>%
left_join(df %>% count(Apps), by = 'Apps')
# Apps X Y Z A n
# <chr> <int> <int> <int> <int> <int>
#1 A 1 1 1 0 3
#2 B 0 2 1 1 4
#3 C 6 0 0 0 6
If showing 0 is not a problem and a different column order is acceptable, you can use table and rowSums to produce the expected output.
x <- table(dataset2)
cbind(Entries=rowSums(x), x)
# Entries A X Y Z
#A 3 0 1 1 1
#B 4 1 0 2 1
#C 6 0 6 0 0
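If the column order matters, note that the cbind() result is a plain matrix, so the columns can simply be reindexed afterwards (a sketch, reusing the x defined above):
out <- cbind(Entries = rowSums(x), x)
out[, c("Entries", "X", "Y", "Z", "A")]
#  Entries X Y Z A
#A       3 1 1 1 0
#B       4 0 2 1 1
#C       6 6 0 0 0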
A solution where you do not have to calculate the Total separately or do joins...
This solution uses purrr::pmap and dplyr::mutate to calculate Total dynamically.
library(tidyverse) # dplyr, tidyr, purrr
df %>% count(Apps, Users) %>%
pivot_wider(id_cols = Apps, names_from = Users, values_from = n, values_fill = list(n = 0)) %>%
mutate(Total = pmap_int(.l = select_if(., is.numeric),
.f = sum))
which gives the output you need:
# A tibble: 3 x 6
Apps X Y Z A Total
<chr> <int> <int> <int> <int> <int>
1 A 1 1 1 0 3
2 B 0 2 1 1 4
3 C 6 0 0 0 6
I have a dataframe df
df <- data.frame(id =c(1,2,1,4,1,5,6),
label=c("a","b", "a", "a","a", "e", "a"),
color = c("g","a","g","g","a","a","a"),
threshold = c(12, 10, 12, 12, 12, 35, 40),
value =c(32.1,0,15.0,10,1,50,45),stringsAsFactors = F
)
The threshold value is based on the label.
I should get a table where, for each id with its respective label, I count how many times the value exceeds its threshold.
Color is not taken into account when calculating the exceed values.
I tried this:
final_df <- df %>%
mutate(check = if_else(value > threshold, 1, 0)) %>%
group_by(id, label) %>%
summarise(exceed = sum(check))
But instead of getting the count for each respective id, I got the overall total in exceed.
With base R only, use aggregate.
aggregate(seq.int(nrow(df)) ~ id + label, df, function(i) sum(df[i, 4] < df[i, 5]))
# id label seq.int(nrow(df))
#1 1 a 2
#2 4 a 0
#3 6 a 1
#4 2 b 0
#5 5 e 1
In order to match the expected output posted in the question, it will take a little extra work.
exceed <- seq.int(nrow(df))
agg <- aggregate(exceed ~ id + label, df, function(i) sum(df[i, 4] < df[i, 5]))
res <- merge(df[1:3], agg)
unique(res)
# id label color exceed
#1 1 a g 2
#3 1 a a 2
#4 2 b a 0
#5 4 a g 0
#6 5 e a 1
#7 6 a a 1
With a small modification of your code:
df %>%
group_by(id, label) %>%
mutate(check = if_else(value > threshold, 1, 0)) %>%
summarise(exceed = sum(check)) %>%
group_by(id, label)
id label exceed
<dbl> <chr> <dbl>
1 1 a 2
2 2 b 0
3 4 a 0
4 5 e 1
5 6 a 1
To match the expected output more closely:
df %>%
group_by(id, label) %>%
mutate(exceed = sum(if_else(value > threshold, 1, 0))) %>%
group_by(id, label, color) %>%
filter(row_number() == 1)
id label color threshold value exceed
<dbl> <chr> <chr> <dbl> <dbl> <dbl>
1 1 a g 12 32.1 2
2 2 b a 10 0 0
3 4 a g 12 10 0
4 1 a a 12 1 2
5 5 e a 35 50 1
6 6 a a 40 45 1
library(dplyr)
df %>%
group_by(id, label) %>%
mutate(exceed = sum(value > threshold)) %>%
slice(1)
id label color threshold value exceed
<dbl> <chr> <chr> <dbl> <dbl> <int>
1 1 a g 12 32.1 2
2 2 b a 10 0 0
3 4 a g 12 10 0
4 5 e a 35 50 1
5 6 a a 40 45 1
If you would like the output to contain a separate row for each combination of id, label and color, just add a new group_by before the slice function:
df %>%
group_by(id, label) %>%
mutate(exceed = sum(value > threshold)) %>%
group_by(id, label, color) %>%
slice(1)
id label color threshold value exceed
<dbl> <chr> <chr> <dbl> <dbl> <int>
1 1 a a 12 1 2
2 1 a g 12 32.1 2
3 2 b a 10 0 0
4 4 a g 12 10 0
5 5 e a 35 50 1
6 6 a a 40 45 1
A little change in your code:
final_df <- df %>%
  mutate(check = if_else(value > threshold, 1, 0)) %>%
  group_by(id, label) %>%
  filter(check == 1)
unique(final_df$id)
We could use table and merge:
table_ <- table(subset(df,value>threshold, c("id","label")))
df2 <- merge(unique(df[c("id","label","color")]),table_,all.x=TRUE)
df2$Freq[is.na(df2$Freq)] <- 0
# id label color Freq
# 1 1 a g 2
# 2 1 a a 2
# 3 2 b a 0
# 4 4 a g 0
# 5 5 e a 1
# 6 6 a a 1