Summarizing and spreading data - r

I have data similar to below :
df=data.frame(
company=c("McD","McD","McD","KFC","KFC"),
Title=c("Crew Member","Manager","Trainer","Crew Member","Manager"),
Manhours=c(12,NA,5,13,10)
)
df
I would wish to manipulate it and obtain the data frame as below:
df=data.frame(
company=c("KFC", "McD"),
Manager=c(1,1),
Surbodinate=c(1,2),
TotalEmp=c(2,3),
TotalHours=c(23,17)
)
I have managed to manipulate and categorise the employees as well as their count as below:
df<- df %>%
mutate(Role = if_else((Title=="Manager" ),
"Manager","Surbodinate"))%>%
count(company, Role) %>%
spread(Role, n, fill=0)%>%
as.data.frame() %>%
mutate(TotalEmp= select(., Manager:Surbodinate) %>%
apply(1, sum, na.rm=TRUE))
Also, I have summarised the man hours as below:
df <- df %>%group_by(company) %>%
summarize(TotalHours = sum(Manhours, na.rm = TRUE))
How would I combine these two steps at once or is there a cleaner/simpler way of getting the desired output?

dplyr solution:
df %>%
mutate(Title = if_else((Title=="Manager" ),
"Manager","Surbodinate")) %>%
group_by(company) %>%
summarise(Manager = sum(Title == "Manager"), Subordinate = sum(Title == "Surbodinate"), TotalEmp = n(), Manhours = sum(Manhours, na.rm = TRUE))
company Manager Subordinate TotalEmp Manhours
<fct> <int> <int> <int> <dbl>
1 KFC 1 1 2 23
2 McD 1 2 3 17

how about something like this:
df %>%
mutate(Role = ifelse(Title=="Manager" ,
"Manager", "Surbodinate"))%>%
group_by(company) %>%
mutate(TotalEmp = n(),
TotalHours = sum(Manhours, na.rm=TRUE)) %>%
reshape2::dcast(company + TotalEmp + TotalHours ~ Role)

This is not tidyverse nor is it a one step process. But if you use data.table you could do:
library(data.table)
# Convert df to a data.table keyed on company. setDT() works by reference and
# returns the converted object, so DT refers to the same table used below.
DT <- setDT(df, key = "company")
# Per-company headcount and total man hours (NA hours are ignored).
totals <- DT[, .(TotalEmp = .N, TotalHours = sum(Manhours, na.rm = TRUE)), by = company]
# Count employees per company and role, then join the totals onto that table.
dcast(DT, company ~ ifelse(Title == "Manager", "Manager", "Surbodinate"))[totals]
# company Manager Surbodinate TotalEmp TotalHours
# 1 KFC 1 1 2 23
# 2 McD 1 2 3 17

Related

Summarize information by group in data table in R

I'm trying to get multiple summary statistics in R grouped by Team. I used code like below, but output is not what I want.
please point me in a better direction. Thanks!
set.seed(77)
data <- data.frame(Team =sample(c("A","B"),30, replace=TRUE),
gender=sample(c("female","male"),30, replace=TRUE),
Age =sample(c(0:100),30, replace=T))
dat <- data %>%
group_by(Team, gender) %>%
dplyr::summarize_all(list(my_mean = mean,
my_sum = sum,
my_sd = sd)) %>%
as.data.frame()
df <- data %>%
group_by(Team) %>%
summarize(total = n(gender),
mean = mean(Age),
Max_Age = max(Age),
Min_Age = min(Age),
sd = sd(Age),
)
I want to get like this pic.
You may need to create the dataframe for the summary statistics of age per Team (age_summary in the example below) and that for the count of Team members per gender and Team (gender_summary in the example below), and then merge them into one dataframe (say summary_df).
library(tidyverse)
set.seed(77)
data <- data.frame(
Team = sample(c("A", "B"), 30, replace = TRUE),
gender = sample(c("female", "male"), 30, replace = TRUE),
Age = sample(c(0:100), 30, replace = T)
)
age_summary <- data %>%
group_by(Team) %>%
summarize(
mean = mean(Age),
Max = max(Age),
Min = min(Age),
sd = sd(Age)
) %>%
column_to_rownames("Team") %>%
t() %>%
as_tibble(
rownames = "age_summary"
)
gender_summary <- data %>%
group_by(Team) %>%
count(gender) %>%
ungroup() %>%
pivot_wider(names_from = Team, values_from = n)
summary_df <- full_join(
age_summary,
gender_summary
) %>%
mutate(
"item" = if_else(
is.na(gender),
"Age",
"Sex"
)
) %>%
unite("summary", c(age_summary, gender), na.rm = TRUE, remove = FALSE) %>%
relocate(item, .before = 1) %>%
select(-c(age_summary, gender))
# # A tibble: 6 × 4
# item summary A B
# <chr> <chr> <dbl> <dbl>
# 1 Age mean 45.6 57.8
# 2 Age Max 92 82
# 3 Age Min 5 14
# 4 Age sd 30.1 22.1
# 5 Sex female 8 9
# 6 Sex male 7 6

dplyr count unique and repeat id's by months

I have a df that looks like the following:
ID DATE
12 10-20-20
12 10-22-20
10 10-15-20
9 10-10-20
11 11-01-20
7 11-02-20
I would like to group by month and then create a column for unique id count and repeat id count like below:
MONTH Unique_Count Repeat_Count
10-1-20 2 2
11-1-20 2 0
I am able to get the date down to the first of the month and group by ID but I am not sure how to count unique instances within the months.
df %>%
mutate(month = floor_date(as.Date(DATE), "month")) %>%
group_by(ID) %>%
mutate(count = n())
Are you perhaps looking for:
df %>%
mutate(month = strftime(floor_date(as.Date(DATE, "%m-%d-%y"), "month"),
"%m-%d-%y")) %>%
group_by(month) %>%
summarize(unique_count = length(which(table(ID) == 1)),
repeat_count = sum(table(ID)[(which(table(ID) > 1))]))
#> # A tibble: 2 x 3
#> month unique_count repeat_count
#> <chr> <int> <int>
#> 1 10-01-20 2 2
#> 2 11-01-20 2 0
Here's a shot at it:
library(lubridate)
library(dplyr)
dates <- as.Date(c("2020-10-15", "2020-10-15", "2020-11-16", "2020-11-16", "2020-11-16"))
ids <- c(12, 12, 13, 13, 14)
df <- data.frame(dates, ids)
duplicates <- df %>%
group_by(dates_floored = floor_date(dates, unit = "month"), ids) %>%
mutate(duplicate_count = n()) %>%
filter(duplicate_count > 1) %>%
distinct(ids, .keep_all = TRUE)
uniques <- df %>%
group_by(dates_floored = floor_date(dates, unit = "month"), ids) %>%
mutate(unique_count = n()) %>%
filter(unique_count < 2) %>%
distinct(ids, .keep_all = TRUE)
df_cleaned <- full_join(uniques, duplicates, by = c("ids", "dates", "dates_floored")) %>%
group_by(dates_floored) %>%
summarize(count_duplicates = sum(duplicate_count, na.rm = TRUE),
count_unique = sum(unique_count, na.rm = TRUE))
df_cleaned

More efficient way to perform calculations on multiple (combined) columns by group

What is a more efficient way to perform calculations on multiple combined columns by group?
I have a dataset with Manager Effectiveness & Team Effectiveness components. How can I quickly calculate the number of 5s for each component by gender?
The desired outcome is like so:
Number of 5s for 'Manager effectiveness' = 2
Number of 5s for 'Team effectiveness' = 0
So far, I've tried the dplyr method:
Data %>%
group_by(gender) %>%
summarise(sum(c(Manager EQ, Manager IQ)) == 5)
Data %>%
group_by(gender) %>%
summarise(sum(c(Team collaboration, Team friendliness)) == 5)
Though it works, typing each column name quickly becomes tedious and error-prone as more columns are involved.
We can use summarise_at
library(dplyr)
Data %>%
group_by(gender) %>%
summarise_at(vars(starts_with('Manager')), ~ sum(. == 5))
Or if we are checking the sum of all numeric columns, use summarise_if
Data %>%
group_by(gender) %>%
summarise_if(is.numeric, ~ sum(. == 5))
This can also be wrapped in a function:
#' Count, per group, how many values equal `val` in prefix-matched columns
#'
#' @param dat A data frame.
#' @param colPrefix String; every column whose name starts with it is
#'   summarised.
#' @param grp Character vector naming the grouping column(s).
#' @param val Value to count within each selected column.
#' @return One row per group, with one count column per selected column.
f1 <- function(dat, colPrefix, grp, val) {
  dat %>%
    # all_of() selects the grouping columns named in the character vector grp
    group_by(across(all_of(grp))) %>%
    # sum() of the logical comparison counts the matches in each column
    summarise(across(starts_with(colPrefix), ~ sum(.x == val)))
}
f1(Data, "Manager", "gender", 5)
Mostly expanding on @akrun's answer:
## made up data 100 observations
set.seed(133)
dat <- 1:5
gen <- c("M", "F")
z <- tibble(me = sample(dat, 100, TRUE),
mi = sample(dat, 100, TRUE),
tc = sample(dat, 100, TRUE),
tf = sample(dat, 100, TRUE),
gender = sample(gen, 100, TRUE))
# Grouping by gender, counting 5's, and reshaping data
z %>%
group_by(gender) %>%
summarise_at(vars(everything()), ~ sum(. == 5)) %>%
pivot_longer(me:tf) %>%
mutate(name = paste0("# 5's for ", name)) %>%
pivot_wider(gender)
Output:
# A tibble: 2 x 5
gender `# 5's for me` `# 5's for mi` `# 5's for tc` `# 5's for tf`
<chr> <int> <int> <int> <int>
1 F 6 6 8 5
2 M 10 14 20 5
This is starting to get a little hack-ey, but in response to Amanda's comment & my misunderstanding of the question:
z %>%
group_by(gender) %>%
summarise_at(vars(everything()), ~ sum(. == 5)) %>%
pivot_longer(me:tf) %>%
mutate(name = paste0("# 5's for ", name)) %>%
mutate(grp = ifelse(str_detect(name, 'm'), 'manager', 'team')) %>%
group_by(gender, grp) %>%
summarise(total_5s = sum(value))
Gives results:
# A tibble: 4 x 3
# Groups: gender [2]
gender grp total_5s
<chr> <chr> <int>
1 F manager 12
2 F team 13
3 M manager 24
4 M team 25
Unfortunately this relies heavily on making a distinction and group based on the column names of the original data.

How can I convert data frame of survey responses to a frequency table?

I have an R dataframe of survey results. Each column is a response to a question on the survey. It can take values 1 to 10 and NA. I would like turn this into a frequency table.
This is an example of the data I have. I'm pretending the values go from 1 to 3, instead of 1 to 10.
data.frame(
"Person" = c(1,2,3),
"Question1" = c(NA, "1", "1"),
"Question2" = c("1", "2", "3")
)
What I want:
data.frame(
"Question" = c("Question1", "Question2"),
"Frequency of 1" = c(2, 1),
"Frequency of 2" = c(0 , 1),
"Frequency of 3" = c(0, 1)
)
I have tried using likert() from the likert package, but I'm getting fractional results which cannot be correct. Is there a simple solution to this problem?
Here is a solution using the dplyr and purrr packages
library(dplyr)
library(purrr)
# Assign the sample data to df so the pipeline below has a data frame to
# start from; without the assignment, df resolves to the function stats::df.
df <- data.frame(
"Person" = c(1,2,3),
"Question1" = c(NA, "1", "1"),
"Question2" = c("1", "2", "3")
)
df %>%
select(-Person) %>%
mutate_all(~ factor(.x, levels = as.character(1:10) ) %>% addNA() ) %>%
map(table) %>%
transpose() %>%
map(as.integer) %>%
set_names( ~ paste0("Frequency of ",ifelse(is.na(.), "NA", .))) %>%
as_tibble() %>%
mutate(Question = setdiff(names(df),"Person")) %>%
select(Question,everything(), "Frequency of NA" = `Frequency of ` )
A data.table solution:
require(data.table)
setDT(df)
# Melt data:
df <- melt(df, id.vars = "Person", value.name = "Question")
# Cast data to required structure:
df <- data.frame(dcast(df, variable ~ Question))
# Rename variables and remove NA count (as per Ops question):
names(df)[1] <- "Question"
names(df)[-1] <- gsub("X", "Frequency of ", names(df)[-1])
df$NA. <- NULL
df
# Question Frequency of 1 Frequency of 2 Frequency of 3
#1 Question1 2 0 0
#2 Question2 1 1 1
Or a one line answer:
dcast(melt(setDT(df), id.vars="Person", value.name="Question")[!Question %in% NA][, Question := paste0("Frequency of ", Question)], variable ~ Question)
A different tidyverse possibility could be:
df %>%
gather(Question, val, -Person, na.rm = TRUE) %>%
group_by(Question, val) %>%
summarise(res = length(val)) %>%
ungroup() %>%
mutate(val = paste0("Frequency.of.", val)) %>%
spread(val, res, fill = NA)
Question Frequency.of.1 Frequency.of.2 Frequency.of.3
<chr> <int> <int> <int>
1 Question1 2 NA NA
2 Question2 1 1 1
Here it first transforms the data from wide to long format. Second, it calculates the frequencies for each question. Finally, it creates the "Frequency.of." variables and returns the data to its desired shape.
Or, if you also want to count the NA values per question:
df %>%
gather(Question, val, -Person) %>%
group_by(Question, val) %>%
summarise(res = length(val)) %>%
ungroup() %>%
mutate(val = paste0("Frequency.of.", val)) %>%
spread(val, res, fill = NA)
Question Frequency.of.1 Frequency.of.2 Frequency.of.3 Frequency.of.NA
<chr> <int> <int> <int> <int>
1 Question1 2 NA NA 1
2 Question2 1 1 1 NA
This is not the most elegant but might help: df2 is your data set.
Data:
df2<-data.frame(
"Person" = c(1,2,3),
"Question1" = c(NA, "1", "1"),
"Question2" = c("1", "2", "3"),stringsAsFactors = F
)
Target:
EDIT:: You could "automate" as follows
df2[is.na(df2)]<-0 #To allow numeric manipulation
values<-c("1","2","3")
Final_df<-sapply(values,function(val) apply(df2[,-1],2,function(x) sum(x==val)))
Final_df<-as.data.frame(Final_df)
names(Final_df)<-paste0("Frequency of_",1:ncol(Final_df))
This yields:
Frequency of_1 Frequency of_2 Frequency of_3
Question1 2 0 0
Question2 1 1 1

Factors in many columns to boolean convert

My question is a little like this but the problem with the structure of data is different:
Sample data:
df <-data.frame(id = c(1,2,3), stock_1 = c("Google","Microsoft","Yahoo"), stock_2 = c("Yahoo","Google","NA"))
and I would like to convert to this:
df <-data.frame(id = c(1,2,3), Google = c(1,1,0), Microsoft = c(0,1,0), Yahoo= c(1,0,1))
I tried to use sapply() but from the answer to the linked question it is only for one column.
Here is a way to do it with data.table:
library(data.table)
setDT(df)
dcast(melt(df, id = 'id')[value != 'NA'],
id ~ value, fun.aggregate = length)
# id Google Microsoft Yahoo
# 1: 1 1 0 1
# 2: 2 1 1 0
# 3: 3 0 0 1
fill = 0 is unnecessary, and to tolerate duplicates, we can try:
dcast(melt(df, id = 'id')[value != 'NA'],
id ~ value, fun.aggregate = function(x){ 1 * (length(x) != 0)})
2017-01-01
As mentioned by Uwe, we can remove the NAs from the molten data by setting na.rm = TRUE, provided they are real NA values rather than the hard-coded string "NA"; the commands then look like this:
dcast(melt(df, id = 'id', na.rm = TRUE), id ~ value, fun.aggregate = length)
# or
dcast(melt(df, id = 'id', na.rm = TRUE),
id ~ value, fun.aggregate = function(x){ 1 * (length(x) != 0)})
We can also do this with tidyverse
library(tidyverse)
df %>%
  # Stack the stock_* columns into long format: one row per id/stock pair
  gather(key, val, -id) %>%
  # Drop missing entries; the sample data stores them as the string "NA",
  # which is.na() alone does not catch
  filter(!is.na(val), val != "NA") %>%
  mutate(ind = 1) %>%
  select(-key) %>%
  # One indicator column per stock, filled with 0 where the id does not hold it
  spread(val, ind, fill = 0)
NOTE: It is better to use NA instead of "NA" as we can take care of NA with is.na or na.omit or complete.cases

Resources