Ordering rows when they are not numeric - r

using df below, I made a table with frequencies for each unit according to each combination of group/year.
After obtaining absolute and relative frequencies, I have pasted the values into just one column Frequency
After reshaping the table so that the units are on the rows, is there a way to order them in descending order of n for the Total group in 2016? I want my final output to contain only the Frequency rows, not the n and prop rows.
# Example data: 10 observations, each with a randomly drawn group and year.
# All three columns are character, which is what the former
# data.frame(cbind(...)) construction produced implicitly (cbind() coerces
# mixed inputs to a character matrix). TRUE is spelled out because T is an
# ordinary variable that can be reassigned.
df <- data.frame(
  Group = sample(c("Controle", "Tratado"), 10, replace = TRUE),
  Year = as.character(sample(c(2012, 2016), 10, replace = TRUE)),
  Unit = c("A", "B", "A", "B", "C", "D", "D", "A", "F", "A")
)
# Absolute (n) and relative (prop) frequency of each Unit within every
# Year/Group combination, plus a "Total" group that pools both groups.
table <- df %>%
  group_by(Year, Group) %>%
  count(Unit) %>%
  mutate(prop = prop.table(n)) %>%
  bind_rows(
    df %>%
      mutate(Group = "Total") %>%
      group_by(Year, Group) %>%
      count(Unit)
  ) %>%
  mutate(prop = prop.table(n))

# Round every numeric column to 4 decimal places.
is.num <- vapply(table, is.numeric, logical(1))
table[is.num] <- lapply(table[is.num], round, 4)

# Paste n and prop into one "n (prop%)" label, then reshape so that units are
# rows and each Year:Group combination becomes a column.
table <- table %>%
  mutate(Frequency = paste0(n, " (", 100 * prop, "%)")) %>%
  gather(type, measurement, -Year, -Group, -Unit) %>%
  unite(year_group, Year:Group, sep = ":") %>%
  spread(year_group, measurement)
Here is what I am expecting to generate:
Unit type 2012:Total 2012:Tratado 2016:Controle 2016:Total 2016:Tratado
1 A Frequency 2 (66.67%) 2 (66.67%) - 2 (28.57%) 2 (100%)
2 D Frequency - - 2 (40%) 2 (28.57%) -
3 B Frequency 1 (33.33%) 1 (33.33%) 1 (20%) 1 (14.29%) -
4 C Frequency - - 1 (20%) 1 (14.29%) -
5 F Frequency - - 1 (20%) 1 (14.29%) -
Notice that the results are ordered according to column 2016:Total

Just found out a way myself, probably not the best one.
After running the code on the question, I have done the following:
# Keep only the pasted "n (prop%)" rows, then order the units by the count
# that starts each cell of the 2016 Total column.
table <- table %>%
  filter(type == "Frequency") %>%
  # unite(Year:Group, sep = ":") names the column "2016:Total", so it needs
  # backticks. Dropping everything from the first space onwards turns
  # "2 (28.57%)" into "2" whatever the width of the percentage; units missing
  # in 2016 stay NA and are sorted last by arrange().
  mutate(value = as.numeric(sub(" .*$", "", `2016:Total`))) %>%
  arrange(desc(value))

Related

Ontime percentage calculations

I need to calculate the overall ontime percentage of each airline with this sample dataset.
library(tidyverse)
library(dplyr)
# Flight counts per airline and on-time status for Chicago.
df_chi <- tribble(
  ~airline,   ~ontime, ~qty, ~dest,
  "delta",    TRUE,     527, "CHI",
  "delta",    FALSE,     92, "CHI",
  "american", TRUE,    4229, "CHI",
  "american", FALSE,    825, "CHI"
)

# Flight counts per airline and on-time status for New York.
df_nyc <- tribble(
  ~airline,   ~ontime, ~qty, ~dest,
  "delta",    TRUE,    1817, "NYC",
  "delta",    FALSE,    567, "NYC",
  "american", TRUE,    1651, "NYC",
  "american", FALSE,    625, "NYC"
)
I have a solution, although it is verbose, and I want to avoid the numbered index, i.e. [2,2]. Is there a more elegant way using more of the tidyverse?
# Pool both destinations into a single table.
df_all <- bind_rows(df_chi, df_nyc)

# Delta: total flights per on-time status, then the share found in row 2.
delta_ot <- df_all %>%
  filter(airline == "delta") %>%
  group_by(ontime) %>%
  summarize(total = sum(qty))
delta_ot <- delta_ot[2, 2] / sum(delta_ot$total)

# American: same computation.
american_ot <- df_all %>%
  filter(airline == "american") %>%
  group_by(ontime) %>%
  summarize(total = sum(qty))
american_ot <- american_ot[2, 2] / sum(american_ot$total)
Since the ontime column is a logical column, use it to subset instead of [2, 2]. Also, instead of filtering once per airline, do everything in one pass by adding 'airline' as a grouping column.
library(dplyr)
# Sum quantities per airline and on-time status; the first summarise() drops
# only the last grouping level, so the second one runs per airline and divides
# the ontime == TRUE total by the airline's overall total.
bind_rows(df_chi, df_nyc) %>%
  group_by(airline, ontime) %>%
  summarise(total = sum(qty), .groups = "drop_last") %>%
  summarise(total = total[ontime] / sum(total))
-output
# A tibble: 2 × 2
airline total
<chr> <dbl>
1 american 0.802
2 delta 0.781
Subsetting by logical returns the corresponding value where there are TRUE elements
> c(1, 3, 5)[c(FALSE, TRUE, FALSE)]
[1] 3

Excel Pivot table with R (adding totals and %) instead of counts - MRE included

I'm trying to recreate a Pivot table in R (like the output I get from Excel). So far this is what my code looks like:
# Minimal example: five records, each with a reason and a size type.
id <- c(1, 2, 3, 4, 5)
reason <- rep(c("A", "B"), times = c(3, 2))
type <- c("1. Small", "1. Small", "1. Mid", "1. Mid", "1. Small")
df <- data.frame(id = id, reason = reason, type = type)
# Count records per reason/type pair.
df2 <- df %>%
  group_by(reason, type) %>%
  summarise(count = n()) %>%
  ungroup()

# Spread the reasons into columns, one row per type.
df3 <- df2 %>%
  dcast(type ~ reason, value.var = "count")
Resulting df3 gets me the counts and expected structure, but I'm missing a total column and %. So the expected result should be:
type Total A B
1. Small 60% 66% 50%
2. Mid 40% 33% 50%
But instead I'm getting:
type A B
1. Small 2 1
2. Mid 1 1
Is it possible to tweak the code to get the expected result? or is there another function in R that I'm missing?
Thx!
With tidyverse
library(dplyr)
library(tidyr)
df %>%
  count(reason, type) %>%
  # Total: share of each type among all records.
  group_by(type) %>%
  mutate(Total = 100 * sum(n) / sum(.$n)) %>%
  # prop: share of each type within its reason, so that every reason column
  # of the pivoted table sums to 100 (the column percentages asked for).
  group_by(reason) %>%
  mutate(prop = 100 * n / sum(n)) %>%
  ungroup() %>%
  select(-n) %>%
  pivot_wider(names_from = reason, values_from = prop)
# A tibble: 2 x 4
#  type     Total     A     B
#  <fct>    <dbl> <dbl> <dbl>
#1 1. Mid      40  33.3    50
#2 1. Small    60  66.7    50
Or with data.table
library(data.table)
dt1 <- setDT(df)[, .N, .(reason, type)]
# prop: share of each type within its reason (column percentages, as in the
# expected output).
dt1[, prop := 100 * N / sum(N), by = reason]
# Total: share of each type among all records.
dt1[, Total := 100 * sum(N) / sum(dt1$N), by = type]
dcast(dt1, type + Total ~ reason, value.var = "prop")
#       type Total        A  B
#1:   1. Mid    40 33.33333 50
#2: 1. Small    60 66.66667 50
Try it this way:
# Column proportions from a contingency table: addmargins(margin = 2) appends
# a "Sum" column (the total across reasons) and prop.table(margin = 2) divides
# every column by its own total.
library(tidyverse)
xtabs(count ~ type + reason, df2) %>%
  addmargins(margin = 2) %>%
  prop.table(margin = 2) %>%
  round(digits = 3)

Creating a funnel using a pivot table in R considering NA column

I have the following dataset:
library(tidyverse)
# One row per visitor: segment (possibly missing) plus a 0/1 flag for each
# funnel step.
dataset <- data.frame(
  id = c(121, 122, 123, 124, 125),
  segment = c("A", "B", "B", "A", NA),
  Web = rep(1, 5),
  Tryout = c(1, 1, 1, 0, 1),
  Purchase = c(1, 0, 1, 0, 0),
  stringsAsFactors = FALSE
)
This table as you see converts to a funnel, from web visits (the quantity of rows), to tryout to a purchase. So a useful view of this funnel should be:
Step Total A B NA
Web 5 2 2 1
Tryout 4 1 2 1
Purchase 2 1 1 0
So I tried row by row doing this. The web views code is:
# Web visits per segment (missing segments are labelled "NA"), one column per
# segment plus a Total column placed first.
dataset %>%
  mutate(segment = ifelse(is.na(segment), "NA", segment)) %>%
  group_by(segment) %>%
  summarise(Total = n()) %>%
  ungroup() %>%
  spread(segment, Total) %>%
  mutate(Total = `A` + `B` + `NA`) %>%
  select(Total, A, B, `NA`)
And worked fine, except that I have to put manually the row name. But for the other steps like tryout and purchase, is there a way to do it in just one simpler code, avoiding binding? Consider that this is an example and I have many columns so any help will be greatly appreciated.
Here is one option: remove the 'id' column and convert the data to 'long' format; grouped by 'name', get the sum of 'value' as 'Total'; then, grouped by 'name', 'segment' and 'Total', compute a second sum; finally keep the distinct rows and pivot back to 'wide' format.
library(dplyr)
library(tidyr)
dataset %>%
  select(-id) %>%
  # One row per (segment, step) pair.
  pivot_longer(cols = -segment) %>%
  # Overall total per step.
  group_by(name) %>%
  mutate(Total = sum(value)) %>%
  # Per-segment total per step.
  group_by(name, segment, Total) %>%
  mutate(n = sum(value)) %>%
  ungroup() %>%
  select(-value) %>%
  distinct() %>%
  pivot_wider(names_from = segment, values_from = n)
# A tibble: 3 x 5
# name Total A B `NA`
# <chr> <dbl> <dbl> <dbl> <dbl>
#1 Web 5 2 2 1
#2 Tryout 4 1 2 1
#3 Purchase 2 1 1 0
# Alternative: sum every step per segment, flip steps into rows and segments
# into columns, then add the row-wise Total over the segment columns.
dataset %>%
  select(-id) %>%
  group_by(segment) %>%
  summarise_all(sum) %>%
  gather(Step, val, -segment) %>%
  spread(segment, val) %>%
  mutate(Total = rowSums(.[, -1]))

Format multilevel group_by in R

In R, when I run this group_by code, I obtain this result.
df <- tibble(
  y = c("a", "a", "a", "b", "b", "b", "b", "b"),
  z = c(1, 1, 1, 1, 1, 1, 2, 2)
)

# Row count for every z/y combination.
df %>%
  group_by(z, y) %>%
  summarise(n())
z y n()
1 a 3
1 b 3
2 b 2
Is there a way to make it look like this?
z y n()
1 a 3
b 3
2 b 2
My goal is to have the formatting look the way it does in Pandas, where the multilevel index isn't repeated each time ( see below ).
Here's one possibility:
df <- tibble(
  y = c("a", "a", "a", "b", "b", "b", "b", "b", "a", "b"),
  z = c(1, 1, 1, 1, 1, 1, 2, 2, 3, 3)
)

df2 <- df %>%
  group_by(z, y) %>%
  summarise(n = n()) %>%
  group_by(z) %>%
  # Show z only on the first row of each z group; use a blank elsewhere.
  mutate(z2 = if_else(row_number() == 1, as.character(z), " "), y, n) %>%
  ungroup() %>%
  # Swap the blanked label in for the original z column.
  transmute(z = z2, y, n)

knitr::kable(df2)
I'm having trouble thinking of ways to do this that don't involve grouping by the z column and finding the first row. Unfortunately that means you need to add a couple steps, because a grouping variable can't be modified in the mutate call.

Select rows by ID with most matches

I have a data frame like this:
# Example data: several torre observations per id.
df <- data.frame(
  id = c(1, 1, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4),
  torre = c("a", "a", "b", "d", "a", "q", "t", "q", "g", "a", "b", "c")
)
and I would like my code to select, for each id, the torre that repeats the most, or the last torre for the id if there isn't one that repeats more than the others, so I'll get a new data frame like this:
# Expected result: one torre per id.
df2 <- data.frame(
  id = c(1, 2, 3, 4),
  torre = c("a", "a", "q", "c")
)
You can use aggregate:
# For each id: tabulate torre with levels in order of first appearance, sort
# the counts in increasing order and keep the name of the last entry.
aggregate(
  torre ~ id,
  data = df,
  FUN = function(x) {
    counts <- table(factor(x, levels = unique(x)))
    names(tail(sort(counts), 1))
  }
)
The full explanation for this function is a bit involved, but most of the job is done by the FUN= parameter. In this case we are making a function that gets the frequency counts for each torre, sorts them in increasing order, then gets the last one with tail(, 1) and takes its name. The aggregate() function then applies this function separately for each id.
You could do this using the dplyr package: group by id and torre to calculate the number of occurrences of each torre/id combination, then group by id only and select the last occurrence of torre that has the highest in-group frequency.
library(dplyr)
df %>%
  # n: number of occurrences of each id/torre combination.
  group_by(id, torre) %>%
  mutate(n = n()) %>%
  # Within each id keep the most frequent torre(s), then the last such row.
  group_by(id) %>%
  filter(n == max(n)) %>%
  slice(n()) %>%
  select(-n)
id torre
<dbl> <chr>
1 1 a
2 2 a
3 3 q
4 4 c
An approach with the data.table package:
library(data.table)
# Count each id/torre pair, order the counts ascending, then take the last
# torre within each id.
setDT(df)[
  , .N, by = .(id, torre)
][
  order(N), .(torre = torre[.N]), by = id
]
which gives:
id torre
1: 1 a
2: 2 a
3: 3 q
4: 4 c
And two possible dplyr alternatives:
library(dplyr)
# option 1: rank the per-pair counts within each id, breaking ties by row
# order, and keep the row with the highest rank
df %>%
  group_by(id, torre) %>%
  mutate(n = n()) %>%
  group_by(id) %>%
  mutate(f = rank(n, ties.method = "first")) %>%
  filter(f == max(f)) %>%
  select(-n, -f)

# option 2: one row per id/torre pair, sorted by count, then the last row of
# each id
df %>%
  group_by(id, torre) %>%
  mutate(n = n()) %>%
  distinct() %>%
  arrange(n) %>%
  group_by(id) %>%
  slice(n()) %>%
  select(-n)
Yet another dplyr solution, this time using add_count() instead of mutate():
df %>%
  # n: number of occurrences of each id/torre combination.
  add_count(id, torre) %>%
  group_by(id) %>%
  # Keep the most frequent torre(s) per id, then the last such row.
  filter(n == max(n)) %>%
  slice(n()) %>%
  select(-n)
# A tibble: 4 x 2
# Groups: id [4]
id torre
<dbl> <fct>
1 1. a
2 2. a
3 3. q
4 4. c

Resources