Group_by multiple columns and summarise unique column - r

I have a dataset below
family
type
inc
name
AA
success
30000
Bill
AA
ERROR
15000
Bess
CC
Pending
22000
Art
CC
Pending
18000
Amy
AA
Serve not respnding d
25000
Paul
ZZ
Success
50000
Pat
ZZ
Processing
50000
Pat
I want to group by multiple columns
Here is my code below:
# NOTE(review): R column names are case-sensitive -- the data defined below
# has 'family' and 'inc' (lower case), so 'Family' and 'Inc' here will error.
df<-df1%>%
group_by(Family , type)%>%
summarise(Transaction_count = n(), Face_value = sum(Inc))%>%
mutate(Pct = Transaction_count/sum(Transaction_count))
What I want is that wherever the same Family value appears more than once, only its first occurrence should be shown,
like the result in the picture below.
Thank you

You can use duplicated to replace the repeating values with blank value.
library(dplyr)
# Group by family and type; count rows and sum income per group.
df %>%
group_by(family , type)%>%
summarise(Transaction_count = n(), Face_value = sum(inc))%>%
# summarise() drops the last grouping level, so the data is still grouped
# by family here: Pct is each type's share within its family, and
# duplicated() blanks repeated family labels within each family group.
mutate(Pct = Transaction_count/sum(Transaction_count),
family = replace(family, duplicated(family), '')) %>%
ungroup
# family type Transaction_count Face_value Pct
# <chr> <chr> <int> <int> <dbl>
#1 "AA" ERROR 1 15000 0.333
#2 "" Serve not respnding d 1 25000 0.333
#3 "" success 1 30000 0.333
#4 "CC" Pending 2 40000 1
#5 "ZZ" Processing 1 50000 0.5
#6 "" Success 1 50000 0.5
If you want data for displaying purpose you may look into packages like formattable, kable etc.
data
It is easier to help if you provide data in a reproducible format
# Reproducible data: 7 rows with family, type, inc and name columns.
df <- structure(list(family = c("AA", "AA", "CC", "CC", "AA", "ZZ",
"ZZ"), type = c("success", "ERROR", "Pending", "Pending", "Serve not respnding d",
"Success", "Processing"), inc = c(30000L, 15000L, 22000L, 18000L,
25000L, 50000L, 50000L), name = c("Bill", "Bess", "Art", "Amy",
"Paul", "Pat", "Pat")), row.names = c(NA, -7L), class = "data.frame")

Related

Quicker way? Remove rows in book1, take row 4 values as column name, set some column name same as book2

Below is the first dataframe where I want to remove the first 3 rows:
# book1: a messy spreadsheet import -- rows 1-3 are junk/headers, row 4 holds
# the real column names (id, title, hazard, id, title, index), and columns
# 1-3 / 4-6 are two side-by-side tables (Set1 and Set2).
book1 <- structure(list(Instructions..xyz = c("Note: abc", "", "Set1",
"id", "632592651", "633322173", "634703802", "634927873", "635812953",
"636004739", "636101211", "636157799", "636263106", "636752420"
), X = c("", "", "", "title", "asdf", "cat", "dog", "mouse",
"elephant", "goose", "rat", "mice", "kitty", "kitten"), X.1 = c("",
"", "", "hazard", "y", "y", "y", "n", "n", "y", "y", "n", "n",
"y"), X.2 = c("", "", "Set2", "id", "632592651", "633322173",
"634703802", "634927873", "635812953", "636004739", "636101211",
"636157799", "636263106", "636752420"), X.3 = c("", "", "", "title",
"asdf2", "cat2", "dog2", "mouse2", "elephant2", "goose2", "rat2",
"mice2", "kitty2", "kitten2"), X.4 = c("", "", "", "index", "0.664883807",
"0.20089779", "0.752228086", "0.124729276", "0.626285086", "0.134537909",
"0.612526768", "0.769622463", "0.682532524", "0.819015658")), class = "data.frame", row.names = c(NA,
-14L))
I did book1 <- book1[-c(1:3),] but I'm not sure how to make id, title, hazard, id, title, index as the column name instead of Instructions..xyz, etc. See image below for desired output
Then for the second dataframe,
# book2: a clean lookup table (identity, text, volume) to be joined with
# book1 after its first two columns are renamed to id/title.
book2 <- structure(list(identity = c(632592651L, 633322173L, 634703802L,
634927873L, 635812953L, 636004739L, 636101211L, 636157799L, 636263106L,
636752420L, 636809222L, 2004722036L, 2004894388L, 2005045755L,
2005535472L, 2005630542L, 2005788781L, 2005809679L, 2005838317L,
2005866692L), text = c("asdf_xyz", "cat", "dog", "mouse", "elephant",
"goose", "rat", "mice", "kitty", "kitten", "tiger_xyz", "lion",
"leopard", "ostrich", "kangaroo", "platypus", "fish", "reptile",
"mammals", "amphibians_xyz"), volume = c(1234L, 432L, 324L, 333L,
2223L, 412346L, 7456L, 3456L, 2345L, 2345L, 6L, 345L, 23L, 2L,
4778L, 234L, 8675L, 3459L, 8L, 9L)), class = "data.frame", row.names = c(NA,
-20L))
I then rename column 1 and 2 in book2 so that it matches that of book1 by names(book2)[1:2] <- c('id','title') where I can later do inner_join. The desired output is shown in the image below by
library(dplyr)
book1 %>%
inner_join(book2, by = c("id", "title"))
This is taking quite a few steps and wondering if there's a simplified version to this?
Something like this?
# split the data by columns
book2a <- book1[-(1:4), 1:3]
book2b <- book1[-(1:4), 4:6]
# take care of names
names(book2a) <- book1[4, 1:3, drop = TRUE]
names(book2b) <- book1[4, 4:6, drop = TRUE]
# book2b needs processing
book2b$title <- sub("2", "", book2b$title)
book2b$index <- as.numeric(book2b$index)
# join both data sets and clean-up
book2 <- merge(book2a, book2b, all = TRUE)
rm(book2a, book2b)
book2
#> id title hazard index
#> 1 632592651 asdf y 0.6648838
#> 2 633322173 cat y 0.2008978
#> 3 634703802 dog y 0.7522281
#> 4 634927873 mouse n 0.1247293
#> 5 635812953 elephant n 0.6262851
#> 6 636004739 goose y 0.1345379
#> 7 636101211 rat y 0.6125268
#> 8 636157799 mice n 0.7696225
#> 9 636263106 kitty n 0.6825325
#> 10 636752420 kitten y 0.8190157
Created on 2022-06-25 by the reprex package (v2.0.1)
Found the solution to the first question
library(janitor)
book1 <- row_to_names(dat=book1, row_number=4, remove_row = TRUE, remove_rows_above = TRUE)
I applied
names(book1)[4:5] <- c('id1','title1')
to obtain unique column name, then tried inner_join as proposed earlier but with error and found that book1$id is character where book2$id is int and so I did
book1$id <- as.integer(book1$id)
and finally it works with
library(tidyverse)
# Join the cleaned book1 with book2 on the shared id and title columns.
Yeah <- book1 %>%
inner_join(book2, by = c("id", "title"))
Output below:
id title hazard id1 title1 index volume
1 633322173 cat y 633322173 cat2 0.20089779 432
2 634703802 dog y 634703802 dog2 0.752228086 324
3 634927873 mouse n 634927873 mouse2 0.124729276 333
4 635812953 elephant n 635812953 elephant2 0.626285086 2223
5 636004739 goose y 636004739 goose2 0.134537909 412346
6 636101211 rat y 636101211 rat2 0.612526768 7456
7 636157799 mice n 636157799 mice2 0.769622463 3456
8 636263106 kitty n 636263106 kitty2 0.682532524 2345
9 636752420 kitten y 636752420 kitten2 0.819015658 2345
Still wondering if there's a quicker way?

Count Observations Meeting Certain Criteria by Group R

I need some help counting observations meeting certain criteria by group. I first want the number of employees by location as a column. Then I would like to retrieve the number of employees that have worked more than 40 hours (by location) and summarize that into a column. I assume there is an easy way to do it with dplyr or base R but I'm stumped. My data is below.
name hours_worked location
Bob 55 IL
Nick 25 IL
Sally 30 IL
Patricia 50 WI
Tim 35 WI
Liz 42 OH
Brad 60 OH
Sam 48 OH
Ideal output would be something like:
location headcount over_40
IL 3 1
WI 2 1
OH 3 3
We can do a group by operation - grouped by 'location' get the number of rows (n()) for headcount and the sum of logical vector to get the count of 'over_40'
library(dplyr)
# Per location: headcount = number of rows; over_40 = count of employees
# with hours_worked > 40 (summing a logical vector counts the TRUEs).
df1 %>%
group_by(location) %>%
summarise(headcount = n(), over_40 = sum(hours_worked > 40))
-output
# A tibble: 3 x 3
location headcount over_40
<chr> <int> <int>
1 IL 3 1
2 OH 3 3
3 WI 2 1
data
# Reproducible data: 8 employees with name, hours_worked and location.
df1 <- structure(list(name = c("Bob", "Nick", "Sally", "Patricia", "Tim",
"Liz", "Brad", "Sam"), hours_worked = c(55L, 25L, 30L, 50L, 35L,
42L, 60L, 48L), location = c("IL", "IL", "IL", "WI", "WI", "OH",
"OH", "OH")), class = "data.frame", row.names = c(NA, -8L))

Converting from long to wide, using pivot_wide() on two columns in R

I would like to transform my data from long format to wide by the values in two columns. How can I do this using tidyverse?
Updated dput
# dput of the long-format tibble: one Value per Country x Indicator x Status.
structure(list(Country = c("Algeria", "Benin", "Ghana", "Algeria",
"Benin", "Ghana", "Algeria", "Benin", "Ghana"
), Indicator = c("Indicator 1",
"Indicator 1",
"Indicator 1",
"Indicator 2",
"Indicator 2",
"Indicator 2",
"Indicator 3",
"Indicator 3",
"Indicator 3"
), Status = c("Actual", "Forecast", "Target", "Actual", "Forecast",
"Target", "Actual", "Forecast", "Target"), Value = c(34, 15, 5,
28, 5, 2, 43, 5,
1)), row.names
= c(NA, -9L), class = c("tbl_df", "tbl", "data.frame"))
Country Indicator Status Value
<chr> <chr> <chr> <dbl>
1 Algeria Indicator 1 Actual 34
2 Benin Indicator 1 Forecast 15
3 Ghana Indicator 1 Target 5
4 Algeria Indicator 2 Actual 28
5 Benin Indicator 2 Forecast 5
6 Ghana Indicator 2 Target 2
7 Algeria Indicator 3 Actual 43
8 Benin Indicator 3 Forecast 5
9 Ghana Indicator 3 Target 1
Expected output
Country Indicator1_Actual Indicator1_Forecast Indicator1_Target Indicator2_Actual
Algeria 34 15 5 28
etc
Appreciate any tips!
foo <- data %>% pivot_wider(names_from = c("Indicator","Status"), values_from = "Value")
works perfectly!
I think the mistake is in your pivot_wider() command
data %>% pivot_wider(names_from = Indicator, values_from = c(Indicator, Status))
I bet you can't use the same column for both names and values.
Try this code
data %>% pivot_wider(names_from = c(Indicator, Status), values_from = Value))
Explanation: Since you want the column names to be Indicator 1_Actual, you need both columns indicator and status going into your names_from
It would be helpful if you provided example data and expected output. But I tested this on my dummy data and it gives the expected output -
Data:
# A tibble: 4 x 4
a1 a2 a3 a4
<int> <int> <chr> <dbl>
1 1 5 s 10
2 2 4 s 20
3 3 3 n 30
4 4 2 n 40
Call : a %>% pivot_wider(names_from = c(a2, a3), values_from = a4)
Output :
# A tibble: 4 x 5
a1 `5_s` `4_s` `3_n` `2_n`
<int> <dbl> <dbl> <dbl> <dbl>
1 1 10 NA NA NA
2 2 NA 20 NA NA
3 3 NA NA 30 NA
4 4 NA NA NA 40
Data here if you want to reproduce
# dput of the dummy tibble used in the example above.
structure(list(a1 = 1:4, a2 = 5:2, a3 = c("s", "s", "n", "n"),
a4 = c(10, 20, 30, 40)), row.names = c(NA, -4L), class = c("tbl_df",
"tbl", "data.frame"))
Edit : For the edited question after trying out the correct pivot_wider() command - It looks like your data could actually have duplicates, in which case the output you are seeing would make sense - I would suggest you try to figure out if your data actually has duplicates by using filter(Country == .., Indicator == .., Status == ..)
This can be achieved by calling both your columns to pivot wider in the names_from argument in pivot_wider().
data %>%
pivot_wider(names_from = c("Indicator","Status"),
values_from = "Value")
Result
Country `Indicator 1_Ac… `Indicator 1_Fo… `Indicator 1_Ta… `Indicator 2_Ac… `Indicator 2_Fo…
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Algeria 34 15 5 28 5

Calculate Growth Increase, grouping by Type (Using R)

I have a dataset, z, that I wish to calculate the growth increase by the type:
location size type date
ny 5 hello 10/01/2020
ny 7 ai 10/02/2020
ny 8 ai 10/03/2020
ny 6 hello 10/04/2020
ca 15 cool 10/05/2020
ca 10 name 10/06/2020
ca 5 name 10/07/2020
ca 16 cool 10/08/2020
Desired output
location type increase percent_increase start_date end_date
ca cool 1 6.67% 10/05/2020 10/08/2020
ca name -5 -50% 10/6/2020 10/7/2020
ny hello 1 20% 10/01/2020 10/4/2020
ny ai 1 14.28% 10/2/2020 10/3/2020
This is what I am doing:
library(tidyverse)
# NOTE(review): lead(size) compares each row to the next one *in current row
# order* within each type/location group; without arrange(date) first, there
# is no guarantee the comparison is chronological.
z %>%
group_by(type, location) %>%
mutate(percent_increase = (size/lead(size) - 1) * 100)
I am not getting my desired output. Any assistance is appreciated.
To get the results you want, you need a different calculation in your mutate line:
I also added a filter to remove any results with NA for the percent_increase variable.
And finally added `arrange()` to sort alphabetically by location, matching the order of your requested output.
CODE
z %>%
group_by(type, location) %>%
mutate(
# absolute change from this row's size to the next row's within the group
increase = (lead(size) - size),
percent_increase = (increase/size) * 100,
# pair each row's own date with the next row's date
start_date = date,
end_date = lead(date)) %>%
# lead() yields NA on the last row of each group; drop those rows
filter(!is.na(percent_increase)) %>%
arrange(location)
OUTPUT
# A tibble: 4 x 8
# Groups: type, location [4]
location size type date increase percent_increase start_date end_date
<chr> <int> <chr> <chr> <int> <dbl> <chr> <chr>
1 ca 15 cool 10/05/2020 1 6.67 10/05/2020 10/08/2020
2 ca 10 name 10/06/2020 -5 -50 10/06/2020 10/07/2020
3 ny 5 hello 10/01/2020 1 20 10/01/2020 10/04/2020
4 ny 7 ai 10/02/2020 1 14.3 10/02/2020 10/03/2020
INPUT
# Reproducible data: 8 rows with location, size, type and date (character).
z <- structure(list(location = c("ny", "ny", "ny", "ny", "ca", "ca",
"ca", "ca"), size = c(5L, 7L, 8L, 6L, 15L, 10L, 5L, 16L), type = c("hello",
"ai", "ai", "hello", "cool", "name", "name", "cool"), date = c("10/01/2020",
"10/02/2020", "10/03/2020", "10/04/2020", "10/05/2020", "10/06/2020",
"10/07/2020", "10/08/2020")), class = "data.frame", row.names = c(NA,
-8L))
You're missing the `arrange()` function to order the rows by date,
like this:
z %>%
group_by(type, location) %>%
# sort so lead() compares consecutive dates; note date is a character
# column here -- lexicographic order happens to match chronological order
# for this zero-padded, same-year mm/dd/yyyy data
arrange(date) %>%
mutate(percent_increase = (size/lead(size) - 1) * 100)

Sort values across multiple columns in R with dplyr

Apologies for the not-particularly-clear title - hoping my example below helps. I am working with some sports data, attempting to compute "lineup statistics" for certain grouping of players in the data. Below is an example of the type of data I'm working with (playerInfo), as well as the type of analysis I am attempting to do (groupedInfo):
# Example lineup data: each row lists three players (name + id columns,
# in arbitrary order) and the points scored by that lineup.
playerInfo = data.frame(
lineup = c(1,2,3,4,5,6),
player1 = c("Bil", "Tom", "Tom", "Nik", "Nik", "Joe"),
player1id = c("e91", "a27", "a27", "b17", "b17", "3b3"),
player2 = c("Nik", "Bil", "Nik", "Joe", "Tom", "Tom"),
player2id = c("b17", "e91", "b17", "3b3", "a27", "a27"),
player3 = c("Joe", "Joe", "Joe", "Tom", "Joe", "Nik"),
player3id = c("3b3", "3b3", "3b3", "a27", "3b3", "b17"),
points = c(6, 8, 3, 12, 36, 2),
stringsAsFactors = FALSE
)
# Group by the three player columns as-is; this treats different column
# orderings of the same trio as distinct groups, which is the problem
# the question is about.
groupedInfo <- playerInfo %>%
dplyr::group_by(player1, player2, player3) %>%
dplyr::summarise(
lineup_ct = n(),
total_pts = sum(points)
)
> groupedInfo
# A tibble: 6 x 5
# Groups: player1, player2 [?]
player1 player2 player3 lineup_ct total_pts
<chr> <chr> <chr> <int> <dbl>
1 Bil Nik Joe 1 6
2 Joe Tom Nik 1 2
3 Nik Joe Tom 1 12
4 Nik Tom Joe 1 36
5 Tom Bil Joe 1 8
6 Tom Nik Joe 1 3
The goal here is to group_by the 3 players in each row, and then compute some summary statistics (in this simple example, count and sum-of-points) for the different groups. Unfortunately, what dplyr::group_by is missing is the fact that certain groups of players should be the same group of players, if its the same 3 players simply in different columns.
For example, in the dataframe above, rows 3,4,5,6 all have the same 3 players (Nik, Tom, Joe), however because sometimes Nik is player1, and sometimes Nik is player2, etc., the group_by groups them separately.
For clarity, below is an example of the type of results I am seeking to get:
# Same data, but with the player columns manually sorted alphabetically
# within each row (player1 < player2 < player3), so group_by collapses
# all permutations of the same trio into one group.
correctPlayerInfo = data.frame(
lineup = c(1,2,3,4,5,6),
player1 = c("Bil", "Bil", "Joe", "Joe", "Joe", "Joe"),
player1id = c("e91", "e91", "3b3", "3b3", "3b3", "3b3"),
player2 = c("Joe", "Joe", "Nik", "Nik", "Nik", "Nik"),
player2id = c("3b3", "3b3", "b17", "b17", "b17", "b17"),
player3 = c("Nik", "Tom", "Tom", "Tom", "Tom", "Tom"),
player3id = c("b17", "a27", "a27", "a27", "a27", "a27"),
points = c(6, 8, 3, 12, 36, 2),
stringsAsFactors = FALSE
)
# With the sorted columns, the same grouping now yields the desired result.
correctGroupedInfo <- correctPlayerInfo %>%
dplyr::group_by(player1, player2, player3) %>%
dplyr::summarise(
lineup_ct = n(),
total_pts = sum(points)
)
> correctGroupedInfo
# A tibble: 3 x 5
# Groups: player1, player2 [?]
player1 player2 player3 lineup_ct total_pts
<chr> <chr> <chr> <int> <dbl>
1 Bil Joe Nik 1 6
2 Bil Joe Tom 1 8
3 Joe Nik Tom 4 53
In this second example, I have manually sorted the data alphabetically such that player1 < player2 < player3. As a result, when I do the group_by, it accurately groups rows 3-6 into a single grouping.
How can I achieve this programmatically? I'm not sure if (a) re-structuring playerInfo into the column-sorted correctPlayerInfo (as I've done above), or (b) some other approach where group_by automatically identifies that these are the same groups, is best.
I am actively working on this, and will post updates if I can come about to my own solution. Until then, any help with this is greatly appreciated!
Edit: Thus far I've tried something along these lines:
# NOTE(review): min()/max() reduce all their arguments to a single scalar
# across the whole columns, not row-wise -- every row gets the same value.
# Row-wise equivalents would be pmin()/pmax(), which is why this attempt
# was "to no avail".
newPlayerInfo <- playerInfo %>%
dplyr::mutate(newPlayer1 = min(player1, player2, player3)) %>%
dplyr::mutate(newPlayer3 = max(player1, player2, player3))
... to no avail.
You could create group IDs that are sorted composites of the players' names (or IDs). For example:
playerInfo %>%
mutate(
# Build an order-invariant key per row: sort the three names
# alphabetically and join with "_", so any permutation of the same
# trio produces the same group_id.
group_id = purrr::pmap_chr(
.l = list(p1 = player1, p2 = player2, p3 = player3),
.f = function(p1, p2, p3) paste(sort(c(p1, p2, p3)), collapse = "_")
)
) %>%
group_by(group_id) %>%
summarise(
lineup_ct = n(),
total_pts = sum(points)
)
# A tibble: 3 x 3
group_id lineup_ct total_pts
<chr> <int> <dbl>
1 Bil_Joe_Nik 1 6
2 Bil_Joe_Tom 1 8
3 Joe_Nik_Tom 4 53

Resources