I would like to know a practical way to transform dat into the table below
# Example data: one row per observation, recording a city, a colour and a sex.
dat <- data.frame(
  city = c("A", "A", "B", "C", "A", "B", "B", "C", "C", "C"),
  color = c(
    "red", "green", "blue", "red", "green",
    "blue", "green", "blue", "red", "red"
  ),
  sex = c("M", "F", "F", "M", "F", "F", "F", "M", "F", "M")
)
city red green blue F M
A 1 2 0 2 1
B 0 1 2 3 0
C 3 0 1 1 3
With tidyr, you can pivot_longer and then pivot_wider:
library(tidyr)
# Stack color and sex into name/value pairs, then count every
# (variable, level) combination per city; absent combinations become 0.
dat %>%
  pivot_longer(cols = c(color, sex)) %>%
  pivot_wider(
    id_cols = city,
    names_from = c(name, value),
    values_from = value,
    values_fn = length,
    values_fill = 0,
    names_sort = TRUE
  )
# # A tibble: 3 × 6
# city color_blue color_green color_red sex_F sex_M
# <chr> <int> <int> <int> <int> <int>
# 1 A 0 2 1 2 1
# 2 B 2 1 0 3 0
# 3 C 1 0 3 1 3
With sapply, create multiple tables and bind them:
# Cross-tabulate city against each of the other columns and bind the
# resulting tables side by side.
# lapply() always returns a list of data frames. sapply() would try to
# simplify the result whenever the tables happen to have the same number of
# columns, producing a list-matrix that loses the column names when bound.
lapply(dat[2:3], \(x) as.data.frame.matrix(table(dat$city, x))) |>
  do.call(what = "cbind.data.frame")
color.blue color.green color.red sex.F sex.M
A 0 2 1 2 1
B 2 1 0 3 0
C 1 0 3 1 3
You could first convert to longer format and then count values per group and convert back to wider format using pivot_wider like this:
library(dplyr)
library(tidyr)
# Reshape to long form, attach a per-city count to every level, drop the
# repeated rows, and spread the levels back out into one column each.
dat %>%
  pivot_longer(cols = c(color, sex)) %>%
  group_by(city) %>%
  add_count(value) %>%
  distinct() %>%
  select(!name) %>%
  pivot_wider(
    names_from = value,
    values_from = n,
    values_fill = 0
  )
#> # A tibble: 3 × 6
#> # Groups: city [3]
#> city red M green F blue
#> <chr> <int> <int> <int> <int> <int>
#> 1 A 1 1 2 2 0
#> 2 B 0 0 1 3 2
#> 3 C 3 3 0 1 1
Created on 2023-01-31 with reprex v2.0.2
Related
# Input: one row per (teacher, student, trimester) observation.
HAVE <- data.frame(
  TEACHER = rep(c(1, 2), times = c(5, 4)),
  STUDENT = rep(c(1, 2, 3, 4), times = c(3, 2, 3, 1)),
  TRIMESTER = c(1, 2, 3, 2, 3, 3, 4, 5, 4)
)

# Desired result: new and cumulative counts of students and teachers
# for each trimester.
WANT <- data.frame(
  TRIMESTER = c(1, 2, 3, 4, 5),
  NEWSTUDENTS = c(1, 1, 1, 1, 0),
  TOTALSTUDENTS = c(1, 2, 3, 4, 4),
  NEWTEACHER = c(1, 0, 1, 0, 0),
  TOTALTEACHER = c(1, 1, 2, 2, 2)
)
I wish to convert HAVE to WANT by counting NEWSTUDENTS and TOTALSTUDENTS per TRIMESTER. A STUDENT counts as new in the TRIMESTER in which they first appear, and TOTALSTUDENTS is the running total of new students. The same should be done for TEACHER.
We may loop across the columns, create 'NEW' columns using duplicated and then do a group by sum
library(dplyr)
library(stringr)
HAVE %>%
  # sort so that the first row of every id is its earliest trimester
  arrange(TRIMESTER) %>%
  # NEWTEACHER / NEWSTUDENT: TRUE on the first row where each value occurs
  mutate(
    across(
      c(TEACHER, STUDENT),
      function(col) !duplicated(col),
      .names = "NEW{.col}"
    )
  ) %>%
  # number of first occurrences within each trimester
  group_by(TRIMESTER) %>%
  summarise(across(starts_with("NEW"), sum), .groups = 'drop') %>%
  # running totals, stored under TOTAL* names derived from the NEW* names
  mutate(
    across(
      starts_with('NEW'),
      cumsum,
      .names = "{str_replace(.col, 'NEW', 'TOTAL')}"
    )
  )
-output
# A tibble: 5 × 5
TRIMESTER NEWTEACHER NEWSTUDENT TOTALTEACHER TOTALSTUDENT
<dbl> <int> <int> <int> <int>
1 1 1 1 1 1
2 2 0 1 1 2
3 3 1 1 2 3
4 4 0 1 2 4
5 5 0 0 2 4
This is an extension of this other answer of mine.
suppressPackageStartupMessages(library(dplyr))
# Input observations: which teacher taught which student in which trimester.
HAVE <- data.frame(
  TEACHER = c(1, 1, 1, 1, 1, 2, 2, 2, 2),
  STUDENT = c(1, 1, 1, 2, 2, 3, 3, 3, 4),
  TRIMESTER = c(1, 2, 3, 2, 3, 3, 4, 5, 4)
)

# Target table: per-trimester new and cumulative student/teacher counts.
WANT <- data.frame(
  TRIMESTER = as.numeric(1:5),
  NEWSTUDENTS = c(1, 1, 1, 1, 0),
  TOTALSTUDENTS = c(1, 2, 3, 4, 4),
  NEWTEACHER = c(1, 0, 1, 0, 0),
  TOTALTEACHER = c(1, 1, 2, 2, 2)
)
# Flag the first occurrence of every student and teacher, count those flags
# per trimester, and accumulate the counts into running totals.
HAVE %>%
  # duplicated() marks the first *row* of each id, so the rows must be in
  # trimester order for "first row" to mean "earliest trimester".
  arrange(TRIMESTER) %>%
  mutate(
    NEWSTUDENTS = !duplicated(STUDENT),
    NEWTEACHER = !duplicated(TEACHER)
  ) %>%
  group_by(TRIMESTER) %>%
  summarise(
    NEWSTUDENTS = sum(NEWSTUDENTS),
    NEWTEACHER = sum(NEWTEACHER),
    .groups = "drop"
  ) %>%
  mutate(
    TOTALSTUDENTS = cumsum(NEWSTUDENTS),
    TOTALTEACHER = cumsum(NEWTEACHER)
  ) %>%
  # match the column order of WANT
  relocate(TOTALSTUDENTS, .before = NEWTEACHER)
#> # A tibble: 5 × 5
#> TRIMESTER NEWSTUDENTS TOTALSTUDENTS NEWTEACHER TOTALTEACHER
#> <dbl> <int> <int> <int> <int>
#> 1 1 1 1 1 1
#> 2 2 1 2 0 1
#> 3 3 1 3 1 2
#> 4 4 1 4 0 2
#> 5 5 0 4 0 2
Created on 2022-08-18 by the reprex package (v2.0.1)
I want to count the use of Tool A by year and keep zeros.
# Tool usage log: one row per use of a tool by a person (ID) in a year.
ID <- c(1, 1, 2, 2, 2, 3, 4, 5, 5, 5)
Tool <- c("A", "B", "A", "B", "A", "A", "B", "A", "A", "A")
Year <- c(2000, 2001, 2001, 2001, 2002, 2002, 2001, 2000, 2001, 2002)
df <- data.frame(ID = ID, Tool = Tool, Year = Year)
library(tidyverse)
# Per ID, count how often tool "A" was used, then tally the IDs by that count.
df %>%
  group_by(ID) %>%
  summarise(toolA = sum(Tool == "A")) %>%
  count(toolA)
# A tibble: 4 x 2
toolA n
<int> <int>
1 0 1
2 1 2
3 2 1
4 3 1
I want to add year columns, so that I can have a table as below
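| tool A | Count | 2000 | 2001 | 2002 |
|--------|-------|------|------|------|
| 0      | 1     | 0    | 0    | 0    |
| 1      | 2     | 1    | 0    | 1    |
| 2      | 1     | 0    | 1    | 1    |
| 3      | 1     | 1    | 1    | 1    |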
The numbers under the year columns are the number of uses in that year (not the number of people).
How would you do?
Here is another tidyverse method. Simply put, we pivot the data frame from long to wide and then summarize twice. The first summarization collapses each ID to a single row and gets rid of the non-"A" tools. The second summarization condenses the result into one row per toolA value and produces a count.
library(dplyr)
library(tidyr)
# Count tool "A" uses per year for every ID, then collapse the IDs into one
# row per total number of "A" uses.
df %>%
# value is 1 where Tool is "A" and 0 otherwise (unary + turns TRUE/FALSE into 1/0)
mutate(value = +(Tool == "A")) %>%
# one column per Year; cells with no matching row are filled with 0L.
# values_from is not given, so it is left to pivot_wider's default
# (presumably the `value` column created above -- confirm).
pivot_wider(names_from = Year, values_fill = 0L) %>%
# within each ID, sum every column except Tool
group_by(ID) %>%
summarize(across(-Tool, sum)) %>%
# group on the row total of all columns other than ID
group_by(toolA = rowSums(across(-ID))) %>%
# count = rows in the group; the remaining columns are summed
summarize(count = n(), across(-c(ID, count), sum))
Output
# A tibble: 4 x 5
toolA count `2000` `2001` `2002`
<dbl> <int> <int> <int> <int>
1 0 1 0 0 0
2 1 2 1 0 1
3 2 1 0 1 1
4 3 1 1 1 1
Maybe this is too convoluted and a better/easier solution exists.
library(dplyr)
library(tidyr)
# Number of IDs for every total count of tool "A" uses.
dataA <- df %>%
  group_by(ID) %>%
  summarise(toolA = sum(Tool == "A")) %>%
  count(toolA)

# Uses of tool "A" per ID and year, spread to one column per year, then
# collapsed to one row per total-use count and combined with dataA.
df %>%
  group_by(ID, Year) %>%
  summarise(toolA = sum(Tool == "A"), .groups = "drop") %>%
  pivot_wider(names_from = Year, values_from = toolA, values_fill = 0) %>%
  select(-ID) %>%
  # only the year columns remain, so the row sum is the total "A" uses
  mutate(toolA = rowSums(.)) %>%
  right_join(dataA, by = "toolA") %>%
  select(toolA, n, everything()) %>%
  arrange(toolA) %>%
  group_by(toolA, n) %>%
  # name the columns explicitly: calling across() without .cols is deprecated
  summarise(across(everything(), sum), .groups = "drop")
# toolA n `2000` `2001` `2002`
# <dbl> <int> <int> <int> <int>
#1 0 1 0 0 0
#2 1 2 1 0 1
#3 2 1 0 1 1
#4 3 1 1 1 1
I might try this approach with tidyverse. Create a list column with the Year when grouping by ID. After including the count n as you have done, use unnest_longer to recover the years. I added an extra column for situations where count is zero called "None". A final pivot_wider would put the data into wide form again.
library(tidyverse)
# For each ID, record how many times tool "A" was used and in which years,
# then count the uses per year for every total-use count.
df %>%
  group_by(ID) %>%
  summarise(
    toolA = sum(Tool == "A"),
    # Store the years as character so the "None" placeholder has the same
    # type as the real years. IDs that never used tool "A" get "None"
    # directly, so no empty list element has to survive unnesting and no
    # string has to be written into a numeric column afterwards.
    Years = list(
      if (any(Tool == "A")) as.character(Year[Tool == "A"]) else "None"
    )
  ) %>%
  add_count(toolA) %>%
  unnest_longer(Years) %>%
  mutate(value = 1) %>%
  pivot_wider(
    id_cols = c(toolA, n),
    names_from = Years,
    names_prefix = "Year_",
    values_from = value,
    # several IDs (or repeated uses) can fall into the same cell: add them up
    values_fn = sum,
    values_fill = 0
  ) %>%
  arrange(toolA)
Output
toolA n Year_2000 Year_2001 Year_2002 Year_None
<int> <int> <dbl> <dbl> <dbl> <dbl>
1 0 1 0 0 0 1
2 1 2 1 0 1 0
3 2 1 0 1 1 0
4 3 1 1 1 1 0
Using the following data:
# Example data: ids with one or more numeric values each.
df <- data.frame(
  id = c("A", "B", "C", "A", "B", "A"),
  value = as.numeric(1:6)
)
I want to pivot_wider this data so that the reshaping creates two different sets of columns:
One set where I create a bunch of binary columns that take the column names from the value columns (e.g. bin_1, bin_2 and so on) and that are coded as 0/1.
An additional set where I create as many necessary columns to store the values in a "categorical" way. Here, id "A" has three values, so I want to create three columns cat_1, cat_2, cat_3 and for IDs B and C I want to fill them up with NAs if there's no value.
Now, I know how to create these two things separately from each other and merge them afterwards via a left_join.
However, my question is: can it be done in one pipeline, where I do two subsequent pivot_widers? I tried, but it doesn't work (obviously because my way of copying the value column and then try to use one for the binary reshape and one for the categorial reshape is wrong).
Any ideas?
Code so far that works:
# cat_* columns: number each id's values in order of appearance and spread
# them, leaving NA where an id has fewer values than the longest one.
df1 <- df %>%
  group_by(id) %>%
  mutate(group_id = row_number()) %>%
  ungroup() %>%
  pivot_wider(
    names_from = group_id,
    names_prefix = "cat_",
    values_from = value
  )

# bin_* columns: one 0/1 indicator per distinct value, ordered by value.
df2 <- df %>%
  mutate(dummy = 1) %>%
  arrange(value) %>%
  pivot_wider(
    names_from = value,
    names_prefix = "bin_",
    values_from = dummy,
    values_fill = list(dummy = 0),
    values_fn = list(dummy = length)
  )

# Combine both sets of columns on id (the closing quote of "id" was missing).
df <- df1 %>%
  left_join(df2, by = "id")
Expected output:
# A tibble: 3 x 10
id cat_1 cat_2 cat_3 bin_1 bin_2 bin_3 bin_4 bin_5 bin_6
<chr> <dbl> <dbl> <dbl> <int> <int> <int> <int> <int> <int>
1 A 1 4 6 1 0 0 1 0 1
2 B 2 5 NA 0 1 0 0 1 0
3 C 3 NA NA 0 0 1 0 0 0
With the addition of purrr, you could do:
# Build one 0/1 indicator table per integer between the smallest and largest
# value, merge those tables, then spread each id's values into cat_* columns.
# reduce(range(df$value), `:`) expands c(min, max) into the sequence min:max.
map(.x = reduce(range(df$value), `:`),
# For the current number .x: within each id, bin_<.x> is 1 when .x is one of
# that id's values and 0 otherwise.
~ df %>%
group_by(id) %>%
mutate(!!paste0("bin_", .x) := as.numeric(.x %in% value))) %>%
# No `by` is supplied, so the join keys are left to full_join's default
# (presumably every shared column -- confirm).
reduce(full_join) %>%
# Number the rows to name the cat_* columns. NOTE(review): the numbering
# restarts per id only if the grouping by id survives the joins -- confirm.
mutate(cats = paste0("cat_", row_number())) %>%
pivot_wider(names_from = "cats",
values_from = "value")
id bin_1 bin_2 bin_3 bin_4 bin_5 bin_6 cat_1 cat_2 cat_3
<fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 1 0 0 1 0 1 1 4 6
2 B 0 1 0 0 1 0 2 5 NA
3 C 0 0 1 0 0 0 3 NA NA
In base you can try:
# Values of every id, in order of appearance.
tt <- split(df$value, df$id)
n_cat <- max(lengths(tt))

# cat_* part: one row per id, padded with NA up to the longest id.
# rbind() over a list keeps a matrix even when every id has a single value,
# where t(sapply(...)) would collapse to one row.
cat_part <- do.call(rbind, lapply(tt, "[", seq_len(n_cat)))
colnames(cat_part) <- paste0("cat_", seq_len(n_cat))

# bin_* part: 1 when the id has that value at least once. The columns are
# labelled with the actual distinct values rather than with row numbers,
# which only coincide when value happens to be 1..nrow(df).
bin_part <- unclass(+(table(df$id, df$value) > 0))
colnames(bin_part) <- paste0("bin_", colnames(bin_part))

x <- cbind(cat_part, bin_part)
x
# cat_1 cat_2 cat_3 bin_1 bin_2 bin_3 bin_4 bin_5 bin_6
#A 1 4 6 1 0 0 1 0 1
#B 2 5 NA 0 1 0 0 1 0
#C 3 NA NA 0 0 1 0 0 0
Slightly modifying your approach by reducing df2 code and including it all in one pipe by taking advantage of the list and . trick which allows you to work on two versions of df in the same call.
It's not much of an improvement on what you have done, but it is now all in one call. I can't think of a way to do it without a merge/join.
library(tidyverse)
# Build both reshaped versions of df inside one list() call and join them.
# Because `.` only appears in nested calls, magrittr also inserts df itself
# as the first list element; it is dropped again with .[2:3].
df %>%
  list(
    # bin_* columns: spread value by its own values, then recode to 0/1
    pivot_wider(
      .,
      id_cols = id,
      names_from = value,
      names_prefix = "bin_",
      values_from = value
    ) %>%
      # across(where(...)) replaces the superseded mutate_if()
      mutate(across(where(is.numeric), ~ +(!is.na(.x)))),
    # cat_* columns: number each id's values and spread them
    group_by(., id) %>%
      mutate(group_id = row_number()) %>%
      ungroup() %>%
      pivot_wider(
        names_from = group_id,
        names_prefix = "cat_",
        values_from = value
      )
  ) %>%
  .[2:3] %>%
  # join on id explicitly instead of relying on the guessed common columns
  reduce(\(x, y) left_join(x, y, by = "id"))
# id bin_1 bin_2 bin_3 bin_4 bin_5 bin_6 cat_1 cat_2 cat_3
# <chr> <int> <int> <int> <int> <int> <int> <dbl> <dbl> <dbl>
# 1 A 1 0 0 1 0 1 1 4 6
# 2 B 0 1 0 0 1 0 2 5 NA
# 3 C 0 0 1 0 0 0 3 NA NA
You can even combine both of your pipelines into one, without creating any intermediate object:
# One pipeline: build the cat_* columns, then join the bin_* indicator table,
# which is constructed inline from the same df.
df %>%
  group_by(id) %>%
  mutate(group_id = row_number()) %>%
  pivot_wider(
    names_prefix = "cat_",
    names_from = group_id,
    values_from = value
  ) %>%
  left_join(
    df %>%
      arrange(value) %>%
      mutate(dummy = 1) %>%
      pivot_wider(
        names_prefix = "bin_",
        names_from = value,
        values_from = dummy,
        values_fn = list(dummy = length),
        values_fill = list(dummy = 0)
      ),
    by = "id"
  )
# A tibble: 3 x 10
# Groups: id [3]
id cat_1 cat_2 cat_3 bin_1 bin_2 bin_3 bin_4 bin_5 bin_6
<chr> <dbl> <dbl> <dbl> <int> <int> <int> <int> <int> <int>
1 A 1 4 6 1 0 0 1 0 1
2 B 2 5 NA 0 1 0 0 1 0
3 C 3 NA NA 0 0 1 0 0 0
We can use the following data frame as an example:
# Example data: each row is one record of a procedure performed on a case
# at a location; identical records can occur more than once.
Case <- rep(c("Siddhartha", "Paul", "Hannah", "Herbert"), times = c(3, 3, 1, 1))
Procedure <- c("1", "1", "2", "3", "3", "4", "1", "1")
Location <- c("a", "a", "b", "a", "a", "b", "c", "a")
# the outer parentheses print the data frame after assigning it
(df <- data.frame(Case, Procedure, Location))
Case Procedure Location
1 Siddhartha 1 a
2 Siddhartha 1 a
3 Siddhartha 2 b
4 Paul 3 a
5 Paul 3 a
6 Paul 4 b
7 Hannah 1 c
8 Herbert 1 a
Now i do the following:
# Count rows per Location/Procedure pair and spread the locations into
# columns, with 0 for pairs that never occur.
df %>%
  count(Location, Procedure) %>%
  pivot_wider(
    names_from = Location,
    values_from = n,
    values_fill = list(n = 0)
  )
which gives me:
# A tibble: 4 x 4
Procedure a b c
<fct> <int> <int> <int>
1 1 3 0 1
2 3 2 0 0
3 2 0 1 0
4 4 0 1 0
This is not exactly, what i want though. What i want is the following data frame:
# A tibble: 4 x 4
Procedure a b c
<fct> <int> <int> <int>
1 1 2 0 1
2 3 1 0 0
3 2 0 1 0
4 4 0 1 0
Notice the difference in Procedure 1 and 3.
So what i would like is a function, that counts the number of DISTINCT cases for each Procedures AND each location. Also that function should be working on varying data frames, where there are different (unknown) cases and procedures.
For the original data frame
# Drop fully identical rows first, then count per Location/Procedure pair
# and spread the locations into columns.
df %>%
  distinct() %>%
  count(Location, Procedure) %>%
  pivot_wider(
    names_from = Location,
    values_from = n,
    values_fill = list(n = 0)
  )
does not work, since it is ignoring the "distinct". What works (also for the original data frame!) is the following:
# Number of distinct cases for every Procedure/Location pair (long format).
df %>%
  group_by(Procedure, Location) %>%
  summarise(
    Anzahl = n_distinct(Case)
  )
That gives me the following though:
# A tibble: 5 x 3
# Groups: Procedure [4]
Procedure Location Anzahl
<fct> <fct> <int>
1 1 a 2
2 1 c 1
3 2 a 1
4 3 b 1
5 4 b 1
But how to implement the "pivot_wider" function, so it is also sorted by location? If i try to add it, i get the following error:
"Error: This tidyselect interface doesn't support predicates yet.
i Contact the package author and suggest using eval_select()."
Also it is very confusing to me, why the solution of Ronak works for the example data frame but not for the original. I can't spot important differences in these two data frames.
Regards
You can do it with a single call to pivot_wider and take advantage of the argument values_fn, which applies a function to the values
# Spread Location into columns and let pivot_wider itself reduce the cases in
# every cell to the number of distinct ones; empty cells become 0.
df %>%
  pivot_wider(
    names_from = Location,
    values_from = Case,
    values_fill = list(Case = 0),
    values_fn = list(Case = n_distinct)
  )
which gives,
# A tibble: 4 x 4
Procedure a b c
<fct> <int> <int> <int>
1 1 2 0 1
2 2 0 1 0
3 3 1 0 0
4 4 0 1 0
A simple fix is to add distinct or unique before counting
library(dplyr)
library(tidyr)
# Remove duplicated rows so every case is counted once per pair, count the
# remaining rows per Location/Procedure, and reshape to wide form.
df %>%
  distinct() %>%
  count(Location, Procedure) %>%
  pivot_wider(
    names_from = Location,
    values_fill = list(n = 0),
    values_from = n
  )
# A tibble: 4 x 4
# Procedure a b c
# <chr> <int> <int> <int>
#1 1 2 0 1
#2 3 1 0 0
#3 2 0 1 0
#4 4 0 1 0
For OP's data they need :
# Count the distinct cases per Procedure/Location pair and spread the
# locations into columns, with 0 for pairs that never occur.
df %>%
  group_by(Procedure, Location) %>%
  # drop the grouping explicitly: this avoids the regrouping message and
  # hands an ungrouped table to pivot_wider
  summarise(Anzahl = n_distinct(Case), .groups = "drop") %>%
  pivot_wider(
    names_from = Location,
    values_from = Anzahl,
    values_fill = list(Anzahl = 0)
  )
Hi all, I have two datasets, shown below. dataset1 is derived from dataset2: it holds the number of user entries per app in dataset2. Can we build the third dataset (the expected output) from these two?
dataset1
Apps # user Enteries
A 3
B 4
C 6
dataset2
Apps Users
A X
A Y
A Z
B Y
B Y
B Z
B A
C X
C X
C X
C X
C X
C X
Expected output
Apps Entries X Y Z A
A 3 1 1 1
B 4 2 1 1
C 6 6
We can first count first for Apps and Users, get the data in wide format and join with the table for count of Apps.
library(dplyr)
# Count entries per app and user, spread the users into columns, and attach
# the total number of entries per app as column n.
df %>%
  count(Apps, Users) %>%
  tidyr::pivot_wider(
    names_from = Users,
    values_from = n,
    values_fill = list(n = 0)
  ) %>%
  left_join(count(df, Apps), by = 'Apps')
# Apps X Y Z A n
# <chr> <int> <int> <int> <int> <int>
#1 A 1 1 1 0 3
#2 B 0 2 1 1 4
#3 C 6 0 0 0 6
If showing 0 is no problem and a different column order is acceptable, you can use table and rowSums to produce the expected output.
# Cross-tabulate the columns of dataset2; per the printed result the rows are
# apps and the columns are users.
x <- table(dataset2)
# Prepend each row's total as an "Entries" column.
cbind(Entries=rowSums(x), x)
# Entries A X Y Z
#A 3 0 1 1 1
#B 4 1 0 2 1
#C 6 0 6 0 0
A solution where you need not have to calculate Total separately and do joins...
This solution uses purrr::pmap and dplyr::mutate for dynamically calculating Total.
library(tidyverse) # dplyr, tidyr, purrr
# Count entries per app and user, spread the users into columns, and add a
# row-wise Total computed from the count columns themselves.
df %>%
  count(Apps, Users) %>%
  pivot_wider(
    id_cols = Apps,
    names_from = Users,
    values_from = n,
    values_fill = list(n = 0)
  ) %>%
  # across(where(is.numeric)) replaces the superseded select_if(); pmap_int()
  # then sums the numeric columns of each row and keeps the integer type
  mutate(Total = pmap_int(across(where(is.numeric)), sum))
which produces the output you need
# A tibble: 3 x 6
Apps X Y Z A Total
<chr> <int> <int> <int> <int> <int>
1 A 1 1 1 0 3
2 B 0 2 1 1 4
3 C 6 0 0 0 6