I have a data frame that looks like the following, with year and ID identifiers, as well as many categorical variables (values denoted with capital letters below):
Year ID Var1 Var2 Var3 ...
1996 1 A A B
1996 1 A A C
1996 2 B A D
1998 2 C C A
2000 3 D D D
My goal is to reshape this into wide format by ID, but also giving counts for ID, year, and value. So, for example:
ID Var1_1996_A Var1_1996_B Var1_1996_C Var1_1996_D ...
1 2 0 0 0
2 0 1 0 0
3 0 0 0 0
And so on, for each variable. I'm relatively new to R and couldn't quite find a similar operation from existing posts (apologies if this is duplicate). Would anyone know what the best way to accomplish this would be? I have tried using tidyr::pivot_wider, but can only figure out how to append the years, but not create separate categories for each variable response
df <- df %>%
pivot_wider(names_from = year,
values_from (Var1, Var2, Var3, Var4, Var5)
If anyone could offer some insight that would be greatly appreciated.
Get the data in long format first :
library(tidyr)
df %>%
pivot_longer(cols = starts_with('Var')) %>%
pivot_wider(names_from = c(name, Year, value), values_from = name,
values_fn = length, values_fill = 0)
# ID Var1_1996_A Var2_1996_A Var3_1996_B Var3_1996_C Var1_1996_B Var3_1996_D
# <int> <int> <int> <int> <int> <int> <int>
#1 1 2 2 1 1 0 0
#2 2 0 1 0 0 1 1
#3 3 0 0 0 0 0 0
# … with 6 more variables: Var1_1998_C <int>, Var2_1998_C <int>,
# Var3_1998_A <int>, Var1_2000_D <int>, Var2_2000_D <int>, Var3_2000_D <int>
data
df <- structure(list(Year = c(1996L, 1996L, 1996L, 1998L, 2000L), ID = c(1L,
1L, 2L, 2L, 3L), Var1 = c("A", "A", "B", "C", "D"), Var2 = c("A",
"A", "A", "C", "D"), Var3 = c("B", "C", "D", "A", "D")),
class = "data.frame", row.names = c(NA, -5L))
If you will be using base R:
xtabs(~ID+v, transform(cbind(df[1:2], stack(df, -(1:2))), v = paste(ind, Year, values, sep="_")))
v
ID Var1_1996_A Var1_1996_B Var1_1998_C Var1_2000_D Var2_1996_A Var2_1998_C Var2_2000_D Var3_1996_B Var3_1996_C Var3_1996_D Var3_1998_A Var3_2000_D
1 2 0 0 0 2 0 0 1 1 0 0 0
2 0 1 1 0 1 1 0 0 0 1 1 0
3 0 0 0 1 0 0 1 0 0 0 0 1
Of course to transform it to data.frame you could use: as.data.frame.matrix(...)
Related
I am searching a way for obtaining a large table from a table with a list of species.
Here I give you an example:
I have something like this
data.frame(survey_id = c("ID_1", "ID_2", "ID_3", "ID_4", "ID_5"),
list_1 = c("A", "A", "A", "B", "A"),
list_2 = c("B", "D", "E", "E", "F"),
list_3 = c("C", "", "", "F", ""))
and I want to obtain this
data.frame(survey_id = c("ID_1", "ID_2", "ID_3", "ID_4", "ID_5"),
A = c(1,1,1,0,1),
B = c(1,0,0,1,0),
C = c(1,0,0,0,0),
D = c(0,1,0,0,0),
E = c(0,0,1,1,0),
F = c(0,0,0,1,1))
Any suggestion using pivot_wider function?
Thanks for your help.
A slightly round-a-bout way: pivot_longer first, so as you can count by (survey_id, value), and then pivot_wider.
library(dplyr)
library(tidyr)
df1 %>%
pivot_longer(-Survey_ID) %>%
count(Survey_ID, value) %>%
filter(value != "") %>%
pivot_wider(names_from = "value",
values_from = "n") %>%
replace(is.na(.), 0)
Result:
# A tibble: 5 × 7
Survey_ID A B C D E F
<chr> <int> <int> <int> <int> <int> <int>
1 ID_1 1 1 1 0 0 0
2 ID_2 1 0 0 1 0 0
3 ID_3 1 0 0 0 1 0
4 ID_4 0 1 0 0 1 1
5 ID_5 1 0 0 0 0 1
Data df1:
df1 <- data.frame(survey_id = c("ID_1", "ID_2", "ID_3", "ID_4", "ID_5"),
list_1 = c("A", "A", "A", "B", "A"),
list_2 = c("B", "D", "E", "E", "F"),
list_3 = c("C", "", "", "F", ""))
df %>%
pivot_longer(-survay_id) %>% # first go longer
mutate(n=1) %>% # add a value to each record
select(-name) %>% # drop the name column
filter(value != "") %>% # remove empty cells caused by pivot_longer
# spread(survay_id, n, fill = 0) # a spread solution
pivot_wider(names_from=survay_id, values_from=n, values_fill=0) # a pivot_wider solution
value ID_1 ID_2 ID_3 ID_4 ID_5
1 A 1 1 1 0 1
2 B 1 0 0 1 0
3 C 1 0 0 0 0
4 D 0 1 0 0 0
5 E 0 0 1 1 0
6 F 0 0 0 1 1
I am working with a data frame like the following, where Color and `Player are factor variables:
I want to create indicator variables for each value of the color column. However, I want those indicator variables to represent whether the color is present for other players in the same game (not whether it's present for that player). So I want the above table to turn into:
I imagine the code will have group_by(Game) %>%, but I'm lost beyond that.
Data:
structure(list(Game = c("A", "A", "A", "B", "B", "B"), Player = c(1L,
2L, 3L, 1L, 2L, 3L), Color = c("Red", "Green", "Blue", "Green",
"Purple", "Yellow"), Blue = c(1L, 1L, 0L, 0L, 0L, 0L), Green = c(1L,
0L, 1L, 0L, 1L, 1L), Yellow = c(0L, 0L, 0L, 1L, 1L, 0L), Red = c(0L,
1L, 1L, 0L, 0L, 0L), Purple = c(0L, 0L, 0L, 1L, 0L, 1L)), class = "data.frame", row.names = c(NA,
-6L))
Perhaps this helps - split the 'Color' column by 'Game', create a binary matrix by comparing the elements of 'Color' (!=), convert to tibble, row bind (_dfr) and bind the dataset with the original dataset (bind_cols)
library(purrr)
library(dplyr)
library(tidyr)
map_dfr(split(df1$Color, df1$Game), ~ {
m1 <- +(outer(.x, .x, FUN = `!=`))
colnames(m1) <- .x
as_tibble(m1)}) %>%
mutate(across(everything(), replace_na, 0)) %>%
bind_cols(df1, .)
-output
Game Player Color Red Green Blue Purple Yellow
1 A 1 Red 0 1 1 0 0
2 A 2 Green 1 0 1 0 0
3 A 3 Blue 1 1 0 0 0
4 B 1 Green 0 0 0 1 1
5 B 2 Purple 0 1 0 0 1
6 B 3 Yellow 0 1 0 1 0
Or another option is with dummy_cols and then modify the output
library(fastDummies)
library(stringr)
dummy_cols(df1, 'Color') %>%
rename_with(~ str_remove(.x, "Color_")) %>%
group_by(Game) %>%
mutate(across(Blue:Yellow, ~ +(Color != cur_column() & any(.x)))) %>%
ungroup
-output
# A tibble: 6 × 8
Game Player Color Blue Green Purple Red Yellow
<chr> <int> <chr> <int> <int> <int> <int> <int>
1 A 1 Red 1 1 0 0 0
2 A 2 Green 1 0 0 1 0
3 A 3 Blue 0 1 0 1 0
4 B 1 Green 0 0 1 0 1
5 B 2 Purple 0 1 0 0 1
6 B 3 Yellow 0 1 1 0 0
data
df1 <- structure(list(Game = c("A", "A", "A", "B", "B", "B"), Player = c(1L,
2L, 3L, 1L, 2L, 3L), Color = c("Red", "Green", "Blue", "Green",
"Purple", "Yellow")), row.names = c(NA, -6L), class = "data.frame")
Here is a way how we could do it:
First we use model.matrix() fucntion multiply it by 1 and substract 1 within a wrap of abs().
Then we get almost the desired output, the only thing that is left is the get zeros in case if non of the colors is present. We do this with a mutate across...:
library(dplyr)
df %>%
cbind(abs((model.matrix(~ Color + 0, .) == 1)*1-1)) %>%
group_by(Game) %>%
mutate(across(-c(Player, Color), ~case_when(sum(.)==3 ~0,
TRUE ~ .)))
Game Player Color ColorBlue ColorGreen ColorPurple ColorRed ColorYellow
<chr> <int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 1 Red 1 1 0 0 0
2 A 2 Green 1 0 0 1 0
3 A 3 Blue 0 1 0 1 0
4 B 1 Green 0 0 1 0 1
5 B 2 Purple 0 1 0 0 1
6 B 3 Yellow 0 1 1 0 0
>
Here is another approach using full_join and pivot_wider from tidyverse. I believe this also gives the same result. The filter is included to avoid same color indicators as 1.
library(tidyverse)
full_join(df, df, by = "Game", suffix = c("", "_Two")) %>%
filter(Color != Color_Two) %>%
mutate(val = 1) %>%
pivot_wider(id_cols = c(Game, Player, Color),
names_from = Color_Two,
values_from = val,
values_fill = 0)
Output
Game Player Color Green Blue Red Purple Yellow
<chr> <int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 1 Red 1 1 0 0 0
2 A 2 Green 0 1 1 0 0
3 A 3 Blue 1 0 1 0 0
4 B 1 Green 0 0 0 1 1
5 B 2 Purple 1 0 0 0 1
6 B 3 Yellow 1 0 0 1 0
Using base R, you can write a small function and evaluate using tapply:
fun <- function(x) {
nms <- levels(x)
tab <- tcrossprod(table(x))
dimnames(tab) <- list(nms, nms)
tab[x, ]
}
data.frame(df1, do.call(rbind, with(df1, tapply(factor(Color), Game, fun))), row.names = NULL)
Game Player Color Blue Green Purple Red Yellow
1 A 1 Red 1 1 0 1 0
2 A 2 Green 1 1 0 1 0
3 A 3 Blue 1 1 0 1 0
4 B 1 Green 0 1 1 0 1
5 B 2 Purple 0 1 1 0 1
6 B 3 Yellow 0 1 1 0 1
Note that out of all the options given, This is by far the fastest, yet only using base R:
Here is the benchmark:
library(microbenchmark)
microbenchmark(Tarjae(df1), akrun(df1), ben(df1), onyambu(df1),
paulS(df1), unit = 'relative')
Unit: relative
expr min lq mean median uq max neval
Tarjae(df1) 18.775201 18.11495 13.533556 17.171485 15.746554 1.105045 100
akrun(df1) 9.755032 8.83519 7.137294 8.756033 8.241494 1.455906 100
ben(df1) 21.084371 18.57861 14.699821 17.950987 16.486863 3.124906 100
onyambu(df1) 1.000000 1.00000 1.000000 1.000000 1.000000 1.000000 100
paulS(df1) 33.108208 31.27110 24.918541 30.266024 27.420363 3.156215 100
For larger dataframes, some of the given code breaks down, while those that dont break down are still slow to the base R approach:
df2<- transform(data.frame(Game = sample(LETTERS, 2000, TRUE), Color = sample(colors(), 2000, TRUE)), Player = ave(Game, Game, FUN=seq_along))
microbenchmark(Tarjae(df2), akrun(df2), onyambu(df2), paulS(df2))
Unit: milliseconds
expr min lq mean median uq max neval
Tarjae(df2) 2147.67826 2234.5575 2460.1924 2423.20994 2653.1737 3049.9455 100
akrun(df2) 108.25249 121.3167 144.6715 130.48052 152.9518 404.7286 100
onyambu(df2) 67.19992 80.3653 111.2821 91.05784 118.4877 331.6724 100
paulS(df2) 183.88836 200.6224 231.0155 215.18942 237.5717 467.1721 100
Code for the benchmark:
Tarjae <- function(df){
df %>%
cbind(abs((model.matrix(~ Color + 0, .) == 1)*1-1)) %>%
group_by(Game) %>%
mutate(across(-c(Player, Color), ~case_when(sum(.)==3 ~0,
TRUE ~ .)))
}
akrun <- function(df1){
map_dfr(split(df1$Color, df1$Game), ~ {
m1 <- +(outer(.x, .x, FUN = `!=`))
colnames(m1) <- .x
as_tibble(m1)}) %>%
mutate(across(everything(), replace_na, 0)) %>%
bind_cols(df1, .)
}
ben <- function(df){
full_join(df, df, by = "Game", suffix = c("", "_Two")) %>%
filter(Color != Color_Two) %>%
mutate(val = 1) %>%
pivot_wider(id_cols = c(Game, Player, Color),
names_from = Color_Two,
values_from = val,
values_fill = 0)
}
onyambu <- function(df1){
fun <- function(x) {
nms <- levels(x)
tab <- tcrossprod(table(x))
dimnames(tab) <- list(nms, nms)
tab[x, ]
}
data.frame(df1, do.call(rbind, with(df1, tapply(factor(Color), Game, fun))), row.names = NULL)
}
paulS <- function(df){
df %>%
group_by(Game) %>%
mutate(aux = list(Color)) %>%
unnest(aux) %>%
filter(aux != Color) %>%
ungroup %>%
pivot_wider(Game:Color, names_from = aux, values_from = aux, values_fill = 0,
values_fn = length)
}
Another possible solution:
library(tidyverse)
df %>%
group_by(Game) %>%
mutate(aux = list(Color)) %>%
unnest(aux) %>%
filter(aux != Color) %>%
ungroup %>%
pivot_wider(Game:Color, names_from = aux, values_from = aux, values_fill = 0,
values_fn = length)
#> # A tibble: 6 × 8
#> Game Player Color Green Blue Red Purple Yellow
#> <chr> <int> <chr> <int> <int> <int> <int> <int>
#> 1 A 1 Red 1 1 0 0 0
#> 2 A 2 Green 0 1 1 0 0
#> 3 A 3 Blue 1 0 1 0 0
#> 4 B 1 Green 0 0 0 1 1
#> 5 B 2 Purple 1 0 0 0 1
#> 6 B 3 Yellow 1 0 0 1 0
I have an R data frame that has an ID column with multiple records for an ID. When the flag is set to 1 for an ID, I want to create a column new timeline that starts from 1 and increases sequentially in increments of 6 (1,6,12...). How can I achieve this in R using dplyr ?
Below is a sample data frame
ID
Timepoint
Flag
A
0
0
A
6
0
A
12
0
A
18
1
A
24
0
A
30
0
A
36
0
Expected Dataframe
ID
Timepoint
Flag
New_Timepoint
A
0
0
A
6
0
A
12
0
A
18
1
1
A
24
0
6
A
30
0
12
A
36
0
18
An option is to group by 'ID', create the lag of the 'Timepoint' with n specified as the position of 'Flag' where the value is 1 (-1)
library(dplyr)
df1 %>%
group_by(ID) %>%
mutate(New_Timepoint = dplyr::lag(replace(Timepoint, !Timepoint, 1),
n = which(Flag == 1)-1)) %>%
ungroup
-output
# A tibble: 7 x 4
# ID Timepoint Flag New_Timepoint
# <chr> <int> <int> <dbl>
#1 A 0 0 NA
#2 A 6 0 NA
#3 A 12 0 NA
#4 A 18 1 1
#5 A 24 0 6
#6 A 30 0 12
#7 A 36 0 18
Or use a double cumsum to create the index
df1 %>%
group_by(ID) %>%
mutate(New_Timepoint = Timepoint[na_if(cumsum(cumsum(Flag)), 0)]) %>%
ungroup
data
df1 <- structure(list(ID = c("A", "A", "A", "A", "A", "A", "A"),
Timepoint = c(0L,
6L, 12L, 18L, 24L, 30L, 36L),
Flag = c(0L, 0L, 0L, 1L, 0L, 0L,
0L)), class = "data.frame", row.names = c(NA, -7L))
Another dplyr option
df %>%
group_by(ID) %>%
mutate(New_Timepoint = pmax(1, Timepoint - c(NA, Timepoint[Flag == 1])[cumsum(Flag) + 1])) %>%
ungroup()
gives
ID Timepoint Flag New_Timepoint
<chr> <int> <int> <dbl>
1 A 0 0 NA
2 A 6 0 NA
3 A 12 0 NA
4 A 18 1 1
5 A 24 0 6
6 A 30 0 12
7 A 36 0 18
Solution
I went with the solutions provided by #MauritsEvers and #akrun below.
Question
For a data frame, I want to keep only 1 column of each set of duplicate columns. In addition, the column that is kept takes on a name that is a concatenation of all column names in the set of duplicate columns. There are multiple sets of duplicate columns in the data frame. The data frame contains tens of thousands of columns, so using a for loop might take too much time.
I have tried a combination of using the duplicate(), summary(), aggregate(), lapply(), apply(), and using for loops.
Input data frame (df_in):
0 1 2 3 4 5 6 7
0 1 0 0 1 0 1 1
0 1 0 1 1 0 0 0
1 0 1 0 0 1 1 0
Output data frame (df_out):
0-2-5 1-4 3 6 7
0 1 0 1 1
0 1 1 0 0
1 0 0 1 0
Here is an option with tidyverse. We gather the data into 'long' format, conver the 'value' into a string, grouped by 'value', paste the 'key' column together, separate the rows of 'value' and then spread the 'value' column to get the expected output
library(tidyverse)
gather(df_in) %>%
group_by(key) %>%
summarise(value = toString(value)) %>%
group_by(value) %>%
summarise(key = paste(key, collapse="-")) %>%
separate_rows(value) %>%
group_by(key) %>%
mutate(n = row_number()) %>%
spread(key, value) %>%
select(-n)
# A tibble: 3 x 5
# `0-2-5` `1-4` `3` `6` `7`
# <chr> <chr> <chr> <chr> <chr>
#1 0 1 0 1 1
#2 0 1 1 0 0
#3 1 0 0 1 0
Or another option with tidyverse would be
t(df_in) %>%
as.data.frame %>%
mutate(grp = group_indices(., V1, V2, V3)) %>%
mutate(rn = row_number() - 1) %>%
group_split(grp, keep = FALSE) %>%
map_dfc(~ .x %>%
mutate(rn = str_c(rn, collapse="-")) %>%
slice(1) %>%
gather(key, val, -rn) %>%
rename(!! .$rn[1] := val) %>%
select(ncol(.)))
# A tibble: 3 x 5
# `0-2-5` `3` `7` `6` `1-4`
# <int> <int> <int> <int> <int>
#1 0 0 1 1 1
#2 0 1 0 0 1
#3 1 0 0 1 0
Or we can also do this with data.table methods
library(data.table)
dcast(melt(as.data.table(t(df_in))[, grp := .GRP, .(V1, V2, V3)][,
c(.SD[1], cn = paste(.I-1, collapse="-")) , .(grp)],
id.var = c('cn', 'grp')), variable ~ cn, value.var = 'value')[,
variable := NULL][]
# 0-2-5 1-4 3 6 7
#1: 0 1 0 1 1
#2: 0 1 1 0 0
#3: 1 0 0 1 0
data
df_in <- structure(list(`0` = c(0L, 0L, 1L), `1` = c(1L, 1L, 0L), `2` = c(0L,
0L, 1L), `3` = c(0L, 1L, 0L), `4` = c(1L, 1L, 0L), `5` = c(0L,
0L, 1L), `6` = c(1L, 0L, 1L), `7` = c(1L, 0L, 0L)),
class = "data.frame", row.names = c(NA, -3L))
You can do the following in base R
Get indices of identical columns
idx <- split(seq_along(names(df)), apply(df, 2, paste, collapse = "_"))
Sort indices from low to high
idx <- idx[order(sapply(idx, function(x) x[1]))]
Names of idx as concatentation of column names
names(idx) <- sapply(idx, function(x) paste(names(df)[x], collapse = "_"))
Create final matrix
sapply(idx, function(x) df[, x[1]])
# col0_col2_col5 col1_col4 col3_col6 col7
#[1,] 0 1 1 1
#[2,] 0 1 0 0
#[3,] 1 0 1 0
Note that the resulting object is a matrix, so if you need a data.frame simply cast as.data.frame.
Sample data
I've changed your sample data slightly to not have numbers as column names.
df <- read.table(text =
"col0 col1 col2 col3 col4 col5 col6 col7
0 1 0 1 1 0 1 1
0 1 0 0 1 0 0 0
1 0 1 1 0 1 1 0", header = T)
This question already has answers here:
Faster ways to calculate frequencies and cast from long to wide
(4 answers)
Closed 4 years ago.
I need help with converting my long data of dimension 1558810 x 84 to a wide data of 1558810 x 4784
Let me explain in detail how and why. My raw data is as follows -
The data has three main columns -
id empId dept
1 a social
2 a Hist
3 a math
4 b comp
5 a social
6 b comp
7 c math
8 c Hist
9 b math
10 a comp
id is the unique key that tells which employee went to which department in a university on a day. I need this to be transformed as below.
id empId dept social Hist math comp
1 a social 1 0 0 0
2 a Hist 0 1 0 0
3 a math 0 0 1 0
4 b comp 0 0 0 1
5 a social 1 0 0 0
6 b comp 0 0 0 1
7 c math 0 0 1 0
8 c Hist 0 1 0 0
9 b math 0 0 1 0
10 a comp 0 0 0 1
I have two datasets one with 49k rows and one with 1.55million rows. For the smaller dataset which had 1100 unique department values, I used dcast in the reshape2 package to get the desired dataset(thus, transformed data would have 3+1100 columns and 49k rows). But when I use the same function on my larger dataset that has 4700 unique department values, my R crashes because of Memory issue. I tried varous other alternative like xtabs, reshape etc. but every time it failed with Memory error.
I have now resorted to a crude FOR loop for this purpose -
columns <- unique(ds$dept)
for(i in 1:length(unique(ds$dept)))
{
ds[,columns[i]] <- ifelse(ds$dept==columns[i],1,0)
}
But this is extremely slow and the code has been running for 10 hrs now. Is there any workaround for this, that I am missing?
ANy suggestions would be of great help!
You could try
df$dept <- factor(df$dept, levels=unique(df$dept))
res <- cbind(df, model.matrix(~ 0+dept, df))
colnames(res) <- gsub("dept(?=[A-Za-z])", "", colnames(res), perl=TRUE)
res
# id empId dept social Hist math comp
#1 1 a social 1 0 0 0
#2 2 a Hist 0 1 0 0
#3 3 a math 0 0 1 0
#4 4 b comp 0 0 0 1
#5 5 a social 1 0 0 0
#6 6 b comp 0 0 0 1
#7 7 c math 0 0 1 0
#8 8 c Hist 0 1 0 0
#9 9 b math 0 0 1 0
#10 10 a comp 0 0 0 1
Or you could try
cbind(df, as.data.frame.matrix(table(df[,c(1,3)])))
Or using data.table
library(data.table)
setDT(df)
dcast.data.table(df, id + empId + dept ~ dept, fun=length)
Or using qdap
library(qdap)
cbind(df, as.wfm(with(df, mtabulate(setNames(dept, id)))))
data
df <- structure(list(id = 1:10, empId = c("a", "a", "a", "b", "a",
"b", "c", "c", "b", "a"), dept = c("social", "Hist", "math",
"comp", "social", "comp", "math", "Hist", "math", "comp")), .Names = c("id",
"empId", "dept"), class = "data.frame", row.names = c(NA, -10L))
Try:
> cbind(dd[1:3], dcast(dd, dd$id~dd$dept, length)[-1])
Using dept as value column: use value.var to override.
id empId dept comp Hist math social
1 1 a social 0 0 0 1
2 2 a Hist 0 1 0 0
3 3 a math 0 0 1 0
4 4 b comp 1 0 0 0
5 5 a social 0 0 0 1
6 6 b comp 1 0 0 0
7 7 c math 0 0 1 0
8 8 c Hist 0 1 0 0
9 9 b math 0 0 1 0
10 10 a comp 1 0 0 0
data:
> dput(dd)
structure(list(id = 1:10, empId = structure(c(1L, 1L, 1L, 2L,
1L, 2L, 3L, 3L, 2L, 1L), .Label = c("a", "b", "c"), class = "factor"),
dept = structure(c(4L, 2L, 3L, 1L, 4L, 1L, 3L, 2L, 3L, 1L
), .Label = c("comp", "Hist", "math", "social"), class = "factor")), .Names = c("id",
"empId", "dept"), class = "data.frame", row.names = c("1", "2",
"3", "4", "5", "6", "7", "8", "9", "10"))