Show duplicate value on a separate row in pivot wider - r

I have seen tons of answers but could not get it right. Basically, I want to show duplicates on separate rows while performing pivot_wider. I created a unique variable as well, but the result was either a nested row or a separate row for each column.
df <- structure(list(identifier = c("e1", "e1", "e2", "e2", "e1", "e1",
"e1", "e1", "e2", "e2"), label = c("Monaco", "became", "the",
"first", "the", "the", "Monaco", "became", "the", "first"), id = c("CP1",
"CP1", "CP1", "CP1", "CP1", "CP1", "CP2", "CP2", "CP2", "CP2"
), value = c(0L, 0L, 1L, 0L, 10L, 1L, 1L, 0L, 1L, 0L)), class = "data.frame", row.names = c(NA,
-10L))
library(tidyverse)
#My try
df %>%
group_by(identifier,label) %>%
mutate(rn=row_number()) %>%
pivot_wider( names_from="id",
values_from="value")

library(data.table)
library(tidyr)
unnest( dcast(setDT(df), identifier + label ~ id, value.var = "value",
fill = NA, fun.aggregate = list), cols = c("CP1", "CP2"))
# # A tibble: 6 x 4
# identifier label CP1 CP2
# <chr> <chr> <int> <int>
# 1 e1 Monaco 0 1
# 2 e1 became 0 0
# 3 e1 the 10 NA
# 4 e1 the 1 NA
# 5 e2 first 0 0
# 6 e2 the 1 1

You can use -
library(dplyr)
library(tidyr)
df %>%
pivot_wider(names_from=id,values_from=value, values_fn = list) %>%
unnest(cols = c(CP1, CP2))
# identifier label CP1 CP2
# <chr> <chr> <int> <int>
#1 e1 Monaco 0 1
#2 e1 became 0 0
#3 e2 the 1 1
#4 e2 first 0 0
#5 e1 the 10 NA
#6 e1 the 1 NA
You were close with your attempt as well, you had to include id in group_by -
df %>%
group_by(identifier,label, id) %>%
mutate(rn=row_number()) %>%
pivot_wider(names_from=id,values_from=value)

Related

Grouping into desired number of groups

I have a data frame like this:
ID is the primary key and Apples is the number of apples that person has.
ID
Apples
E1
10
E2
5
E3
NA
E4
5
E5
8
E6
12
E7
NA
E8
4
E9
NA
E10
8
I want to group NA and non-NA values into only 2 separate groups and get the count of each. I tried the normal group_by(), but it does not give me desired output.
Fruits %>% group_by(Apples) %>% summarize(n())
Apples n()
<dbl> <int>
4 1
5 2
8 2
10 1
12 1
NA 3
My desired output:
Apples n()
<dbl> <int>
non-NA 7
NA 3
We can create a group for NA and non-NA using group_by, and we can also make it a factor so that we can change the labels in the same step. Then, get the number of observations for each group.
library(dplyr)
# Count NA vs. non-NA values of Apples.
# Explicit levels pin the FALSE/TRUE -> label mapping, so this also works when
# the column contains no NA (or only NA); without them factor() sees a single
# level and errors on the two supplied labels.
df %>%
  group_by(
    grp = factor(
      is.na(Apples),
      levels = c(FALSE, TRUE),
      labels = c("non-NA", "NA")
    )
  ) %>%
  summarise(`n()` = n())
# grp `n()`
# <fct> <int>
#1 non-NA 7
#2 NA 3
Or in base R, we could use colSums:
data.frame(Apples = c("non-NA", "NA"), n = c(colSums(!is.na(df))[2], colSums(is.na(df))[2]), row.names = NULL)
Data
df <- structure(list(ID = c("E1", "E2", "E3", "E4", "E5", "E6", "E7",
"E8", "E9", "E10"), Apples = c(10L, 5L, NA, 5L, 8L, 12L, NA,
4L, NA, 8L)), class = "data.frame", row.names = c(NA, -10L))
In base R, this can be done with table on a logical vector
# `df` is the data frame defined under "Data"; TRUE counts the non-NA values.
table(!is.na(df$Apples))

How to reshape a complicated data frame in R?

I have a dataframe that is complicated and I'm trying to reshape it.
Here is an example of the type of data frame that I have:
names <- c("var1", 'var2', "split")
values <- rnorm(8)
from <- data.frame(a = rep(1, 10),
b = c(rep(1,3), rep(2, 7)),
c = c(names, names, rep("split", 4)),
d = c(rep("NA", 5), names, rep("split", 2)),
e = c(rep("NA", 7), names),
f = c(values[1:2], "NA", values[3:8], "NA"))
And this produces something that looks like this:
> from
a b c d e f
1 1 1 var1 NA NA -0.271930473373158
2 1 1 var2 NA NA -0.0968100775823158
3 1 1 split NA NA NA
4 1 2 var1 NA NA -1.73919094720254
5 1 2 var2 NA NA -0.52398152119997
6 1 2 split var1 NA 0.856367467674763
7 1 2 split var2 NA -0.729762707907525
8 1 2 split split var1 0.561460771889416
9 1 2 split split var2 0.0432022687633195
10 1 2 split split split NA
Inside my data frame from, I want to take var1 and var2 and turn them into columns. And then use the value from column f in from as the values that correspond to var1 and var2 (reading row-wise).
In other words, I am trying to reshape this data frame into something that looks like this:
> out
a b var1 var2
1 1 1 -0.2719305 -0.09681008
2 1 2 -1.7391909 -0.52398152
3 1 2 0.8563675 -0.72976271
4 1 2 0.5614608 0.04320227
Any suggestions as to how I could do this?
We could reshape to 'long' with pivot_longer, remove the NA elements and filter by keeping on the 'var' elements and then back to 'wide' with pivot_wider
library(dplyr)
library(tidyr)
library(stringr)
library(data.table)
from %>%
type.convert(as.is = TRUE) %>%
pivot_longer(cols = c:e, values_drop_na = TRUE) %>%
filter(str_detect(value, 'var')) %>%
select(-name) %>%
mutate(rn = rowid(a, b, value)) %>%
pivot_wider(names_from = value, values_from = f) %>%
select(-rn)
-output
# A tibble: 4 × 4
a b var1 var2
<int> <int> <dbl> <dbl>
1 1 1 -0.272 -0.0968
2 1 2 -1.74 -0.524
3 1 2 0.856 -0.730
4 1 2 0.561 0.0432
data
from <- structure(list(a = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
b = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), c = c("var1",
"var2", "split", "var1", "var2", "split", "split", "split",
"split", "split"), d = c("NA", "NA", "NA", "NA", "NA", "var1",
"var2", "split", "split", "split"), e = c("NA", "NA", "NA",
"NA", "NA", "NA", "NA", "var1", "var2", "split"), f = c("-0.271930473373158",
"-0.0968100775823158", "NA", "-1.73919094720254", "-0.52398152119997",
"0.856367467674763", "-0.729762707907525", "0.561460771889416",
"0.0432022687633195", "NA")), row.names = c("1", "2", "3",
"4", "5", "6", "7", "8", "9", "10"), class = "data.frame")
Here is a solution with one time pivoting:
library(dplyr)
library(tidyr)
library(stringr)
from %>%
type.convert(as.is = TRUE) %>%
filter(!is.na(f)) %>%
mutate(name = str_extract_all(paste(c,d,e), 'var(.)')) %>%
select(a, b, f, name) %>%
pivot_wider(
names_from = name,
values_from = f,
values_fn = list
) %>%
unnest(cols = c(var1, var2))
a b var1 var2
<int> <int> <dbl> <dbl>
1 1 1 -0.272 -0.0968
2 1 2 -1.74 -0.524
3 1 2 0.856 -0.730
4 1 2 0.561 0.0432
This can be achieved by coupling a series of logical operations to get the values in from$f
# Rows that carry a "var1" / "var2" marker in exactly one column; each mask is
# computed once and reused for every output column.
is_var1 <- rowSums(from == "var1", na.rm = TRUE) == 1
is_var2 <- rowSums(from == "var2", na.rm = TRUE) == 1
data.frame(
  a = from$a[is_var1],
  b = from$b[is_var1],
  var1 = from$f[is_var1],
  var2 = from$f[is_var2]
)
a b var1 var2
1 1 1 -0.2719305 -0.09681008
2 1 2 -1.7391909 -0.52398152
3 1 2 0.8563675 -0.72976271
4 1 2 0.5614608 0.04320227
The notion is to have a row_number mutation:
library(dplyr)
library(tidyr)
from %>%
  type.convert(as.is = TRUE) %>%
  filter(!is.na(f)) %>%
  # The first non-"split" entry across c:e is the variable name for the row.
  # do.call() stands in for purrr::invoke(), which is deprecated and is not
  # attached by the dplyr/tidyr library() calls used for this snippet.
  group_by(name = do.call(coalesce, across(c:e, ~ na_if(.x, "split")))) %>%
  # Occurrence counter within each variable name keeps duplicates on
  # separate rows after widening.
  mutate(id = row_number()) %>%
  ungroup() %>%
  pivot_wider(id_cols = c(a, b, id), values_from = f) %>%
  select(-id)
# A tibble: 4 x 4
a b var1 var2
<int> <int> <dbl> <dbl>
1 1 1 -0.272 -0.0968
2 1 2 -1.74 -0.524
3 1 2 0.856 -0.730
4 1 2 0.561 0.0432

Add a new column with sum of count to a dataframe according to informations from another in R

I would need help in order to add count column into a table called tab1 according to another tab2.
Here is the first tab :
tab1
Event_Groups Other_column
1 1_G1,2_G2 A
2 2_G1 B
3 4_G4 C
4 7_G5,8_G5,9_G5 D
As you can see, the Event_Groups column holds two pieces of information (the Event and Group numbers, separated by a "_"). This information is also found in tab2$Group and tab2$Event. The idea is, for each element within a row of tab1 (elements are separated by commas), to count the number of rows in tab2 where VALUE1 < 10 AND VALUE2 > 30, and then add this count to tab1 in a new column called Sum_count.
Here is the
tab2
Group Event VALUE1 VALUE2
1 G1 1 5 50 <- VALUE1 < 10 & VALUE2 > 30 : count 1
2 G1 2 6 20 <- VALUE2 < 30 : count 0
3 G2 2 50 50 <- VALUE1 > 10 : count 0
4 G3 3 0 0
5 G4 1 0 0
6 G4 4 2 40 <- VALUE1 < 10 & VALUE2 > 30 : count 1
7 G5 7 1 70 <- VALUE1 < 10 & VALUE2 > 30 : count 1
8 G5 8 4 67 <- VALUE1 < 10 & VALUE2 > 30 : count 1
9 G5 9 3 60 <- VALUE1 < 10 & VALUE2 > 30 : count 1
Example :
For instance for the first element of row1 in tab1: 1_G1
we see in tab2 (row1) that VALUE1 < 10 & VALUE2 > 30, so I count 1.
For the second element (row1): 2_G2, we see in tab2 (row3) that VALUE1 > 10, so I count 0.
And here is the expected result tab1 dataframe;
Event_Groups Other_column Sum_count
1_G1,2_G2 A 1
2_G1 B 0
4_G4 C 1
7_G5,8_G5,9_G5 D 3
I do not know if I am clear enough; do not hesitate to ask questions.
Here are the two tables in dput format, in case it helps:
tab1
structure(list(Event_Groups = structure(1:4, .Label = c("1_G1,2_G2",
"2_G1", "4_G4", "7_G5,8_G5,9_G5"), class = "factor"), Other_column =
structure(1:4, .Label = c("A", "B", "C", "D"), class = "factor")),
class = "data.frame", row.names = c(NA,
-4L))
tab2
structure(list(Group = structure(c(1L, 1L, 2L, 3L, 4L, 4L, 5L,
5L, 5L), .Label = c("G1", "G2", "G3", "G4", "G5"), class = "factor"),
Event = c(1L, 2L, 2L, 3L, 1L, 4L, 7L, 8L, 9L), VALUE1 = c(5L,
6L, 50L, 0L, 0L, 2L, 1L, 4L, 3L), VALUE2 = c(50, 20, 50,
0, 0, 40, 70, 67, 60)), class = "data.frame", row.names = c(NA,
-9L))
Here is one way to do it:
library(dplyr)
library(tidyr)
tab1 %>%
mutate(Event_Groups = as.character(Event_Groups)) %>%
separate_rows(Event_Groups, sep = ",") %>%
left_join(.,
tab2 %>%
unite(col = "Event_Groups", Event, Group) %>%
mutate(count = if_else(VALUE1 < 10 & VALUE2 > 30,1L, 0L))) %>%
group_by(Other_column) %>%
summarise(Event_Groups = paste(unique(Event_Groups), collapse = ","),
Sum_count = sum(count)) %>%
select(Event_Groups, everything())
#> Joining, by = "Event_Groups"
#> `summarise()` ungrouping output (override with `.groups` argument)
#> # A tibble: 4 x 3
#> Event_Groups Other_column Sum_count
#> <chr> <fct> <int>
#> 1 1_G1,2_G2 A 1
#> 2 2_G1 B 0
#> 3 4_G4 C 1
#> 4 7_G5,8_G5,9_G5 D 3
Created on 2021-07-29 by the reprex package (v0.3.0)
You can try a tidyverse approach:
library(tidyverse)
tab1 %>%
  # Keep the original row id so the split elements can be re-aggregated.
  rownames_to_column() %>%
  separate_rows(Event_Groups, sep = ",") %>%
  # convert = TRUE turns the Event part into an integer so it matches
  # tab2$Event in the join below.
  separate(
    Event_Groups,
    into = c("Event", "Group"),
    sep = "_",
    convert = TRUE
  ) %>%
  left_join(
    tab2 %>%
      mutate(count = as.numeric(VALUE1 < 10 & VALUE2 > 30)),
    by = c("Event", "Group")
  ) %>%
  unite(Event_Groups, Event, Group) %>%
  group_by(rowname) %>%
  summarise(
    Event_Groups = toString(Event_Groups),
    Other_column = unique(Other_column),
    count = sum(count)
  )
# A tibble: 4 x 4
rowname Event_Groups Other_column count
<chr> <chr> <chr> <dbl>
1 1 1_G1, 2_G2 A 1
2 2 2_G1 B 0
3 3 4_G4 C 1
4 4 7_G5, 8_G5, 9_G5 D 3

Coalescing multiple columns from both the left and right side

Given the following data
df1 <- structure(list(ID = 1:3, alpha_1 = c(2L, 2L, 3L),
alpha_2 = c(1L, 2L,
3L), alpha_3 = c(4L, 4L, 2L), alpha_4 = c(3L, NA, NA), beta_1 = c(NA,
2L, NA), beta_2 = c(3L, NA, 2L), charlie_1 = c(1L, NA, 1L), charlie_2 = c(NA,
2L, NA)), class = "data.frame", row.names = c(NA, -3L))
I'm trying to coalesce all columns sharing the same initial prefix name (i.e. coalesce alpha_1, alpha_2, alpha_3, alpha_4, and coalesce beta_1 beta_2, etc.), but from both the left and right sides. That is, I want to generate two new variables, say 'alpha_left' and 'alpha_right', whose columns would be, in this example, (2, 2, 3) and (3, 4, 2) respectively (first non-missing elements from the left and right side of the dataframe).
User #akrun offered a great solution for the coalescing part here, but I'm unsure how to create two new variables from both the left and right coalesces.
Here is an option in tidyverse
Reshape to 'long' format - pivot_longer
Grouped by 'ID'
Do the summarise across the columns 'alpha' till 'charlie'
Get the column name - cur_column()
Create a tibble with the first non-NA element from the left and the right
Change the column names by appending the 'nm1' as prefix
Finally, unnest the list columns created in summarise
library(dplyr)
library(tidyr)
library(stringr)
df1 %>%
pivot_longer(cols = contains("_"),
names_to = c( ".value", "grp"), names_sep = "_") %>%
group_by(ID) %>%
summarise(across(alpha:charlie, ~ {
nm1 <- cur_column()
tbl1 <- tibble(left= .[complete.cases(.)][1],
right = rev(.)[complete.cases(rev(.))][1]);
names(tbl1) <- str_c(nm1, "_", names(tbl1))
list(tbl1)})) %>%
unnest(c(alpha, beta, charlie))
-output
# A tibble: 3 x 7
ID alpha_left alpha_right beta_left beta_right charlie_left charlie_right
<int> <int> <int> <int> <int> <int> <int>
1 1 2 3 3 3 1 1
2 2 2 4 2 2 2 2
3 3 3 2 2 2 1 1
Or using base R
lst1 <- lapply(split.default(df1[-1], sub("_\\d+$", "", names(df1)[-1])),
function(x) {
x1 <- apply(x, 1, function(y) {
y1 <- na.omit(y)
if(length(y1) > 1 ) y1[c(1, length(y1))] else y1[1]
})
if(is.vector(x1)) as.data.frame(matrix(x1)) else as.data.frame(t(x1))
})
You could also do:
library(purrr) # imap_dfc(), set_names()
df1[-1] %>%
  split.default(sub("_\\d+", "", names(.))) %>%
  # coalesce() returns the first non-NA value in argument order, so the
  # columns in their original order yield the left-most value and the
  # reversed columns yield the right-most one.
  imap_dfc(~ data.frame(
    left = coalesce(!!!.x),
    right = coalesce(!!!rev(.x))
  ) %>%
    set_names(paste(.y, names(.), sep = "_")))
#>   alpha_left alpha_right beta_left beta_right charlie_left charlie_right
#> 1          2           3         3          3            1             1
#> 2          2           4         2          2            2             2
#> 3          3           2         2          2            1             1
One more approach not as elegant as #Onyambu's
library(tidyverse)
df1[-1] %>%
split.default(sub("_\\d+", "", names(.))) %>%
imap_dfc(~ .x %>% rowwise() %>%
mutate(!!paste0(.y, '_left') := head(na.omit(c_across(everything())),1),
!!paste0(.y, '_right') := tail(na.omit(c_across(!last_col())),1),
.keep = 'none' )
)
#> # A tibble: 3 x 6
#> # Rowwise:
#> alpha_left alpha_right beta_left beta_right charlie_left charlie_right
#> <int> <int> <int> <int> <int> <int>
#> 1 2 3 3 3 1 1
#> 2 2 4 2 2 2 2
#> 3 3 2 2 2 1 1
Created on 2021-06-19 by the reprex package (v2.0.0)
Another option
library(tidyverse)
df1 <- structure(list(ID = 1:3, alpha_1 = c(2L, 2L, 3L),
alpha_2 = c(1L, 2L,
3L), alpha_3 = c(4L, 4L, 2L), alpha_4 = c(3L, NA, NA), beta_1 = c(NA,
2L, NA), beta_2 = c(3L, NA, 2L), charlie_1 = c(1L, NA, 1L), charlie_2 = c(NA,
2L, NA)), class = "data.frame", row.names = c(NA, -3L))
df1 %>%
pivot_longer(cols = -ID, names_sep = "_", names_to = c(".value", "set")) %>%
group_by(ID) %>%
fill(alpha:charlie, .direction = "updown") %>%
filter(set %in% range(set)) %>%
mutate(set = c("left", "right")) %>%
pivot_wider(id_cols = ID, names_from = set, values_from = alpha:charlie)
#> # A tibble: 3 x 7
#> # Groups: ID [3]
#> ID alpha_left alpha_right beta_left beta_right charlie_left charlie_right
#> <int> <int> <int> <int> <int> <int> <int>
#> 1 1 2 3 3 3 1 1
#> 2 2 2 4 2 2 2 2
#> 3 3 3 2 2 2 1 1
Created on 2021-06-20 by the reprex package (v2.0.0)

Removing all columns summing to zero with dplyr

I'm currently working on a dataframe that looks something like this:
Site Spp1 Spp2 Spp3 LOC TYPE
S01 2 4 0 A FLOOD
S02 4 0 0 A REG
....
S10 0 1 0 B FLOOD
S11 1 0 0 B REG
What I'm trying to do is subset the dataframe so I can run some indicator species analysis in R.
The following code works in that I create two subsets of the data, merge them into one frame and then drop the unused factor levels
A.flood <- filter(data, TYPE == "FLOOD", LOC == "A")
B.flood <- filter(data, TYPE == "FLOOD", LOC == "B")
A.B.flood <- rbind(A.flood, B.flood) %>% droplevels.data.frame(A.B.flood, except = c("A", "B"))
What I was also hoping/need to do is to drop all Spp columns (in my real dataset there are ~ 60) that sum to zero. Is there a way to achieve this with dplyr, and if there is, is it possible to pipe that code onto the existing A.B.flood dataframe code?
Thanks!
EDIT
I managed to remove all the columns that summed to zero, by selecting only the columns that summed to > zero:
A.B.flood.subset <- A.B.flood[, apply(A.B.flood[1:(ncol(A.B.flood))], 2, sum)!=0]
I realize this question is now quite old, but I came across it and found another solution using dplyr's "select" and "which", which might seem clearer to dplyr enthusiasts:
# colSums() fails on a data frame that has non-numeric columns (here Site,
# LOC, TYPE), so each column is tested individually: every non-numeric column
# is kept, and a numeric column is kept only when its sum is non-zero.
keep_col <- vapply(
  A.B.flood,
  function(col) !is.numeric(col) || sum(col, na.rm = TRUE) != 0,
  logical(1),
  USE.NAMES = FALSE
)
A.B.flood.subset <- A.B.flood %>% select(which(keep_col))
Without using any package, we can use rowSums of the 'Spp' columns (subset the columns using grep) and double negate so that rows with sum>0 will be TRUE and others FALSE. Use this index to subset the rows.
data[!!rowSums(data[grep('Spp', names(data))]),]
Or using dplyr/magrittr, we select the 'Spp' columns, get the sum of each row with Reduce, double negate and use extract from magrittr to subset the original dataset with the index derived.
library(dplyr)
library(magrittr)
data %>%
select(matches('^Spp')) %>%
Reduce(`+`, .) %>%
`!` %>%
`!` %>%
extract(data,.,)
data
data <- structure(list(Site = c("S01", "S02", "S03", "S04"),
Spp1 = c(2L,
4L, 0L, 4L), Spp2 = c(4L, 0L, 0L, 0L), Spp3 = c(0L, 0L, 0L, 0L
), LOC = c("A", "A", "A", "A"), TYPE = c("FLOOD", "REG",
"FLOOD",
"REG")), .Names = c("Site", "Spp1", "Spp2", "Spp3", "LOC",
"TYPE"), class = "data.frame", row.names = c(NA, -4L))
You should convert to tidy data with tidyr::gather() and the data frame will be much easier to manipulate.
library(tidyr)
library(dplyr)
# Reshape the species columns to long format and keep only non-zero counts.
# R names are case-sensitive: the data frame is `A.B.flood`.
A.B.flood %>%
  gather(Species, Sp.Count, -Site, -LOC, -TYPE) %>%
  group_by(Species) %>%
  filter(Sp.Count > 0)
Voila, your tidy data minus the zero counts.
# Site LOC TYPE Species Sp.Count
# <fctr> <fctr> <fctr> <chr> <int>
#1 S01 A FLOOD Spp1 2
#2 S02 A REG Spp1 4
#3 S11 B REG Spp1 1
#4 S01 A FLOOD Spp2 4
#5 S10 B FLOOD Spp2 1
Personally I'd keep it like this. If you want your original format back with the zero counts for the non-discarded species, just add %>% spread(Species, Sp.Count, fill = 0) to the pipeline.
# Site LOC TYPE Spp1 Spp2
#* <fctr> <fctr> <fctr> <dbl> <dbl>
#1 S01 A FLOOD 2 4
#2 S02 A REG 4 0
#3 S10 B FLOOD 0 1
#4 S11 B REG 1 0
There is an even easier and quicker way to do this (and also more in line with your question: using dplyr).
A.B.flood.subset <- A.B.flood[, colSums(A.B.flood != 0) > 0]
or with a MWE:
df <- data.frame (x = rnorm(100), y = rnorm(100), z = rep(0, 100))
df[, colSums(df != 0) > 0]
For those who want to use dplyr 1.0.0 with the where keyword, you can do:
A.B.flood %>%
select(where( ~ is.numeric(.x) && sum(.x) != 0))
returns:
Spp1 Spp2
1 2 4
2 4 0
3 0 0
4 4 0
using the same data given by #akrun:
A.B.flood <- structure(
list(
Site = c("S01", "S02", "S03", "S04"),
Spp1 = c(2L,
4L, 0L, 4L),
Spp2 = c(4L, 0L, 0L, 0L),
Spp3 = c(0L, 0L, 0L, 0L),
LOC = c("A", "A", "A", "A"),
TYPE = c("FLOOD", "REG",
"FLOOD",
"REG")
),
.Names = c("Site", "Spp1", "Spp2", "Spp3", "LOC",
"TYPE"), class = "data.frame", row.names = c(NA, -4L))

Resources