Extract non-missing elements by rows and stack them - r

I have a data frame like this
# Example data: an id column plus four sparse character columns (V1..V4).
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned, whereas FALSE is a reserved word.
df <- data.frame(
  id = 1:4,
  V1 = c("A", NA, "C", NA),
  V2 = c(NA, NA, NA, "E"),
  V3 = c(NA, "B", NA, "F"),
  V4 = c(NA, NA, "D", NA),
  stringsAsFactors = FALSE
)
# id V1 V2 V3 V4
# 1 1 A <NA> <NA> <NA>
# 2 2 <NA> <NA> B <NA>
# 3 3 C <NA> <NA> D
# 4 4 <NA> E F <NA>
How can I extract non-missing elements by rows and stack them into a column? My expected output is:
# id value
# 1 1 A
# 2 2 B
# 3 3 C
# 4 3 D
# 5 4 E
# 6 4 F

Try pivot_longer() or unite() + separate_rows().
library(tidyr)
library(dplyr)
# Method 1: reshape to long form, dropping NA cells on the way, then discard
# the column that records which V* column each value came from.
# (TRUE is spelled out; `T` is a reassignable variable.)
df %>%
  pivot_longer(-id, values_drop_na = TRUE) %>%
  select(-name)
# Method 2: paste the non-NA values of each row into a single string, then
# split that string back into one row per value.
df %>%
  unite(value, -id, na.rm = TRUE) %>%
  separate_rows(value)
# # A tibble: 6 x 2
# id value
# <int> <chr>
# 1 1 A
# 2 2 B
# 3 3 C
# 4 3 D
# 5 4 E
# 6 4 F

You can use dplyr and tidyr:
# Reshape to long form (one row per id/column pair), then keep the non-NA
# values. NOTE: gather() is superseded in tidyr; pivot_longer() is the current
# equivalent. The output below is ordered column by column (V1, V2, ...).
df %>%
tidyr::gather(-id, key = "key", value = "value") %>%
dplyr::filter(!is.na(value))
id key value
1 1 V1 A
2 3 V1 C
3 4 V2 E
4 2 V3 B
5 4 V3 F
6 3 V4 D

One base R solution could be:
# stack(df[-1]) stacks the value columns into one long column ("values");
# data.frame() pairs it with the recycled id column, and na.omit() drops the
# rows whose value is NA (the original row numbers are kept, as shown below).
na.omit(data.frame(df[1], stack(df[-1])[1]))
id values
1 1 A
3 3 C
8 4 E
10 2 B
12 4 F
15 3 D

How about combining complete.cases with melt() from the reshape2 package?
library(reshape2)
# Melt to long form (columns: id, variable, value), then keep the rows with
# no NA and return only the id and value columns.
df.temp <- melt(df, id.vars = "id")
df.temp[complete.cases(df.temp), c("id", "value")]
results in
id value
1 1 A
3 3 C
8 4 E
10 2 B
12 4 F
15 3 D

pivot_longer then filter
library(tidyverse)
# Example data: an id column plus four sparse character columns.
df <- data.frame(
  id = 1:4,
  V1 = c("A", NA, "C", NA),
  V2 = c(NA, NA, NA, "E"),
  V3 = c(NA, "B", NA, "F"),
  V4 = c(NA, NA, "D", NA),
  stringsAsFactors = FALSE
)

# Go long first, then remove the NA values and the column-name column.
df %>%
  pivot_longer(cols = -id, names_to = "name", values_to = "value") %>%
  filter(!is.na(value)) %>%
  select(-name)
#> # A tibble: 6 x 2
#> id value
#> <int> <chr>
#> 1 1 A
#> 2 2 B
#> 3 3 C
#> 4 3 D
#> 5 4 E
#> 6 4 F
Created on 2020-03-02 by the reprex package (v0.3.0)

Related

Filter by group and conditions

I have this type of data, where `Sequ` is a grouping variable:
# Example data: four Sequ groups of sizes 3, 3, 2 and 2.
df <- data.frame(
  Sequ = rep(c(1, 2, 3, 4), times = c(3, 3, 2, 2)),
  Answerer = c("A", NA, NA, "A", NA, NA, "B", NA, "C", NA),
  PP_by = rep(c("A", "B"), each = 5),
  pp = c(0.1, 0.2, 0.3, 1, rep(NA, 6))
)
I need to remove any Sequ where
(i) Answerer == PP_by AND
(ii) there is any NA in pp
I've tried this, but it obviously implements just the second condition (ii):
library(dplyr)
# Keeps a Sequ group only when none of its pp values is NA; Answerer and
# PP_by are not consulted at all.
df %>%
group_by(Sequ) %>%
filter(
all(!is.na(pp))
)
The expected result is:
Sequ Answerer PP_by pp
1 1 A A 0.1
2 1 <NA> A 0.2
3 1 <NA> A 0.3
9 4 C B NA
10 4 <NA> B NA
EDIT:
I've come up with this solution:
# Keep a group when its first Answerer differs from its first PP_by, or when
# it has no NA in pp. Only the first row of each group is compared.
# NOTE(review): if first(Answerer) were NA the comparison would yield NA; in
# the sample data every group starts with a non-NA Answerer.
df %>%
group_by(Sequ) %>%
filter(
first(Answerer) != first(PP_by)
|
all(!is.na(pp))
)
Here's another way:
# A group is dropped only when both conditions hold, so (by De Morgan) it is
# kept when no row has Answerer equal to PP_by, or when pp has no NA at all.
df %>%
  group_by(Sequ) %>%
  filter(
    !any(Answerer == PP_by, na.rm = TRUE) |
      !anyNA(pp)
  )
# # A tibble: 5 × 4
# # Groups: Sequ [2]
# Sequ Answerer PP_by pp
# <dbl> <chr> <chr> <dbl>
# 1 1 A A 0.1
# 2 1 NA A 0.2
# 3 1 NA A 0.3
# 4 4 C B NA
# 5 4 NA B NA

Using complete to fill groups with NA to have same length as the maximum group

I have this dataframe:
# Example data: ids 1, 2 and 3 have three, two and one var values.
df <- data.frame(
  id = c(1L, 1L, 1L, 2L, 2L, 3L),
  var = c("A", "B", "C", "B", "C", "C"),
  stringsAsFactors = FALSE
)
id var
1 1 A
2 1 B
3 1 C
4 2 B
5 2 C
6 3 C
I would like to get this dataframe:
id var
1 1 A
2 1 B
3 1 C
4 2 <NA>
5 2 B
6 2 C
7 3 <NA>
8 3 <NA>
9 3 C
I would like to learn how to use complete or expand.grid in this situation
I have tried several ways but was not successful: One of my tries:
# Attempt: cross every id with every var.
# NOTE(review): this expands on var itself, so the added rows presumably carry
# real var values (e.g. id 2 / "A") rather than NA, and fill = list(NA) is an
# unnamed list, so it does not name any column to fill.
df %>%
complete(id, var, fill=list(NA))
Create a duplicate of the 'var' column and call complete() on the duplicate instead of on 'var' itself; this leaves NA in 'var' for the newly added rows. Then remove the duplicate column.
library(dplyr)
library(tidyr)
# complete() needs a column to expand on; a throw-away copy of var is used so
# that var itself stays NA in the rows that get added.
df %>%
  mutate(var_key = var) %>%
  complete(id, var_key) %>%
  select(-var_key)
-output
# A tibble: 9 × 2
id var
<int> <chr>
1 1 A
2 1 B
3 1 C
4 2 <NA>
5 2 B
6 2 C
7 3 <NA>
8 3 <NA>
9 3 C

Wide to long, combining columns in pairs but keeping ID column - R

I have a dataframe of the following type
ID case1 case2 case3 case4
1 A B C D
2 B A
3 E F
4 G C A
5 T
I need to change its format, to a long shape, similar as the below:
ID col1 col2
1 A B
1 A C
1 A D
1 B C
1 B D
1 C D
2 B A
3 E F
4 G C
4 G A
4 C A
5 T
As you can see, I need to maintain the ID and ignore empty columns. There are some cases like T that need to remain in the dataset, but without a col2.
I am honestly not sure how to approach this, so that is why there are no examples of what I have tried.
You can get the data in long format and create all combination of values for each ID if the number of rows is greater than 1 in that ID.
library(dplyr)
library(tidyr)
# Go long (dropping empty cells), then for each ID build a small data frame:
# every unordered pair of its values when there are at least two of them,
# otherwise a single row with the lone value and an NA partner.
df %>%
  pivot_longer(cols = -ID, values_drop_na = TRUE) %>%
  group_by(ID) %>%
  summarise(
    value = if (n() >= 2) {
      list(setNames(as.data.frame(t(combn(value, 2))), c("col1", "col2")))
    } else {
      list(data.frame(col1 = value[1], col2 = NA_character_))
    }
  ) %>%
  unnest(value)
# A tibble: 12 x 3
# ID col1 col2
# <int> <chr> <chr>
# 1 1 A B
# 2 1 A C
# 3 1 A D
# 4 1 B C
# 5 1 B D
# 6 1 C D
# 7 2 B A
# 8 3 E F
# 9 4 G C
#10 4 G A
#11 4 C A
#12 5 T NA
data
# Example data: one row per ID with up to four case values (NA when absent).
df <- data.frame(
  ID = 1:5,
  case1 = c("A", "B", "E", "G", "T"),
  case2 = c("B", "A", "F", "C", NA),
  case3 = c("C", NA, NA, "A", NA),
  case4 = c("D", NA, NA, NA, NA),
  stringsAsFactors = FALSE
)

Remove NA in front of one specific string but leave in front of another specific string, by group

I have this data frame:
# Example data: four ids with four status rows each; every id starts with one
# or two NA values.
df <- data.frame(
  id = rep(seq_len(4), each = 4),
  status = c(
    NA, "a", "c", "a", # id 1
    NA, "b", "c", "c", # id 2
    NA, NA, "a", "c",  # id 3
    NA, NA, "b", "b"   # id 4
  ),
  stringsAsFactors = FALSE
)
For each group (id), I aim to remove the rows with one or multiple leading NA in front of an "a" (in the column "status") but not in front of a "b".
The final data frame should look like this:
# Expected result in dput() form: 13 of the 16 rows remain. The leading NA
# rows of ids 1 and 3 (whose first non-NA status is "a") are gone, while those
# of ids 2 and 4 are kept. `.Names` is the older spelling of `names`.
structure(list(
id = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 4L, 4L, 4L, 4L),
status = c("a", "c", "a", NA, "b", "c", "c", "a", "c", NA, NA, "b", "b")),
.Names = c("id", "status"), row.names = c(NA, -13L), class = "data.frame")
How do I do that?
Edit: alternatively, how would I do it to preserve other variables in the data frame such as the variable otherVar in the following example:
# Same id/status layout as before, plus an extra column (otherVar) that has
# to survive the row removal.
df2 <- data.frame(
  id = rep(seq_len(4), each = 4),
  status = c(
    NA, "a", "c", "a", # id 1
    NA, "b", "c", "c", # id 2
    NA, NA, "a", "c",  # id 3
    NA, NA, "b", "b"   # id 4
  ),
  otherVar = head(letters, 16),
  stringsAsFactors = FALSE
)
We can group by 'id', summarise 'status' by pasting its elements together, use gsub to remove the NAs in front of the 'a', and then convert back to 'long' format with separate_rows
library(dplyr)
library(tidyr)
# Collapse each id's statuses into one ", "-separated string, strip the run of
# leading "NA, " entries when the first real value is exactly "a", then split
# back into one row per status.
# The pattern is anchored with ^ so only NAs at the start of a group are
# removed, and the lookahead requires the whole next entry to be "a" (not
# merely a value that begins with "a").
df %>%
  group_by(id) %>%
  summarise(
    status = gsub("^(NA, )+(?=a(, |$))", "", toString(status), perl = TRUE)
  ) %>%
  separate_rows(status, convert = TRUE)
# A tibble: 13 x 2
# id status
# <int> <chr>
# 1 1 a
# 2 1 c
# 3 1 a
# 4 2 NA
# 5 2 b
# 6 2 c
# 7 2 c
# 8 3 a
# 9 3 c
#10 4 NA
#11 4 NA
#12 4 b
#13 4 b
Or using data.table with the same methodology
library(data.table)
# Same approach per id: collapse the statuses to one string, strip the leading
# "NA, " run when the first real value is exactly "a", and split again.
# The pattern is anchored with ^ so that only NAs at the start of a group are
# removed. The NA entries that remain are the literal string "NA", as the
# printed output below shows.
out1 <- setDT(df)[, strsplit(gsub("^(NA, )+(?=a(, |$))", "",
  toString(status), perl = TRUE), ", "), id]
setnames(out1, 'V1', "status")[]
# id status
# 1: 1 a
# 2: 1 c
# 3: 1 a
# 4: 2 NA
# 5: 2 b
# 6: 2 c
# 7: 2 c
# 8: 3 a
# 9: 3 c
#10: 4 NA
#11: 4 NA
#12: 4 b
#13: 4 b
Update
For the updated dataset 'df2'
# Per id, (status %in% "a") + rleid(is.na(status)) jumps by more than 1 only
# where an NA is immediately followed by "a"; which() gives that position and
# seq() expands it to 1..position, i.e. the leading NA rows. .I turns these
# within-group positions into row numbers of df2.
# NOTE(review): this assumes at most one NA -> "a" transition per id.
i1 <- setDT(df2)[, .I[seq(which(c(diff((status %in% "a") +
  rleid(is.na(status))) > 1), FALSE))], id]$V1
# Drop those rows with a logical mask rather than df2[-i1]: with an empty i1,
# negative indexing would select no rows at all instead of keeping every row.
df2[!(seq_len(nrow(df2)) %in% i1)]
# id status otherVar
# 1: 1 a b
# 2: 1 c c
# 3: 1 a d
# 4: 2 NA e
# 5: 2 b f
# 6: 2 c g
# 7: 2 c h
# 8: 3 a k
# 9: 3 c l
#10: 4 NA m
#11: 4 NA n
#12: 4 b o
#13: 4 b p
Using na.locf from zoo together with is.na. Note that this assumes your data is ordered.
# For every row, find the next non-NA status within the same id (back-fill).
# na.rm = FALSE keeps the filled vector the same length as its input even when
# a group ends in NA, and ave() applies the fill per id so a value from the
# next id cannot leak backwards into the previous one.
next_status <- ave(
  df$status, df$id,
  FUN = function(s) zoo::na.locf(s, fromLast = TRUE, na.rm = FALSE)
)
# Drop the NA rows whose next non-NA status is "a". %in% never returns NA, so
# NA rows with nothing after them are kept instead of becoming all-NA rows.
df[!(is.na(df$status) & next_status %in% "a"), ]
id status
2 1 a
3 1 c
4 1 a
5 2 <NA>
6 2 b
7 2 c
8 2 c
11 3 a
12 3 c
13 4 <NA>
14 4 <NA>
15 4 b
16 4 b
Here's a dplyr solution and a not as pretty base translation :
dplyr
library(dplyr)
# Per id: keep every row when the first non-NA status is not "a"; otherwise
# keep only the non-NA rows.
# %in% is used instead of != so that a group consisting only of NAs (whose
# "first non-NA" is NA) is kept as-is rather than silently dropped by filter().
df %>%
  group_by(id) %>%
  filter(!(status[!is.na(status)][1] %in% "a") | !is.na(status))
# # A tibble: 13 x 2
# # Groups: id [4]
# id status
# <int> <chr>
# 1 1 a
# 2 1 c
# 3 1 a
# 4 2 <NA>
# 5 2 b
# 6 2 c
# 7 2 c
# 8 3 a
# 9 3 c
# 10 4 <NA>
# 11 4 <NA>
# 12 4 b
# 13 4 b
base
# Split by id, filter each piece, and bind the pieces back together.
# Per id: keep every row when the first non-NA status is not "a"; otherwise
# keep only the non-NA rows.
# %in% is used instead of != so that a group consisting only of NAs yields
# TRUE (keep) rather than an NA index, which would produce all-NA rows.
do.call(
  rbind,
  lapply(split(df, df$id), function(x) {
    first_seen <- x$status[!is.na(x$status)][1]
    x[!(first_seen %in% "a") | !is.na(x$status), ]
  })
)
# id status
# 1.2 1 a
# 1.3 1 c
# 1.4 1 a
# 2.5 2 <NA>
# 2.6 2 b
# 2.7 2 c
# 2.8 2 c
# 3.11 3 a
# 3.12 3 c
# 4.13 4 <NA>
# 4.14 4 <NA>
# 4.15 4 b
# 4.16 4 b
note
This will fail if not all NAs are leading, because it removes every NA from groups whose first non-NA value is "a".

Getting top_n when multiple groups and keep subgroups

I have a df with turnover for each subgroup from main. Now I want to get the two main groups with highest turnover.
# Five main groups (A-E) with five sub-groups each and a random turnover.
# No set.seed() call precedes rnorm(), so the values differ on every run.
df <- data.frame(
grp = gl(5, 5, labels = c("A", "B", "C", "D", "E")),
sub_grp = gl(5, 1),
turnover = rnorm(25, mean = 100, sd = 15))
> df
grp sub_grp turnover
1 A 1 98.14430
2 A 2 107.90811
3 A 3 103.93973
4 A 4 95.78222
5 A 5 63.19635
6 B 1 97.85688
7 B 2 92.65572
8 B 3 86.02872
9 B 4 101.88177
10 B 5 120.66959
11 C 1 125.93533
12 C 2 98.49771
13 C 3 77.28770
14 C 4 101.44822
15 C 5 107.08171
16 D 1 77.73252
17 D 2 107.49374
18 D 3 87.46436
19 D 4 101.49984
20 D 5 99.13047
21 E 1 91.48636
22 E 2 115.63716
23 E 3 99.34567
24 E 4 104.65408
25 E 5 121.41820
I know how to get two main groups with highest turnover, but not how to keep my subgroups and turnover still split on subgroup.
# Total turnover per main group, then the two groups with the largest total.
# top_n() is called without wt; the output below shows it ranked by
# total.turnover. The sub_grp rows are lost in the summarise() step.
df %>%
group_by(grp) %>%
summarise(total.turnover = sum(turnover)) %>%
top_n(n = 2)
grp total.turnover
(fctr) (dbl)
1 C 510.2507
2 E 532.5415
Result I want from this example.
grp sub_grp turnover
1 C 1 125.93533
2 C 2 98.49771
3 C 3 77.28770
4 C 4 101.44822
5 C 5 107.08171
6 E 1 91.48636
7 E 2 115.63716
8 E 3 99.34567
9 E 4 104.65408
10 E 5 121.41820
Here are a couple of different approaches with dplyr.
Rejoining the original object
# Find the two groups with the highest total turnover, then join back to the
# original rows to recover sub_grp and the per-subgroup turnover.
# wt is given explicitly so the ranking does not depend on total.turnover
# happening to be the last column.
df %>%
  group_by(grp) %>%
  summarise(total.turnover = sum(turnover)) %>%
  top_n(n = 2, wt = total.turnover) %>%
  inner_join(df, by = "grp") %>%
  select(grp, sub_grp, turnover)
# # A tibble: 10 × 3
# grp sub_grp turnover
# <fctr> <fctr> <dbl>
# 1 A 1 91.59287
# 2 A 2 96.54734
# 3 A 3 123.38062
# 4 A 4 101.05763
# 5 A 5 101.93932
# 6 C 1 118.36123
# 7 C 2 105.39721
# 8 C 3 106.01157
# 9 C 4 101.66024
# 10 C 5 91.66238
Using a windowing function (dense_rank)
# Attach each group's total to all of its rows, rank the totals densely from
# largest to smallest, and keep the rows of the two best-ranked groups.
df %>%
  group_by(grp) %>%
  mutate(total.turnover = sum(turnover)) %>%
  ungroup() %>%
  filter(dense_rank(-total.turnover) <= 2) %>%
  select(grp, sub_grp, turnover)
# # A tibble: 10 × 3
# grp sub_grp turnover
# <fctr> <fctr> <dbl>
# 1 A 1 91.59287
# 2 A 2 96.54734
# 3 A 3 123.38062
# 4 A 4 101.05763
# 5 A 5 101.93932
# 6 C 1 118.36123
# 7 C 2 105.39721
# 8 C 3 106.01157
# 9 C 4 101.66024
# 10 C 5 91.66238
Using data.table (similar to the dplyr windowing function approach)
library(data.table)
# Work on a data.table built from df.
dt <- data.table(df)
# Add each group's total turnover to all of its rows.
dt[, total.turnover := sum(turnover), by = grp]
# Dense rank of the totals, largest first.
dt[, rank := frank(-total.turnover, ties.method = "dense")]
# Rows of the two best-ranked groups, original columns only.
dt[rank <= 2, .(grp, sub_grp, turnover)]
# grp sub_grp turnover
# 1: A 1 91.59287
# 2: A 2 96.54734
# 3: A 3 123.38062
# 4: A 4 101.05763
# 5: A 5 101.93932
# 6: C 1 118.36123
# 7: C 2 105.39721
# 8: C 3 106.01157
# 9: C 4 101.66024
# 10: C 5 91.66238
library(dplyr)
# Reproducible example data: five main groups (A-E), five sub-groups each.
set.seed(123)
df <- data.frame(
  grp = gl(5, 5, labels = LETTERS[1:5]),
  sub_grp = gl(5, 1),
  turnover = rnorm(25, mean = 100, sd = 15)
)
One option is dplyr where we use filter on the summarised output object
# Keep the rows of df whose grp appears in df1. df1 is not created in this
# snippet; per the text it is the summarised top-2 result.
df %>%
filter(grp %in% df1$grp)
where 'df1' is the summarised output object
Or if we wanted in the same chain
# Total turnover per group, keep the two largest totals, then use them to
# filter the original rows (semi_join() keeps the rows of df that have a match
# and adds no columns).
# wt and by are given explicitly so the result does not rely on val being the
# last column or on the join key being guessed from the shared column names.
df %>%
  group_by(grp) %>%
  summarise(val = sum(turnover)) %>%
  top_n(2, wt = val) %>%
  semi_join(df, ., by = "grp")
# grp sub_grp turnover
#1 C 1 125.93533
#2 C 2 98.49771
#3 C 3 77.28770
#4 C 4 101.44822
#5 C 5 107.08171
#6 E 1 91.48636
#7 E 2 115.63716
#8 E 3 99.34567
#9 E 4 104.65408
#10 E 5 121.41820
Or another one-line option is data.table
library(data.table)
# Inner query: total turnover per grp (column V1), sorted in descending order,
# first two grp values. Outer query: keep the rows of df whose grp is among
# them. NOTE(review): setDT() presumably converts df to a data.table in place.
setDT(df)[grp %in% df[, sum(turnover), grp][order(-V1), head(grp, 2)]]
# grp sub_grp turnover
# 1: C 1 125.93533
# 2: C 2 98.49771
# 3: C 3 77.28770
# 4: C 4 101.44822
# 5: C 5 107.08171
# 6: E 1 91.48636
# 7: E 2 115.63716
# 8: E 3 99.34567
# 9: E 4 104.65408
#10: E 5 121.41820
Or we can do this easily with base R
# Sum turnover per grp, sort ascending, take the names of the last (largest)
# two groups and keep only the rows of df that belong to them.
subset(
  df,
  grp %in% names(tail(sort(xtabs(turnover ~ grp, data = df)), n = 2))
)
# grp sub_grp turnover
#11 C 1 125.93533
#12 C 2 98.49771
#13 C 3 77.28770
#14 C 4 101.44822
#15 C 5 107.08171
#21 E 1 91.48636
#22 E 2 115.63716
#23 E 3 99.34567
#24 E 4 104.65408
#25 E 5 121.41820
data
# dput() of the example data used in the question: grp is a character vector
# and sub_grp an integer vector here (not factors). `.Names` is the older
# spelling of `names`.
# NOTE(review): the `index` attribute looks like a leftover from data.table;
# presumably harmless for the snippets above, but confirm.
df <- structure(list(grp = c("A", "A", "A", "A", "A", "B", "B", "B",
"B", "B", "C", "C", "C", "C", "C", "D", "D", "D", "D", "D", "E",
"E", "E", "E", "E"), sub_grp = c(1L, 2L, 3L, 4L, 5L, 1L, 2L,
3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L,
4L, 5L), turnover = c(98.1443, 107.90811, 103.93973, 95.78222,
63.19635, 97.85688, 92.65572, 86.02872, 101.88177, 120.66959,
125.93533, 98.49771, 77.2877, 101.44822, 107.08171, 77.73252,
107.49374, 87.46436, 101.49984, 99.13047, 91.48636, 115.63716,
99.34567, 104.65408, 121.4182)), .Names = c("grp", "sub_grp",
"turnover"),
class = "data.frame", row.names = c(NA, -25L),
index = structure(integer(0), "`__grp`" = integer(0)))

Resources