so I have a dataframe that looks like this:
# Example input in wide format: besides `id`, every column is named
# `c.<index>.<field>`, with index 0 or 1 and field opt, optI or sel.
datInput <- tibble(id = 1:2,
c.0.opt = c("a,b", "c,d"),
c.0.optI = c("1,2", "3,4"),
c.0.sel = c("a", "c"),
c.1.opt = c("e,f", "g,h"),
c.1.optI = c("5,6", "7,8"),
c.1.sel = c("e", "g"))
# Print the tibble to show its layout.
datInput
# id c.0.opt c.0.optI c.0.sel c.1.opt c.1.optI c.1.sel
#1 1 a,b 1,2 a e,f 5,6 e
#2 2 c,d 3,4 c g,h 7,8 g
And I need it to look like this:
# Desired long format: two rows per id (one per index), with the index removed
# from the column names and the remaining prefix and field joined by "_".
datOutput <- tibble(id = c(1,1,2,2),
c_opt = c("a,b", "e,f", "c,d", "g,h"),
c_optI = c("1,2", "5,6", "3,4", "7,8"),
c_sel = c("a", "e", "c", "g"))
# id c_opt c_optI c_sel
#1 1 a,b 1,2 a
#2 1 e,f 5,6 e
#3 2 c,d 3,4 c
#4 2 g,h 7,8 g
I usually use tidyr::pivot_longer for this kind of task, but I don't know how to do it with these complicated column names, where the row identifier is in the middle. Is there a way to do this?
Thanks
We can also use pivot_longer, with names_sep set to a regex lookbehind that matches the . in the column names that follows a digit
library(dplyr)
library(tidyr)
library(stringr)
# Split every column name at the dot that follows the digit: the left part
# ("c.0", "c.1") becomes a throw-away grouping column and the right part
# (".value") names the output columns.
datInput %>%
  pivot_longer(
    cols = -id,
    names_to = c("grp", ".value"),
    names_sep = "(?<=\\d)\\."
  ) %>%
  # The grouping column was only needed to separate the rows.
  select(-grp) %>%
  # Put the "c_" prefix back on every value column.
  rename_with(~ str_c("c_", .x), .cols = -id)
# A tibble: 4 x 4
# id c_opt c_optI c_sel
# <int> <chr> <chr> <chr>
#1 1 a,b 1,2 a
#2 1 e,f 5,6 e
#3 2 c,d 3,4 c
#4 2 g,h 7,8 g
# Reshape to long form, strip the numeric index from each column name, then
# widen again so that columns sharing a name are stacked into rows.
datInput %>%
  # pivot_longer() supersedes gather(); every non-id column becomes a
  # (colname, val) pair.
  pivot_longer(cols = -id, names_to = "colname", values_to = "val") %>%
  # "c.0.opt" -> "c_opt"; `\\d+` also covers indices with several digits.
  mutate(colname = gsub("\\.\\d+\\.", "_", colname)) %>%
  # The names now collide, so collect the values of each id/name pair in a list.
  pivot_wider(
    id_cols = id,
    names_from = colname,
    values_from = val,
    values_fn = list
  ) %>%
  # Expand the list-columns in parallel: one row per original index.
  unnest(cols = -id)
# A tibble: 4 x 4
id c_opt c_optI c_sel
<int> <chr> <chr> <chr>
1 1 a,b 1,2 a
2 1 e,f 5,6 e
3 2 c,d 3,4 c
4 2 g,h 7,8 g
I modified Akrun's answer with comments from zimia like this:
# Long -> rename -> wide -> unnest: turns the c.<index>.<field> columns into
# c_<field> columns with one row per id and index.
datOutput <- datInput %>%
  pivot_longer(-id, names_to = "colname", values_to = "val") %>%
  # Drop the ".<index>." part of the name; `\\d+` also handles indices >= 10.
  mutate(colname = gsub("\\.\\d+\\.", "_", colname)) %>%
  # Several values now share one id/name cell, so keep them as list-columns.
  pivot_wider(
    id_cols = id,
    names_from = colname,
    values_from = val,
    values_fn = list
  ) %>%
  # Only the list-columns need expanding; `id` is already atomic.
  unnest(cols = -id)
It works perfectly. Thank you both.
Related
I have two data frames like this
```
v1. v2
1 a,b,c 1,2,3
2 d,e,f,g 4,6
3 h,k,v,x 9,0
```
```
v1 v2
1 a AA
2 c CC
3 d DD
```
after combine
```
v1 v2 v3
1 a,b,c 1,2,3 AA,CC
2. d,e,f,g 4,6 DD
3 h,k,v,x 9,0
```
I don't know how to do this; any comments would be appreciated.
library(tidyverse)
library(fuzzyjoin)
# Attach to every df1 row the df2 rows whose `v1` occurs inside df1's
# comma-separated `v1`, then collapse the matched `v2` codes into one string.
df1 %>%
  # df2$v1 is used as a regular expression, so "a" matches "a,b,c".
  # NOTE(review): this is substring matching; it relies on the keys being
  # tokens that do not occur inside other tokens.
  regex_left_join(df2, by = c(v1 = "v1")) %>%
  group_by(v1 = v1.x, v2 = v2.x) %>%
  # na.omit() keeps unmatched rows from becoming the literal string "NA";
  # they get "" instead. `.groups = "drop"` returns an ungrouped tibble.
  summarise(v3 = paste0(na.omit(v2.y), collapse = ","), .groups = "drop")
#   v1      v2    v3
#   <chr>   <chr> <chr>
# 1 a,b,c   1,2,3 "AA,CC"
# 2 d,e,f,g 4,6   "DD"
# 3 h,k,v,x 9,0   ""
Sample data used
# Sample data: `v1` and `v2` hold comma-separated values stored as strings.
df1 <- data.frame(
  v1 = c("a,b,c", "d,e,f,g", "h,k,v,x"),
  v2 = c("1,2,3", "4,6", "9,0")
)
# Lookup table from a single `v1` token to its `v2` code.
df2 <- data.frame(
  v1 = c("a", "c", "d"),
  v2 = c("AA", "CC", "DD")
)
Here is a very long tidyverse pipe. There should be simpler solutions.
library(dplyr)
library(tidyr)
# For every df1 row, look up each comma-separated token of `v1` in df2 and
# collect the matching df2 `v2` codes into a new comma-separated column `v3`.
df1 %>%
  # Remember the original row and keep `v1`/`v2` untouched; a copy of `v1` is
  # split into one row per token, so any number of tokens is handled.
  mutate(id = row_number(), token = as.character(v1)) %>%
  separate_rows(token, sep = ",") %>%
  # Tokens without a df2 match get NA in `v2.y`.
  left_join(df2, by = c(token = "v1"), suffix = c("", ".y")) %>%
  group_by(id, v1, v2) %>%
  # Drop the NAs so rows without any match end up with "" rather than "NA".
  summarise(v3 = paste(na.omit(v2.y), collapse = ","), .groups = "drop") %>%
  select(-id)
## A tibble: 3 x 3
# v1 v2 v3
# <chr> <chr> <chr>
#1 a,b,c 1,2,3 "AA,CC"
#2 d,e,f,g 4,6 "DD"
#3 h,k,v,x 9,0 ""
Data:
ID
B
C
1
NA
x
2
x
NA
3
x
x
Results:
ID
Unified
1
C
2
B
3
B_C
I'm trying to combine columns B and C, using mutate and unite, but how would I scale up this function so that I can reuse this for multiple columns (think 100+), instead of having to write out the variables each time? Or is there a function that's already built in to do this?
My current solution is this:
library(tidyverse)
# Replace each 'x' marker with the name of its own column, then paste the
# non-missing entries of B and C into a single column called Unified.
Data %>%
mutate(B = replace(B, B == 'x', 'B'), C = replace(C, C == 'x', 'C')) %>%
# na.rm = TRUE leaves NA cells out of the pasted string; remove = TRUE drops
# the input columns B and C from the result.
unite("Unified", B:C, na.rm = TRUE, remove= TRUE)
We may use across to loop over the columns and replace the values equal to 'x' with the column name (cur_column())
library(dplyr)
library(tidyr)
# Inside across(), cur_column() is the name of the column being processed, so
# every "x" is replaced by its own column name before the columns are united.
Data %>%
  mutate(across(B:C, ~ replace(.x, .x == "x", cur_column()))) %>%
  unite(col = "Unified", B:C, na.rm = TRUE, remove = TRUE)
-output
ID Unified
1 1 C
2 2 B
3 3 B_C
data
# Example data: an ID column plus marker columns B and C, where "x" marks a
# set flag and NA marks an unset one.
Data <- structure(list(ID = 1:3, B = c(NA, "x", "x"), C = c("x", NA,
"x")), class = "data.frame", row.names = c(NA, -3L))
Here are a couple of options.
Using dplyr -
library(dplyr)
# Marker columns: everything except the first (ID) column.
cols <- names(Data)[-1]
# Row by row, keep the names of the marker columns that are not NA and join
# them with "_". Selecting with all_of(cols) keeps the column selection in
# step with `cols`, so the same code works for any number of marker columns.
Data <- Data %>%
  rowwise() %>%
  mutate(
    Unified = paste0(cols[!is.na(c_across(all_of(cols)))], collapse = "_")
  ) %>%
  ungroup()
Data
# ID B C Unified
# <int> <chr> <chr> <chr>
#1 1 NA x C
#2 2 x NA B
#3 3 x x B_C
Base R
# Build the presence mask once, then for each row paste together the names of
# the marker columns that are present.
Data$Unified <- apply(
  !is.na(Data[cols]),
  1,
  function(present) paste0(cols[present], collapse = "_")
)
I'd like to add more rows to my dataset based on a grouping variable. Right now, my data has 2 rows but I would like 3 rows and the var app to be repeated for the third row.
This is what my data currently looks like:
# Wide example: each app has a first (type, code) pair and an optional second
# pair in the `_2` columns.
my_data <- data.frame(
  app = c("a", "b"),
  type = c("blue", "red"),
  code = 1:2,
  type_2 = c(NA, "blue"),
  code_2 = c(NA, 3)
)
app type code type_2 code_2
a blue 1 NA NA
b red 2 blue 3
I would like the data to look like this:
app type code
a blue 1
b red 2
b blue 3
library(data.table)
# Turn my_data into a data.table in place so that melt() can take several
# groups of measure columns at once.
setDT(my_data)
res <-
melt(
my_data,
id.vars = "app",
# One group for the columns whose names start with "type", one for "code".
measure.vars = patterns(c("^type", "^code")),
# Each group is stacked into its own value column.
value.name = c("type", "code")
# Keep only rows that have a type, and only the three output columns.
)[!is.na(type), .(app, type, code)]
Using tidyverse
library(dplyr)
library(stringr)
library(tidyr)
# Give the first (type, code) pair an explicit "_1" suffix so that every
# measure column follows the <name>_<group> pattern, then stack the groups.
my_data %>%
  # rename_with() replaces the superseded rename_at()/vars() pair.
  rename_with(~ str_c(.x, "_1"), .cols = c(type, code)) %>%
  pivot_longer(
    cols = -app,
    # The part before "_" names the output column; the part after is the group.
    names_to = c(".value", "grp"),
    names_sep = "_",
    # Drop the rows that come from empty second pairs.
    values_drop_na = TRUE
  ) %>%
  select(-grp)
# A tibble: 3 x 3
# app type code
# <chr> <chr> <dbl>
#1 a blue 1
#2 b red 2
#3 b blue 3
I'm looking for an automated way of converting this:
# Input: `c` names the row (by its `a` value) whose `b` should be looked up;
# NA in `c` means the row is only a lookup target.
dat <- tribble(
  ~a,  ~b, ~c,
  "x", 1,  "y",
  "y", 2,  NA,
  "q", 4,  NA,
  "z", 3,  "q"
)
to:
# Expected result: `d` holds the `b` value of the row referenced by `c`.
tribble(
  ~a,  ~b, ~d,
  "x", 1,  2,
  "z", 3,  4
)
So, the column c in dat encodes which row in dat to look at to grab a value for a new column d, and if c is NA, toss that row from the output. Any tips?
We can join dat with itself using c and a columns.
library(dplyr)
# Self-join: the right-hand side is a lookup table (a, d = b), matched on the
# left-hand `c`; rows whose `c` is NA find no partner and are dropped.
dat %>%
  inner_join(
    select(dat, a, d = b),
    by = c("c" = "a")
  )
# A tibble: 2 x 4
# a b c d
# <chr> <dbl> <chr> <dbl>
#1 x 1 y 2
#2 z 3 q 4
In base R, we can do this with merge :
# Self-merge: match each row's `c` against column `a` of a copy of dat that has
# its third column (`c`) removed.
merge(dat, dat[-3], by.x = 'c', by.y = 'a')
We create the 'd' with lead of 'b' and filter out the NA rows of 'c' and remove the c column with select
library(dplyr)
# Look `d` up by key instead of by position: lead(b) only works when the
# referenced row is the very next one, which fails for the z row of `dat`
# (its target q sits above it, so lead() would give NA).
dat %>%
  # match(c, a) is the row index whose `a` equals this row's `c` (NA if none).
  mutate(d = b[match(c, a)]) %>%
  # Rows without a reference are lookup targets only.
  filter(!is.na(c)) %>%
  select(-c)
# A tibble: 2 x 3
# a b d
# <chr> <dbl> <dbl>
#1 x 1 2
#2 z 3 4
Or more compactly
# Compact variant: look up `d` by key (not by row position), drop `c`, and let
# na.omit() remove the rows that had no reference and therefore no `d`.
dat %>%
  # incomparables = NA guarantees that an NA in `c` never matches anything.
  mutate(d = b[match(c, a, incomparables = NA)], c = NULL) %>%
  na.omit()
Or with fill
library(tidyr)
# Variant that groups each referencing row with the rows below it and takes
# the next `b` inside that group.
# NOTE(review): lead(b) takes the value from the following row of the group,
# so this only equals the lookup by `c` when the referenced row directly
# follows the referencing one; for the `dat` defined above, the z row
# (c = 'q') is last in its group and gets NA.
dat %>%
# Copy `c` so the original stays available for the final filter.
mutate(c1 = c) %>%
# Carry each non-NA reference down over the NA rows below it.
fill(c1) %>%
group_by(c1) %>%
mutate(d = lead(b)) %>%
ungroup %>%
# Keep only the rows that carried a reference, then drop the helper columns.
filter(!is.na(c)) %>%
select(-c, -c1)
Or in data.table
library(data.table)
# Add `d` by reference via a key lookup (match on `a`), keep the rows that
# carry a reference, drop `c`, and print. shift(b, type = "lead") would take
# the next row instead, which is wrong for the z row of `dat`.
setDT(dat)[, d := b[match(c, a)]][!is.na(c)][, c := NULL][]
# a b d
#1: x 1 2
#2: z 3 4
NOTE: Both solutions are simple and don't require any joins. Besides, they give the expected output in the OP's post
Or using match from base R
# match(c, a, nomatch = 0) gives, for each row, the index of the row whose `a`
# equals its `c`, and 0 where there is none; indexing `b` with 0 contributes
# nothing, so only the looked-up values remain. They are bound to the
# complete rows of dat, and [, -3] then drops the third column (`c`).
cbind(na.omit(dat), d = with(dat, b[match(c, a, nomatch = 0)]))[, -3]
# a b d
#1 x 1 2
#2 z 3 4
I would like to deduplicate my data, keeping the row that has the most frequent appearances. If there is a tie in rows, I don't care which gets returned—the first in alphabetical or numeric order is fine. I would like to do this by group of id and var.
MRE:
# Minimal example: one id, two vars; "d" is the clear mode of group b, while
# "f" and "g" tie in group c.
df <- data.frame(
  id = rep("a", times = 8),
  var = rep(c("b", "c"), each = 4),
  val = c(rep("d", 3), "e", "f", "f", "g", "g")
)
> df
id var val
1 a b d
2 a b d
3 a b d
4 a b e
5 a c f
6 a c f
7 a c g
8 a c g
Should be:
id var val
1 a b d
2 a c f
I'm working with large datasets and tidyverse pipe chains, so a dplyr solution would be preferable.
Use table and which.max to extract the mode:
# Per (id, var) group, tabulate `val` and keep the name of the most frequent
# value; which.max() returns the first maximum, which settles ties.
df %>%
  group_by(id, var) %>%
  summarise(val = names(which.max(table(val))))
# A tibble: 2 x 3
# Groups: id [?]
# id var val
# <fct> <fct> <chr>
#1 a b d
#2 a c f
Another way to do this in base R: Create a three way contingency table directly, and then find the max column along the third axis:
# table(df) is an id x var x val array of counts; for every (id, var) cell,
# take the name of the largest count along the val axis.
apply(
  table(df),
  MARGIN = c(1, 2),
  FUN = function(counts) names(which.max(counts))
)
# var
#id b c
# a "d" "f"
Convert this to a data frame:
# Same id x var matrix of modes, flattened into one row per (id, var) cell;
# the mode ends up in the column named Freq.
as.data.frame.table(
  apply(
    table(df),
    MARGIN = c(1, 2),
    FUN = function(counts) names(which.max(counts))
  )
)
# id var Freq
#1 a b d
#2 a c f
Using dplyr:
library(dplyr)
# Count each (id, var, val) combination, order by count, and keep the first
# (most frequent) row of every (id, var) group.
df %>%
  group_by(id, var, val) %>%
  # summarise() peels off the last grouping level, leaving groups (id, var).
  summarise(n = n()) %>%
  arrange(desc(n)) %>%
  filter(row_number() == 1) %>%
  ungroup() %>%
  select(-n)
# # A tibble: 2 x 3
# id var val
# <fct> <fct> <fct>
# 1 a b d
# 2 a c f
One option could be using table and max as:
library(dplyr)
# Keep the rows whose `val` is (one of) the most frequent in its (id, var)
# group, then take the first such row.
df %>%
  group_by(id, var) %>%
  filter({
    freq <- table(val)
    # Index the table by each row's own value to get a per-row frequency.
    # Comparing table(val) directly would yield one entry per distinct level
    # rather than per row, which only lines up by coincidence.
    as.vector(freq[as.character(val)]) == max(freq)
  }) %>%
  slice(1)
# # A tibble: 2 x 3
# # Groups: id, var [2]
# id var val
# <fctr> <fctr> <fctr>
# 1 a b d
# 2 a c f
NOTE: the a/c group is a tie between f and g. Per OP, any record can be returned in case of a tie.
I doubt this is any faster, but another option is
# Run-length approach: within each (id, var) group, find the longest run of
# identical consecutive `val`s and keep the row on which that run ends.
# NOTE(review): rle() only sees consecutive repeats, so this presumably expects
# equal values to be adjacent within each group - confirm for real data.
df %>%
group_by(id, var) %>%
# `%>%` binds tighter than `==`, so the run lengths are piped into the braces
# first and row_number() is compared with the result.
filter(row_number() == rle(as.character(val))$lengths %>%
# Sum the run lengths up to and including the longest run: that is the index
# of the last row of the longest run.
{sum(.[1:which.max(.)])})
A dplyr solution using count:
library(dplyr)
# Count every (id, var, val) combination, most frequent first, then keep the
# first `val` of each (id, var) group.
df %>%
  # TRUE rather than T: T is an ordinary variable that can be reassigned.
  count(id, var, val, sort = TRUE) %>%
  group_by(id, var) %>%
  # Plain summarise() replaces the superseded summarize_at().
  summarize(val = head(val, 1))
# # A tibble: 2 x 3
# id var val
# <fctr> <fctr> <fctr>
# 1 a b d
# 2 a c f
or maybe more idiomatic but longer:
# Same idea with slice(): after sorting by count, the first row of every
# (id, var) group is its most frequent value.
df %>%
  # TRUE rather than T: T is an ordinary variable that can be reassigned.
  count(id, var, val, sort = TRUE) %>%
  group_by(id, var) %>%
  slice(1) %>%
  select(-n) %>%
  ungroup()
Or with tally for same output with slightly different syntax:
# tally() counts the rows of each (id, var, val) group and returns a result
# that is still grouped by (id, var), so slice(1) picks one row per pair.
df %>%
  group_by(id, var, val) %>%
  # TRUE rather than T: T is an ordinary variable that can be reassigned.
  tally(sort = TRUE) %>%
  slice(1) %>%
  select(-n) %>%
  ungroup()
and a base solution :
# Add a constant column x = 1 and sum it over every combination of the other
# columns: x becomes the number of occurrences of each (id, var, val).
df2 <- aggregate(x ~ .,cbind(df,x=1),sum)
# Order by descending count, then keep the first val of each (id, var) group.
aggregate(val ~ id+var, df2[order(-df2$x),],head,1)
# id var val
# 1 a b d
# 2 a c f
Here is my try:
library(dplyr)
# Attach to every row the size of its (id, var, val) group, then keep the
# first row of each (id, var) group once the rows are ordered by that size.
df %>%
  group_by(id, var, val) %>%
  mutate(n = n()) %>%
  group_by(id, var) %>%
  # arrange() ignores the grouping, so this orders the whole table.
  arrange(-n) %>%
  slice(1) %>%
  select(-n)
`