I have a data set like this:
df<-data.frame(ID=(1:5), column1=c("AA","GG","AG","AA","AT"), column2=c("AA","GG","AG","AA","AT"), stringsAsFactors=FALSE)
df
ID column1 column2
1 AA AA
2 GG GG
3 AG AG
4 AA AA
5 AT AT
I want to separate each column into 2 letters so the output will look something like this:
ID column1.A column1.B column2.A column2.B
1 A A A A
2 G G G G
3 A G A G
4 A A A A
5 A T A T
Can you help me please?
library(tidyverse)
df %>%
  pivot_longer(-ID) %>%
  # Break each two-letter value into a list of single characters.
  mutate(tmp = str_split(value, pattern = "")) %>%
  unnest(tmp) %>%
  # Label the letters within each (ID, column) pair as A, B, ...
  group_by(ID, name) %>%
  mutate(id_row = LETTERS[row_number()]) %>%
  # Drop the grouping before reshaping so `name` is free to be pivoted.
  ungroup() %>%
  # `ID` alone identifies an output row; `name` is consumed by `names_from`
  # and therefore must not also be listed in `id_cols`.
  pivot_wider(
    id_cols = ID,
    names_from = c(name, id_row),
    values_from = tmp,
    names_sep = "."
  )
#> # A tibble: 5 x 5
#> ID column1.A column1.B column2.A column2.B
#> <int> <chr> <chr> <chr> <chr>
#> 1 1 A A A A
#> 2 2 G G G G
#> 3 3 A G A G
#> 4 4 A A A A
#> 5 5 A T A T
data
df <-
data.frame(
ID = (1:5),
column1 = c("AA", "GG", "AG", "AA", "AT"),
column2 = c("AA", "GG", "AG", "AA", "AT"),
stringsAsFactors = FALSE
)
Created on 2021-11-05 by the reprex package (v2.0.1)
data.table
library(data.table)
setDT(df)
melt(data = df, id.vars = "ID") %>%
.[, list(value = unlist(strsplit(value, split = ""))), by = list(ID, variable)] %>%
.[, id_row := LETTERS[rowid(ID, variable)]] %>%
dcast(formula = ID ~ variable + id_row, value.var = "value")
ID column1_A column1_B column2_A column2_B
1: 1 A A A A
2: 2 G G G G
3: 3 A G A G
4: 4 A A A A
5: 5 A T A T
Using strsplit.
# Keep the ID column as-is and, for every other column, split each string
# into single characters and stack the pieces row-wise into a matrix; the
# per-column matrices are then bound side by side into one data frame.
cbind(
  df[1],
  do.call(
    cbind.data.frame,
    lapply(df[-1], function(col) do.call(rbind, strsplit(col, '')))
  )
)
# ID column1.1 column1.2 column2.1 column2.2
# 1 1 A A A A
# 2 2 G G G G
# 3 3 A G A G
# 4 4 A A A A
# 5 5 A T A T
Yet another solution, tidyverse-based:
library(tidyverse)
df<-data.frame(ID=(1:5), column1=c("AA","GG","AG","AA","AT"), column2=c("AA","GG","AG","AA","AT"), stringsAsFactors=FALSE)
df %>%
  mutate(
    # For every "column*" variable, split between two adjacent capital
    # letters; `simplify = TRUE` returns a character matrix, which is stored
    # as a matrix column named "<col>_sep".
    across(
      starts_with("column"),
      ~ str_split(.x, "(?<=[A-Z])(?=[A-Z])", simplify = TRUE),
      .names = "{.col}_sep"
    ),
    # Remove the original, unsplit columns.
    column1 = NULL,
    column2 = NULL
  )
#> ID column1_sep.1 column1_sep.2 column2_sep.1 column2_sep.2
#> 1 1 A A A A
#> 2 2 G G G G
#> 3 3 A G A G
#> 4 4 A A A A
#> 5 5 A T A T
Another possibility, based on a pivot_longer followed by a pivot_wider:
library(tidyverse)
df<-data.frame(ID=(1:5), column1=c("AA","GG","AG","AA","AT"), column2=c("AA","GG","AG","AA","AT"), stringsAsFactors=FALSE)
df %>%
  pivot_longer(-ID) %>%
  # Split each value between two adjacent capital letters into columns A and B.
  separate(value, into = LETTERS[1:2], sep = "(?<=[A-Z])(?=[A-Z])") %>%
  # `id_cols` is passed by name: it sits after `...` in pivot_wider(), so a
  # positional `ID` is not reliably matched to it.
  pivot_wider(
    id_cols = ID,
    names_from = "name",
    values_from = c(A, B),
    names_glue = "{name}.{.value}"
  ) %>%
  # Put the two halves of column1 next to each other.
  relocate(column1.B, .before = column2.A)
#> # A tibble: 5 × 5
#> ID column1.A column1.B column2.A column2.B
#> <int> <chr> <chr> <chr> <chr>
#> 1 1 A A A A
#> 2 2 G G G G
#> 3 3 A G A G
#> 4 4 A A A A
#> 5 5 A T A T
Related
I have a data frame containing a varying number of data points in the same column:
library(tidyverse)
df <- tribble(~id, ~data,
"A", "a;b;c",
"B", "e;f")
I want to obtain one row per data point, separating the content of column data and distributing it on rows. This code gives the expected result, but is clumsy:
df %>%
separate(data,
into = paste0("dat_",1:5),
sep = ";",
fill = "right") %>%
pivot_longer(starts_with("dat_"),
names_to = "data_number",
names_pattern = "dat_(\\d+)") %>%
filter(!is.na(value))
#> # A tibble: 5 x 3
#> id data_number value
#> <chr> <chr> <chr>
#> 1 A 1 a
#> 2 A 2 b
#> 3 A 3 c
#> 4 B 1 e
#> 5 B 2 f
Tidyverse solutions preferred.
Here is one way
library(dplyr)
library(tidyr)
library(data.table)
df %>%
  # Split only on the ";" delimiter; the default `sep` would also split on
  # any other non-alphanumeric character that appears inside a value.
  separate_rows(data, sep = ";") %>%
  # Number the data points within each id (rowid() comes from data.table).
  mutate(data_number = rowid(id), .before = 2)
-output
# A tibble: 5 x 3
id data_number data
<chr> <int> <chr>
1 A 1 a
2 A 2 b
3 A 3 c
4 B 1 e
5 B 2 f
library(dplyr)
library(tidyr)
df %>%
  # One output row per ";"-separated data point. The delimiter is given
  # explicitly so that other punctuation inside a value is not split on.
  separate_rows(data, sep = ";")
output:
# A tibble: 5 x 2
id data
<chr> <chr>
1 A a
2 A b
3 A c
4 B e
5 B f
Using str_split and unnest -
library(tidyverse)
df %>%
mutate(data = str_split(data, ';'),
data_number = map(data, seq_along)) %>%
unnest(c(data, data_number))
# id data data_number
# <chr> <chr> <int>
#1 A a 1
#2 A b 2
#3 A c 3
#4 B e 1
#5 B f 2
I could write a loop to do this, but I was wondering how this might be done in R with dplyr. I have a data frame with two columns. Column 1 is the group, Column 2 is the value. I would like a data frame that has every combination of two values from each group in two separate columns. For example:
input = data.frame(col1 = c(1,1,1,2,2), col2 = c("A","B","C","E","F"))
input
#> col1 col2
#> 1 1 A
#> 2 1 B
#> 3 1 C
#> 4 2 E
#> 5 2 F
and have it return
output = data.frame(col1 = c(1,1,1,2), col2 = c("A","B","C","E"), col3 = c("B","C","A","F"))
output
#> col1 col2 col3
#> 1 1 A B
#> 2 1 B C
#> 3 1 C A
#> 4 2 E F
I'd like to be able to include it within dplyr syntax:
input %>%
group_by(col1) %>%
???
I tried writing my own function that produces a data frame of combinations like what I need from a vector and sent it into the group_map function, but didn't have success:
# Attempted helper for group_map(): return every 2-element combination of `x`
# as the rows of a data frame.
# NOTE(review): combn() is applied to `x` as a whole; if group_map() passes
# the group's rows as a data frame rather than the col2 vector, that is
# presumably the source of the error reported below — confirm.
combos = function(x, ...) {
x = t(combn(x, 2))
return(as.data.frame(x))
}
input %>%
group_by(col1) %>%
group_map(.f = combos)
Produced an error.
Any suggestions?
You can do :
library(dplyr)
data <- input %>%
group_by(col1) %>%
summarise(col2 = t(combn(col2, 2)))
cbind(data[1], data.frame(data$col2))
# col1 X1 X2
# <dbl> <chr> <chr>
#1 1 A B
#2 1 A C
#3 1 B C
#4 2 E F
input %>%
group_by(col1) %>%
nest(data=-col1) %>%
mutate(out= map(data, ~ t(combn(unlist(.x), 2)))) %>%
unnest(out) %>% select(-data)
# A tibble: 4 x 2
# Groups: col1 [2]
col1 out[,1] [,2]
<dbl> <chr> <chr>
1 1 A B
2 1 A C
3 1 B C
4 2 E F
Or :
# Build all unordered pairs of col2 values for one group.
# `x` holds the group's rows; because group_map() is called with
# `.keep = TRUE` below, column 1 is the grouping key and column 2 holds the
# values to combine. The pairs are returned as a two-column matrix column.
combos <- function(x, ...) {
  tibble(
    col1 = x[[1, 1]],
    col2 = t(combn(unlist(x[[2]], use.names = FALSE), 2))
  )
}
input %>%
  group_by(col1) %>%
  group_map(.f = combos, .keep = TRUE) %>%
  # Stack the per-group tibbles; base do.call() replaces the deprecated
  # purrr::invoke() and needs no extra package.
  do.call(rbind, .) %>%
  tibble()
# A tibble: 4 x 2
col1 col2[,1] [,2]
<dbl> <chr> <chr>
1 1 A B
2 1 A C
3 1 B C
4 2 E F
Thank you! In terms of parsimony, I like both the answer from Ben
input %>%
group_by(col1) %>%
do(data.frame(t(combn(.$col2, 2))))
and Ronak
data <- input %>%
group_by(col1) %>%
summarise(col2 = t(combn(col2, 2)))
cbind(data[1], data.frame(data$col2))
I would like to make this table:
Look like this:
Using dplyr:
df <- tibble(id = c(1,1,3),
b = c("foo", "bar", "foo"),
c = c("x", "y", "z"))
df
# A tibble: 3 x 3
id b c
<dbl> <chr> <chr>
1 1 foo x
2 1 bar y
3 3 foo z
df %>% group_by(id) %>%
summarize(new = paste(b, collapse = ","),
new2 = paste(c, collapse = ","))
which results in:
# A tibble: 2 x 3
id new new2
<dbl> <chr> <chr>
1 1 foo,bar x,y
2 3 foo z
Let's consider the following data frame:
set.seed(123)
data <- data.frame(col1 = factor(rep(c("A", "B", "C"), 4)),
col2 = factor(c(rep(c("A", "B", "C"), 3), c("A", "A", "A"))),
val1 = 1:12,
val2 = rnorm(12, 10, 15))
The contingency table is as follows:
cont_tab <- table(data$col1, data$col2, dnn = c("col1", "col2"))
cont_tab
col2
col1 A B C
A 4 0 0
B 1 3 0
C 1 0 3
As you can see, some pairs didn't occur: (A,B), (A,C), (B,C), (C,B). The end goal of my analysis is to list all of the pairs (in this case 9) and show a statistic for each of them. While using the dplyr::group_by() function I hit a limitation. Namely, dplyr::group_by() considers only existing pairs (pairs that occurred at least once):
data %>%
group_by(col1, col2) %>%
summarize(stat = sum(val2) - sum(val1))
# A tibble: 5 x 3
# Groups: col1 [?]
col1 col2 stat
<fct> <fct> <dbl>
1 A A 58.1
2 B A -16.4
3 B B 17.0
4 C A -12.9
5 C C -41.9
The output I have in mind has 9 rows (4 of which have stat equal to 0). Is it doable in dplyr?
EDIT: Sorry for being too vague at the beginning. The real problem is more complex than counting the number of times a particular pair occurs. I added the new data in order to make the real problem more visible.
It is much easier to add spread from tidyr to get the same result as with table
library(dplyr)
library(tidyr)
count(data, col1, col2) %>%
spread(col2, n, fill = 0)
# A tibble: 3 x 4
# Groups: col1 [3]
# col1 A B C
# <fct> <dbl> <dbl> <dbl>
#1 A 4 0 0
#2 B 1 3 0
#3 C 1 0 3
NOTE: The group_by/summarise step is changed to count here
As @divibisan suggested, if the OP wanted long format, then add gather at the end
data %>%
group_by(col1, col2) %>%
summarize(stat = n()) %>%
spread(col2, stat, fill = 0) %>%
gather(col2, stat, A:C)
# A tibble: 9 x 3
# Groups: col1 [3]
# col1 col2 stat
# <fct> <chr> <dbl>
#1 A A 4
#2 B A 1
#3 C A 1
#4 A B 0
#5 B B 3
#6 C B 0
#7 A C 0
#8 B C 0
#9 C C 3
Update
With the updated data in OP's post
data %>%
group_by(col1, col2) %>%
summarize(stat = sum(val2) - sum(val1)) %>%
spread(col2, stat, fill = 0) %>%
gather(col2, stat, -1)
# A tibble: 9 x 3
# Groups: col1 [3]
# col1 col2 stat
# <fct> <chr> <dbl>
#1 A A 7.76
#2 B A -20.8
#3 C A 6.97
#4 A B 0
#5 B B 28.8
#6 C B 0
#7 A C 0
#8 B C 0
#9 C C 9.56
This is doable even without dplyr
as.data.frame(table(data$col1, data$col2, dnn = c("col1", "col2")))
# col1 col2 Freq
#1 A A 4
#2 B A 1
#3 C A 1
#4 A B 0
#5 B B 3
#6 C B 0
#7 A C 0
#8 B C 0
#9 C C 3
You can use tidyr::complete
library(tidyverse)
data %>%
group_by(col1, col2) %>%
summarize(stat = n()) %>%
# additions below
ungroup %>%
complete(col1, col2, fill = list(stat = 0))
# # A tibble: 9 x 3
# col1 col2 stat
# <chr> <chr> <dbl>
# 1 A A 4
# 2 A B 0
# 3 A C 0
# 4 B A 1
# 5 B B 3
# 6 B C 0
# 7 C A 1
# 8 C B 0
# 9 C C 3
You can also use count for the first part. The code below gives the same output as the code above
data %>%
count(col1, col2) %>%
complete(col1, col2, fill = list(n = 0))
Also a tidyverse possibility using tidyr::complete():
data %>%
group_by_all() %>%
add_count() %>%
complete(col1, col2, fill = list(n = 0)) %>%
distinct()
col1 col2 n
<fct> <fct> <dbl>
1 A A 4
2 A B 0
3 A C 0
4 B A 1
5 B B 3
6 B C 0
7 C A 1
8 C B 0
9 C C 3
Or using tidyr::expand():
data %>%
count(col1, col2) %>%
right_join(data %>%
expand(col1, col2), by = c("col1" = "col1",
"col2" = "col2")) %>%
replace_na(list(n = 0))
Or using tidyr::crossing():
data %>%
count(col1, col2) %>%
right_join(crossing(col1 = unique(data$col1),
col2 = unique(data$col2)), by = c("col1" = "col1",
"col2" = "col2")) %>%
replace_na(list(n = 0))
Here is a little workaround, I hope it works for you. Merge your table with table of all combinations and replace NAs with 0.
data %>%
  group_by(col1, col2) %>%
  summarize(stat = n()) %>%
  # Merge with the grid of all (col1, col2) pairs. The grid is built from the
  # two key columns only: passing the whole data frame to expand.grid() would
  # also cross every other column (val1, val2) and multiply the rows.
  merge(
    unique(expand.grid(data[c("col1", "col2")])),
    by = c("col1", "col2"),
    all = TRUE
  ) %>%
  # Pairs that never occurred have no count after the merge; report them as 0.
  replace_na(list(stat = 0))
I have data:
rowID incidentID participant.type
1 1 A
2 1 B
3 2 A
4 3 A
5 3 B
6 3 C
7 4 B
8 4 C
And I would like to end up with:
rowID incident participant.type participant.type.1 participant.type.2
1 1 A B
2 2 A
3 3 A B C
4 4 B C
I tried the spread but can't achieve one line per incident; I don't think I have a way of creating a key-value pair so I wonder if there is some other method for doing this.
Before using spread(), you need to create a proper key argument.
df %>% select(-rowID) %>%
group_by(incidentID) %>%
mutate(id = 1:n()) %>%
spread(id, participant.type)
# incidentID `1` `2` `3`
# <int> <fct> <fct> <fct>
# 1 1 A B NA
# 2 2 A NA NA
# 3 3 A B C
# 4 4 B C NA
Since your grouping is based on the row order within the incidentID column, the following simple solution will also work.
It is just filtering the dataframe and then merging in the end.
It is probably not the best solution in terms of effective use of computing power, but it is easy to understand.
library(tidyverse)
df <-
tribble(
~rowID, ~incidentID, ~participant.type,
1, 1, "A",
2, 1, "B",
3, 2, "A",
4, 3, "A",
5, 3, "B",
6, 3, "C",
7, 4, "B",
8, 4, "C")
df_1 <- df %>%
select(-rowID) %>%
group_by(incidentID) %>%
filter(row_number()==1)
df_2 <- df %>%
select(-rowID) %>%
group_by(incidentID) %>%
filter(row_number()==2) %>%
rename(participant.type.1 = participant.type)
df_3 <- df %>%
select(-rowID) %>%
group_by(incidentID) %>%
filter(row_number()==3) %>%
rename(participant.type.2 = participant.type)
full_join(df_1, full_join(df_2, df_3))
Result:
Joining, by = "incidentID"
Joining, by = "incidentID"
# A tibble: 4 x 4
# Groups: incidentID [?]
incidentID participant.type participant.type.1 participant.type.2
<dbl> <chr> <chr> <chr>
1 1 A B NA
2 2 A NA NA
3 3 A B C
4 4 B C NA
Here's my solution:
df %>%
  select(-rowID) %>%
  group_by(incidentID) %>%
  nest() %>%
  # Collapse each incident's participant types into one "_"-joined string.
  mutate(data = map_chr(data, ~ str_c(.x$participant.type, collapse = "_"))) %>%
  # Drop the grouping so the following steps see every column.
  ungroup() %>%
  # Split the string back into up to three columns. The delimiter is given
  # explicitly, and `fill = "right"` pads shorter incidents with NA without
  # a warning.
  separate(
    data,
    into = paste0("participant.type.", 0:2),
    sep = "_",
    fill = "right"
  ) %>%
  # Show missing participants as empty strings; columns are chosen by name
  # rather than by position.
  mutate(across(starts_with("participant.type."), ~ replace_na(.x, "")))
We can use reshape2::dcast for this
reshape2::dcast(df, insidentID ~ participant.type)
# insidentID A B C
# 1 1 <NA> B <NA>
# 2 8 <NA> B <NA>
# 3 12 <NA> <NA> C
# 4 16 A <NA> <NA>
# 5 24 <NA> B <NA>
# 6 27 <NA> B C
# 7 29 <NA> <NA> C
with the data
set.seed(123)
df <- data.frame(insidentID = sample(0:30, 8L, replace = TRUE),
participant.type = sample(LETTERS[1:3], 8L, replace = TRUE),
stringsAsFactors = FALSE)
df
# insidentID participant.type
# 1 8 B
# 2 24 B
# 3 12 C
# 4 27 B
# 5 29 C
# 6 1 B
# 7 16 A
# 8 27 C
The 'related question' link provided by @markus shows a variety of other solutions, including what appears to be the most concise in a tidyverse format:
df %>%
group_by(incidentID) %>%
mutate(rn = paste0("newcolumn",row_number())) %>%
spread(rn, participant.type)
gives:
incidentID newcolumn1 newcolumn2 newcolumn3
<int> <fct> <fct> <fct>
1 1 A B NA
2 2 A NA NA
3 3 A B C
4 4 B C NA
A