I am working with genetic data and I need to concatenate pairs of columns. The data I have has the major and minor alleles in separate columns (e.g., allele1a, allele1b, allele2a, allele2b, etc.). I need a way to paste together pairs of columns for the entire data frame. I included a sample below, but my data has 1.7 million pairs (so I have 3.4 million columns right now), so it will not work if I need to name each column. I will change the column names later. Any guidance is appreciated if there is a way to do this in R. I have tried to create a sequence and paste them, something like:
df <- data.frame(id = seq(1,20),
var1 = rep("A", 20),
var2 = c(rep("T", 10), rep("A", 10)),
var3 = rep("C", 20),
var4 = c(rep("C", 10), rep("G", 10)),
var5 = rep("A", 20),
var6 = c(rep("A", 10), rep("G", 10)),
stringsAsFactors = FALSE)
i <- seq.int(1, length(ped), by = 2L)
df <- paste0(df[i], df[i+1])
but that did not work. I want it to go from:
id var1 var2 var3 var4 var5 var6
1 1 A T C C A A
2 2 A T C C A A
3 3 A T C C A A
4 4 A T C C A A
5 5 A T C C A A
6 6 A T C C A A
7 7 A T C C A A
8 8 A T C C A A
9 9 A T C C A A
10 10 A T C C A A
11 11 A A C G A G
12 12 A A C G A G
13 13 A A C G A G
14 14 A A C G A G
15 15 A A C G A G
16 16 A A C G A G
17 17 A A C G A G
18 18 A A C G A G
19 19 A A C G A G
20 20 A A C G A G
to:
id var1 var2 var3
1 1 AT CC AA
2 2 AT CC AA
3 3 AT CC AA
4 4 AT CC AA
5 5 AT CC AA
6 6 AT CC AA
7 7 AT CC AA
8 8 AT CC AA
9 9 AT CC AA
10 10 AT CC AA
11 11 AA CG AG
12 12 AA CG AG
13 13 AA CG AG
14 14 AA CG AG
15 15 AA CG AG
16 16 AA CG AG
17 17 AA CG AG
18 18 AA CG AG
19 19 AA CG AG
20 20 AA CG AG
edit:
Thank you!!! I was able to adapt two of the answers for my data and #akrun's ran a little faster. I created a subset of my data with 100 rows and 100,000 columns and the results are below:
microbenchmark(
+ {
+ new <- ped %>%
+ gather(key = V, value = value, -id) %>%
+ mutate(V = str_extract(V, "\\d+") %>% as.numeric()) %>%
+ group_by(id) %>%
+ mutate(pair = ceiling(V / 2)) %>%
+ group_by(id, pair) %>%
+ summarise(combined = paste(value, collapse = "")) %>%
+ mutate(V_combo = paste0("V", pair)) %>%
+ select(-pair) %>%
+ spread(key = V_combo, value = combined) %>%
+ select(id, paste0("V", seq(1, ncol(.)-1, 1)))
+ },
+ {
+ out <- ped[1]
+ new_cols <- paste0("V", seq(1, (ncol(ped)-1)/2))
+
+ out[new_cols] <- lapply(seq(2, ncol(ped)-1, 2),
+ function(i) do.call(paste0, ped[i:(i+1)]))
+ },
+ times = 1
+ )
Unit: seconds
expr min lq mean median uq max neval
camille 250.30901 250.30901 250.30901 250.30901 250.30901 250.30901 1
akrun 23.52434 23.52434 23.52434 23.52434 23.52434 23.52434 1
>
> new <- data.frame(new, stringsAsFactors = FALSE)
> identical(new, out)
[1] TRUE
We can loop over the columns, subsetting each column together with its adjacent column, paste each pair together with `do.call`, and assign the results as new columns of the new dataset
# Keep the id column, then paste each adjacent pair of allele columns
# (2 & 3, 4 & 5, ...) into a single new column.
out <- df[1]
# Start positions of the pairs. Stopping at ncol(df) - 1 guarantees that
# column i + 1 exists for every start position i.
pair_starts <- seq(2, ncol(df) - 1, by = 2)
# Derive the new column names from the number of pairs instead of
# hard-coding 1:3, so the same code works for any number of pairs.
out[paste0("var", seq_along(pair_starts))] <- lapply(
  pair_starts,
  function(i) do.call(paste0, df[i:(i + 1)])
)
Here's a tidyverse way designed to scale fairly well. Instead of hard-coding that you want to pair columns 1 & 2, 3 & 4, and 5 & 6, I'm reshaping to long data to get a variable number, grouping those into pairs by dividing the variable number by 2, collapsing the letters in each pair, and reshaping back to wide. This way, you can do the same procedure on any even number of columns.
library(tidyverse)
...
Filtering for ID 1 to show a glimpse of this:
df %>%
gather(key = var, value = value, -id) %>%
mutate(var = str_extract(var, "\\d+") %>% as.numeric()) %>%
group_by(id) %>%
mutate(pair = ceiling(var / 2)) %>%
filter(id == 1)
#> # A tibble: 6 x 4
#> # Groups: id [1]
#> id var value pair
#> <int> <dbl> <chr> <dbl>
#> 1 1 1 A 1
#> 2 1 2 T 1
#> 3 1 3 C 2
#> 4 1 4 C 2
#> 5 1 5 A 3
#> 6 1 6 A 3
Then collapsing strings as a summarizing value for each combination of ID and pair:
df %>%
gather(key = var, value = value, -id) %>%
mutate(var = str_extract(var, "\\d+") %>% as.numeric()) %>%
group_by(id) %>%
mutate(pair = ceiling(var / 2)) %>%
group_by(id, pair) %>%
summarise(combined = paste(value, collapse = ""))
#> # A tibble: 60 x 3
#> # Groups: id [?]
#> id pair combined
#> <int> <dbl> <chr>
#> 1 1 1 AT
#> 2 1 2 CC
#> 3 1 3 AA
#> 4 2 1 AT
#> 5 2 2 CC
#> 6 2 3 AA
#> 7 3 1 AT
#> 8 3 2 CC
#> 9 3 3 AA
#> 10 4 1 AT
#> # ... with 50 more rows
And using spread to get back into a wide format.
df %>%
gather(key = var, value = value, -id) %>%
mutate(var = str_extract(var, "\\d+") %>% as.numeric()) %>%
group_by(id) %>%
mutate(pair = ceiling(var / 2)) %>%
group_by(id, pair) %>%
summarise(combined = paste(value, collapse = "")) %>%
mutate(var_combo = paste0("var", pair)) %>%
select(-pair) %>%
spread(key = var_combo, value = combined) %>%
head()
#> # A tibble: 6 x 4
#> # Groups: id [6]
#> id var1 var2 var3
#> <int> <chr> <chr> <chr>
#> 1 1 AT CC AA
#> 2 2 AT CC AA
#> 3 3 AT CC AA
#> 4 4 AT CC AA
#> 5 5 AT CC AA
#> 6 6 AT CC AA
Created on 2018-11-07 by the reprex package (v0.2.1)
Using tidyverse, you can compose the modifying expressions ahead of time, then pass them all to transmute in bulk. This solution uses column names and is therefore robust to the column ordering: if you shuffle your allele columns, this should still give you the same answer.
library( tidyverse )
# Create expressions of the form allele1 = str_c(allele1a, allele1b)
v <- str_c("allele",1:3) %>% set_names %>%
map( ~glue::glue("str_c({.}a, {.}b)") ) %>% map( rlang::parse_expr )
df %>% transmute( id = id, !!!v )
# # A tibble: 20 x 4
# id allele1 allele2 allele3
# <int> <chr> <chr> <chr>
# 1 1 AT CC AA
# 2 2 AT CC AA
# 3 3 AT CC AA
# 4 4 AT CC AA
# ...
I modified your data to closer match your description:
df <- data_frame(id = seq(1,20),
allele1a = rep("A", 20),
allele1b = c(rep("T", 10), rep("A", 10)),
allele2a = rep("C", 20),
allele2b = c(rep("C", 10), rep("G", 10)),
allele3a = rep("A", 20),
allele3b = c(rep("A", 10), rep("G", 10)))
using base r you could do:
a <- seq(2,ncol(df),2)
b <- paste0(unlist(df[a]),unlist(df[a+1]))
d <- data.frame(matrix(b,nrow(df)))
result <- cbind(df[1],d)
This can also be written in a one line:
(dat = data.frame(matrix(paste0(unlist(df[a<-seq(2,ncol(df),2)]),unlist(df[a+1])),nrow(df))))
X1 X2 X3
1 AT CC AA
2 AT CC AA
3 AT CC AA
4 AT CC AA
5 AT CC AA
6 AT CC AA
7 AT CC AA
8 AT CC AA
9 AT CC AA
10 AT CC AA
11 AA CG AG
12 AA CG AG
13 AA CG AG
14 AA CG AG
15 AA CG AG
16 AA CG AG
17 AA CG AG
18 AA CG AG
19 AA CG AG
20 AA CG AG
Then cbind it with the id column:
cbind(df[1],dat)
df <- data.frame(id = seq(1,20),
var1 = rep("A", 20),
var2 = c(rep("T", 10), rep("A", 10)),
var3 = rep("C", 20),
var4 = c(rep("C", 10), rep("G", 10)),
var5 = rep("A", 20),
var6 = c(rep("A", 10), rep("G", 10)),
stringsAsFactors = FALSE)
df2 <- data.frame(id = df[,1], var1 = paste(df[,2], df[,3], sep = ""),
var2 = paste(df[,4], df[,5], sep = ""),
var3 = paste(df[,6], df[,7], sep = ""))
Related
I want to select a row for each group created by variable a. It should be the row with the highest value for variable c, but if variable b is TRUE, then the row with b = TRUE and maximum c within that group should be selected.
I have the following code:
set.seed(42)
a <- rep(1:3, each = 3)
b <- sample(c(0,1), size = 9, replace = T)
c <- sample(1:9, size = 9, replace = F)
df <- data.frame(a = a,
b = b,
c = c)
df %>% group_by(a) %>% filter(b == 1) %>%
arrange(desc(c), .by_group = T) %>%
summarise_all(function(x) x[1]) -> df1
df %>% group_by(a) %>% filter(all(b != 1)) %>%
arrange(desc(c), .by_group = T) %>%
summarise_all(function(x) x[1]) -> df2
df3 <- rbind(df1, df2)
This works, but I wonder if there is a simpler way to achieve the same.
You could filter the values for groups and then do your summarize.
df %>%
group_by(a) %>%
filter(all(b==0) | b==1) %>%
summarize(b = first(b), c = max(c))
# a b c
# <int> <dbl> <int>
# 1 1 0 8
# 2 2 1 5
# 3 3 1 9
So we only keep the values per group if b==1 or if all b==0
We can do it with ifelse inside summarise and without the need to filter b values.
set.seed(42)
a <- rep(1:3, each = 3)
b <- sample(c(0,1), size = 9, replace = T)
cc <- sample(1:9, size = 9, replace = F)
df <- data.frame(a = a,
b = b,
cc = cc)
# For each group of `a`: take the largest cc among rows with b == 1 if any
# such row exists, otherwise the largest cc of the whole group.
#
# `teste` is computed before `b` on purpose: inside summarise() a later
# expression sees the results of earlier ones, so after `b = max(b)` the
# name `b` would be a single value and `cc[b == 1]` would select every row
# of the group instead of only the rows where b == 1.
df |>
  group_by(a) |>
  summarise(
    teste = ifelse(any(b == 1), max(cc[b == 1]), max(cc)),
    b = max(b)
  ) |>
  # Restore the original column order of the result.
  select(a, b, teste)
Also, never name something c in R.
library(data.table)
setDT(df)
# select the maximum c value, grouped by a and b
# then negative order by b (so rows with b == 1 get on top),
# and select the first row of each a-group
df[df[, .I[c == max(c)], by = .(a,b)]$V1][order(a,-b), .SD[1], by = a]
library(dplyr)
df %>% group_by(a) %>%
arrange(desc(b),desc(c), .by_group = T) %>%
slice_head(n = 1) %>%
ungroup()
#> # A tibble: 3 × 3
#> a b c
#> <int> <dbl> <int>
#> 1 1 0 8
#> 2 2 1 5
#> 3 3 1 9
Input data:
set.seed(42)
a <- rep(1:3, each = 3)
b <- sample(c(0,1), size = 9, replace = T)
c <- sample(1:9, size = 9, replace = F)
df <- data.frame(a = a,
b = b,
c = c)
df
#> a b c
#> 1 1 0 8
#> 2 1 0 7
#> 3 1 0 4
#> 4 2 0 1
#> 5 2 1 5
#> 6 2 1 2
#> 7 3 1 9
#> 8 3 1 3
#> 9 3 0 6
Created on 2023-01-30 with reprex v2.0.2
library(dplyr)
mydf <- data.frame(a_x = c(1,2,3,4,5),
b_x = c(8,9,10,11,12),
a_y = c("k",'b','a','d','z'),
b_y = c('aa','bb','cc','dd','ee'),
prefix=c("a","b","c","a","a"))
mydf
Assuming that the data I have is mydf, I would like to produce the same result as mydf2.
I made a column with the name of the column containing the value to be extracted.
I want to extract the value through this column.
mydf2 <- data.frame(a_x=c(1,2,3,4,5),
b_x=c(8,9,10,11,12),
prefix=c("a","b","c","a","a"),
desired_x_value = c(1,9,NA,4,5),
desired_y_value = c('k','bb',NA,'d','z'))
mydf2
I've used 'get' and 'paste0' but it doesn't work. Can I solve this problem through 'dplyr' chain?
mydf %>% mutate(desired_x_value = get(paste0(prefix,"_x")),
desired_y_value = get(paste0(prefix,"_y")))
So basically you want to create new columns (desired_x_value and desired_y_value) whose values depend on a condition. Using dplyr, I prefer case_when as it is the most readable way to do it, but you could also use (nested) if(else) statements. What it does is: "if X meets condition A do Y, if X meets condition B do Z, if X meets condition ... do ..."
# Pick the value from the column named by `prefix`. Rows whose prefix has no
# matching column (e.g. "c") fall through to the final NA branch.
mydf %>%
  dplyr::mutate(
    # Each case_when() must be closed before the next column is defined;
    # otherwise the second column becomes an argument of the first call.
    desired_x_value = case_when(
      prefix == "a" ~ a_x,
      prefix == "b" ~ b_x,
      TRUE ~ NA_real_
    ),
    desired_y_values = case_when(
      prefix == "a" ~ a_y,
      prefix == "b" ~ b_y,
      TRUE ~ NA_character_
    )
  )
You can remove the columns you don't need anymore in a second step if you want. the code above results in the table:
a_x b_x a_y b_y prefix desired_x_value desired_y_values
1 1 8 k aa a 1 k
2 2 9 b bb b 9 bb
3 3 10 a cc c NA <NA>
4 4 11 d dd a 4 d
5 5 12 z ee a 5 z
You can write a helper function for this :
# Row-wise lookup: for row i, return the value of the column named
# "<prefix[i]>_<group>" in `data`.
#
# data   A data frame holding the candidate columns.
# prefix Character vector with one column prefix per row of `data`.
# group  Suffix appended to each prefix (joined with "_").
#
# Returns a vector with one element per row. A prefix without a matching
# column yields NA for that row, because match() returns NA there.
get_value <- function(data, prefix, group) {
  n_rows <- nrow(data)
  if (n_rows == 0L) {
    # Nothing to look up: return an empty vector of the matrix's type rather
    # than building an index from a descending 1:0 sequence.
    return(as.matrix(data)[integer(0)])
  }
  col_idx <- match(paste(prefix, group, sep = "_"), names(data))
  # A two-column (row, column) index matrix extracts one cell per row.
  data[cbind(seq_len(n_rows), col_idx)]
}
mydf %>%
mutate(desired_x_value = get_value(select(., ends_with('_x')), prefix, 'x'),
desired_y_value = get_value(select(., ends_with('_y')), prefix, 'y'))
# a_x b_x a_y b_y prefix desired_x_value desired_y_value
#1 1 8 k aa a 1 k
#2 2 9 b bb b 9 bb
#3 3 10 a cc c NA <NA>
#4 4 11 d dd a 4 d
#5 5 12 z ee a 5 z
A simple rowwise also works.
mydf %>% rowwise() %>%
mutate(desired_x = ifelse(any(str_detect(names(mydf)[-5], prefix)),
get(paste(prefix, 'x', sep = '_')), NA),
desired_y = ifelse(any(str_detect(names(mydf)[-5], prefix)),
get(paste(prefix, 'y', sep = '_')), NA))
# A tibble: 5 x 7
# Rowwise:
a_x b_x a_y b_y prefix desired_x desired_y
<dbl> <dbl> <chr> <chr> <chr> <dbl> <chr>
1 1 8 k aa a 1 k
2 2 9 b bb b 9 bb
3 3 10 a cc c NA NA
4 4 11 d dd a 4 d
5 5 12 z ee a 5 z
If the prefixes don't contain any invalid column prefixes, this will do without ifelse statement.
mydf <- data.frame(a_x = c(1,2,3,4,5),
b_x = c(8,9,10,11,12),
a_y = c("k",'b','a','d','z'),
b_y = c('aa','bb','cc','dd','ee'),
prefix=c("a","b","a","a","a"))
mydf %>% rowwise() %>%
mutate(desired_x = get(paste(prefix, 'x', sep = '_')),
desired_y = get(paste(prefix, 'y', sep = '_')))
# A tibble: 5 x 7
# Rowwise:
a_x b_x a_y b_y prefix desired_x desired_y
<dbl> <dbl> <chr> <chr> <chr> <dbl> <chr>
1 1 8 k aa a 1 k
2 2 9 b bb b 9 bb
3 3 10 a cc a 3 a
4 4 11 d dd a 4 d
5 5 12 z ee a 5 z
First, I would like to say that I am not presenting this as a good solution, as the other proposed solutions are much better and simpler. However, since you brought up the get function, I wanted to show you how to make use of it to get your desired output. As a matter of fact, some of the values in your prefix column, such as c, do not have a match among your column names, so the get function throws an error and terminates execution; unlike the mget function, it does not have an ifnotfound argument. So you need a way to get around that error by means of an ifelse:
library(dplyr)
library(stringr)
library(tidyr)
library(purrr)
library(glue)
mydf1 %>%
mutate(desired_x_value = map(prefix, ~ ifelse(any(str_detect(names(mydf)[-5], .x)),
get(glue("{.x}_x")), NA)),
desired_y_value = map(prefix, ~ ifelse(any(str_detect(names(mydf)[-5], .x)),
get(glue("{.x}_y")), NA))) %>%
unnest(cols = c(desired_x_value, desired_y_value))
# A tibble: 5 x 7
a_x b_x a_y b_y prefix desired_x_value desired_y_value
<dbl> <dbl> <chr> <chr> <chr> <dbl> <chr>
1 1 8 k aa a 1 k
2 2 9 b bb b 9 bb
3 3 10 a cc NA NA NA
4 4 11 d dd a 4 d
5 5 12 z ee a 5 z
You can also use paste function instead of glue and in case we already know the output types of the desired columns, we can spare the last line:
mydf1 %>%
mutate(desired_x_value = map_dbl(prefix, ~ ifelse(any(str_detect(names(mydf)[-5], .x)),
get(paste(.x, "x", sep = "_")), NA)),
desired_y_value = map_chr(prefix, ~ ifelse(any(str_detect(names(mydf)[-5], .x)),
get(paste(.x, "y", sep = "_")), NA)))
# A tibble: 5 x 7
# Rowwise:
a_x b_x a_y b_y prefix desired_x_value desired_y_value
<dbl> <dbl> <chr> <chr> <chr> <dbl> <chr>
1 1 8 k aa a 1 k
2 2 9 b bb b 9 bb
3 3 10 a cc NA NA NA
4 4 11 d dd a 4 d
5 5 12 z ee a 5 z
I have the following data frame:
library(dplyr)
library(tibble)
df <- tibble(
source = c("a", "b", "c", "d", "e"),
score = c(10, 5, NA, 3, NA ) )
df
It looks like this:
# A tibble: 5 x 2
source score
<chr> <dbl>
1 a 10 . # current max value
2 b 5
3 c NA
4 d 3
5 e NA
What I want to do is to replace the NAs in the score column with values running from the existing max + n onwards, where n starts at 1 and increases by one for each NA (so n ranges from 1 up to, at most, the total number of rows of the df).
Resulting in this (hand-coded) :
source score
a 10
b 5
c 11 # obtained from 10 + 1
d 3
e 12 # obtained from 10 + 2
How can I achieve that?
Another option :
transform(df, score = pmin(max(score, na.rm = TRUE) +
cumsum(is.na(score)), score, na.rm = TRUE))
# source score
#1 a 10
#2 b 5
#3 c 11
#4 d 3
#5 e 12
If you want to do this in dplyr
library(dplyr)
df %>% mutate(score = pmin(max(score, na.rm = TRUE) +
cumsum(is.na(score)), score, na.rm = TRUE))
A base R solution
# Number the missing scores 1, 2, ... and offset them by the current maximum.
# seq_len(sum(...)) is used rather than seq(which(...)): with a single NA at
# position k, seq(k) expands to 1:k and no longer matches the number of NAs.
df$score[is.na(df$score)] <- seq_len(sum(is.na(df$score))) + max(df$score, na.rm = TRUE)
such that
> df
# A tibble: 5 x 2
source score
<chr> <dbl>
1 a 10
2 b 5
3 c 11
4 d 3
5 e 12
Here is a dplyr approach,
df %>%
mutate(score = replace(score,
is.na(score),
(max(score, na.rm = TRUE) + (cumsum(is.na(score))))[is.na(score)])
)
which gives,
# A tibble: 5 x 2
source score
<chr> <dbl>
1 a 10
2 b 5
3 c 11
4 d 3
5 e 12
With dplyr:
library(dplyr)
df %>%
mutate_at("score", ~ ifelse(is.na(.), max(., na.rm = TRUE) + cumsum(is.na(.)), .))
Result:
# A tibble: 5 x 2
source score
<chr> <dbl>
1 a 10
2 b 5
3 c 11
4 d 3
5 e 12
A dplyr solution.
df %>%
mutate(na_count = cumsum(is.na(score)),
score = ifelse(is.na(score), max(score, na.rm = TRUE) + na_count, score)) %>%
select(-na_count)
## A tibble: 5 x 2
# source score
# <chr> <dbl>
#1 a 10
#2 b 5
#3 c 11
#4 d 3
#5 e 12
Another one, quite similar to ThomasIsCoding's solution:
> df$score[is.na(df$score)]<-max(df$score, na.rm=T)+(1:sum(is.na(df$score)))
> df
# A tibble: 5 x 2
source score
<chr> <dbl>
1 a 10
2 b 5
3 c 11
4 d 3
5 e 12
Not quite elegant as compared to the base R solutions, but still possible:
library(data.table)
setDT(df)
max.score = df[, max(score, na.rm = TRUE)]
df[is.na(score), score :=(1:.N) + max.score]
Or in one line but a bit slower:
df[is.na(score), score := (1:.N) + df[, max(score, na.rm = TRUE)]]
df
source score
1: a 10
2: b 5
3: c 11
4: d 3
5: e 12
I am trying to find the best way to iterate through each column of a data frame, group by that column, and produce a summary.
Here is my attempt:
library(tidyverse)
data = data.frame(
a = sample(LETTERS[1:3], 100, replace=TRUE),
b = sample(LETTERS[1:8], 100, replace=TRUE),
c = sample(LETTERS[3:15], 100, replace=TRUE),
d = sample(LETTERS[16:26], 100, replace=TRUE),
value = rnorm(100)
)
# Count the rows of `x` for every level of each factor column.
#
# x  A data frame. Only columns for which is.factor() is TRUE are used as
#    grouping columns. Note that since R 4.0 data.frame() no longer turns
#    strings into factors by default, so character columns are not picked up.
#
# Returns an unnamed list with one summary table per factor column, each
# holding the grouping column and the row count `n`. The list is empty when
# `x` has no factor columns.
myfunction <- function(x) {
  groupVars <- names(x)[vapply(x, is.factor, logical(1))]
  # lapply() over the column names replaces the 1:length() index loop, which
  # would iterate over c(1, 0) when there are no factor columns.
  lapply(groupVars, function(group_var) {
    x %>%
      group_by(across(all_of(group_var))) %>%
      summarise(
        n = n()
      )
  })
}
test <- myfunction(data)
The function returns:
[[1]]
# A tibble: 3 x 2
a n
<fct> <int>
1 A 37
2 B 34
3 C 29
...
...
...
My question is, is this the best way to do this? Is there a way to avoid using a for loop? Can I use purrr and map somehow to do this?
Thank you
An option is to use map
library(tidyverse)
map(data[1:4], ~data.frame(x = {{.x}}) %>% count(x))
#$a
## A tibble: 3 x 2
# x n
# <fct> <int>
#1 A 39
#2 B 32
#3 C 29
#
#$b
## A tibble: 8 x 2
# x n
# <fct> <int>
#1 A 14
#2 B 11
#3 C 16
#4 D 10
#5 E 12
#6 F 10
#7 G 13
#8 H 14
#...
The output is a list. Note that I have ignored the last column of data, as it doesn't seem to be relevant here.
If you want columns in the list data.frames to be named according to the columns from your original data, we can use imap
imap(data[1:4], ~tibble(!!.y := {{.x}}) %>% count(!!sym(.y)))
#$a
## A tibble: 3 x 2
# a n
# <fct> <int>
#1 A 23
#2 B 35
#3 C 42
#
#$b
## A tibble: 8 x 2
# b n
# <fct> <int>
#1 A 15
#2 B 10
#3 C 13
#4 D 5
#5 E 19
#6 F 9
#7 G 13
#8 H 16
#...
Or making use of tibble::enframe (thanks #camille)
imap(data[1:4], ~enframe(.x, value = .y) %>% count(!!sym(.y)))
You could reshape the data and group by both the column and the letter. This gives you one dataframe instead of a list of them, but you could get the list if you really want it with split.
set.seed(123)
library(tidyverse)
data = data.frame(
a = sample(LETTERS[1:3], 100, replace=TRUE),
b = sample(LETTERS[1:8], 100, replace=TRUE),
c = sample(LETTERS[3:15], 100, replace=TRUE),
d = sample(LETTERS[16:26], 100, replace=TRUE),
value = rnorm(100)
)
data %>%
pivot_longer(cols = -value, names_to = "column", values_to = "letter") %>%
group_by(column, letter) %>%
summarise(n = n())
#> # A tibble: 35 x 3
#> # Groups: column [4]
#> column letter n
#> <chr> <fct> <int>
#> 1 a A 33
#> 2 a B 32
#> 3 a C 35
#> 4 b A 8
#> 5 b B 11
#> 6 b C 12
#> 7 b D 14
#> 8 b E 8
#> 9 b F 17
#> 10 b G 16
#> # … with 25 more rows
Created on 2019-10-30 by the reprex package (v0.3.0)
You can simply call:
apply(data, 2,table)
You can drop the last list element if you want.
I have data:
rowID incidentID participant.type
1 1 A
2 1 B
3 2 A
4 3 A
5 3 B
6 3 C
7 4 B
8 4 C
And I would like to end up with:
rowID incident participant.type participant.type.1 participant.type.2
1 1 A B
2 2 A
3 3 A B C
4 4 B C
I tried the spread but can't achieve one line per incident; I don't think I have a way of creating a key-value pair so I wonder if there is some other method for doing this.
Before using spread(), you need to create a proper key argument.
df %>% select(-rowID) %>%
group_by(incidentID) %>%
mutate(id = 1:n()) %>%
spread(id, participant.type)
# incidentID `1` `2` `3`
# <int> <fct> <fct> <fct>
# 1 1 A B NA
# 2 2 A NA NA
# 3 3 A B C
# 4 4 B C NA
Since your grouping is based on the row order within the incidentID column, the following simple solution will also work.
It is just filtering the dataframe and then merging in the end.
It is probably not the best solution in terms of effective use of computing power, but it is easy to understand.
library(tidyverse)
df <-
tribble(
~rowID, ~incidentID, ~participant.type,
1, 1, "A",
2, 1, "B",
3, 2, "A",
4, 3, "A",
5, 3, "B",
6, 3, "C",
7, 4, "B",
8, 4, "C")
df_1 <- df %>%
select(-rowID) %>%
group_by(incidentID) %>%
filter(row_number()==1)
df_2 <- df %>%
select(-rowID) %>%
group_by(incidentID) %>%
filter(row_number()==2) %>%
rename(participant.type.1 = participant.type)
df_3 <- df %>%
select(-rowID) %>%
group_by(incidentID) %>%
filter(row_number()==3) %>%
rename(participant.type.2 = participant.type)
full_join(df_1, full_join(df_2, df_3))
Result:
Joining, by = "incidentID"
Joining, by = "incidentID"
# A tibble: 4 x 4
# Groups: incidentID [?]
incidentID participant.type participant.type.1 participant.type.2
<dbl> <chr> <chr> <chr>
1 1 A B NA
2 2 A NA NA
3 3 A B C
4 4 B C NA
Here's my solution:
df %>%
select(-rowID) %>%
group_by(incidentID) %>%
nest() %>%
mutate(data = map_chr(data, ~str_c(.x$participant.type, collapse = '_'))) %>%
separate(data, paste0('participant.type.', 0:2)) %>%
mutate_at(2:4, ~replace_na(.x, ''))
We can use reshape2::dcast for this
reshape2::dcast(df, insidentID ~ participant.type)
# insidentID A B C
# 1 1 <NA> B <NA>
# 2 8 <NA> B <NA>
# 3 12 <NA> <NA> C
# 4 16 A <NA> <NA>
# 5 24 <NA> B <NA>
# 6 27 <NA> B C
# 7 29 <NA> <NA> C
with the data
set.seed(123)
df <- data.frame(insidentID = sample(0:30, 8L, replace = TRUE),
participant.type = sample(LETTERS[1:3], 8L, replace = TRUE),
stringsAsFactors = FALSE)
df
# insidentID participant.type
# 1 8 B
# 2 24 B
# 3 12 C
# 4 27 B
# 5 29 C
# 6 1 B
# 7 16 A
# 8 27 C
The 'related question' link provided by #markus shows a variety of other solutions, including what appears to be the most concise in a tidyverse format:
df %>%
group_by(incidentID) %>%
mutate(rn = paste0("newcolumn",row_number())) %>%
spread(rn, participant.type)
gives:
incidentID newcolumn1 newcolumn2 newcolumn3
<int> <fct> <fct> <fct>
1 1 A B NA
2 2 A NA NA
3 3 A B C
4 4 B C NA
A