Splitting the data in brackets in R - r

Hi I have a dataset that looks like this
PTNUM
AGE1_2
AGE2_3
AGE3_2
12345
(23,35)
NA
NA
12346
NA
(23,28,34,44)
(45,50)
12347
(17,22)
NA
(38,45)
I would like to have the output looking like this
PTNUM
AGE1_1
AGE1_2
AGE2_2
AGE2_3
AGE3_3
AGE3_2
12345
23
35
NA
NA
NA
NA
12346
NA
NA
23
28
NA
NA
12346
NA
NA
34
44
45
50
12347
17
22
NA
NA
38
45
I tried this code in R just to try splitting AGE1_2 to AGE1_1 and AGE1_2 but this resulted in all the rows of AGE1_1 and AGE1_2 being NA's.
ZX_1_2 <- extract(ZX, AGE1_2, into = c('AGE1_1', 'AGE1_2'),
regex = "(.?) \((.?)\)")
Could someone help me get the expected result?

We could use
library(purrr)
library(tidyr)
library(stringr)
map_dfc(names(ZX)[-1], ~ df1 %>%
select(all_of(.x)) %>%
extract(1, into = str_c(names(.), "_", 1:2),
"\\((\\d+),(\\d+)\\)", convert = TRUE)) %>%
bind_cols(ZX['PTNUM'], .)
-output
PTNUM AGE1_2_1 AGE1_2_2 AGE2_3_1 AGE2_3_2 AGE3_2_1 AGE3_2_2
1 12345 23 35 NA NA NA NA
2 12346 NA NA 34 40 45 50
3 12347 17 22 NA NA 38 45
Or another option is
ZX %>%
mutate(across(starts_with('AGE'),
~ read.csv(text = str_remove_all(.x, "\\(|\\)"),
header = FALSE, fill = TRUE))) %>%
unpack(where(is.data.frame), names_sep = "_")
-output
# A tibble: 3 × 7
PTNUM AGE1_2_V1 AGE1_2_V2 AGE2_3_V1 AGE2_3_V2 AGE3_2_V1 AGE3_2_V2
<int> <int> <int> <int> <int> <int> <int>
1 12345 23 35 NA NA NA NA
2 12346 NA NA 34 40 45 50
3 12347 17 22 NA NA 38 45
For the updated data
library(data.table)
ZX2 %>%
pivot_longer(cols = starts_with("AGE")) %>%
mutate(value = str_remove_all(value, "\\(|\\)")) %>%
separate_rows(value, sep = ",") %>%
group_by(PTNUM, name) %>%
mutate(rn = as.integer(gl(n(), 2, n()))) %>%
ungroup %>%
mutate(rn2 = rowid(PTNUM, name, rn)) %>%
unite(name, name, rn2) %>%
pivot_wider(names_from = name, values_from = value) %>%
select(-rn) %>%
group_by(PTNUM) %>%
mutate(across(everything(), ~ .x[order(!is.na(.x))])) %>%
ungroup
-output
# A tibble: 4 × 7
PTNUM AGE1_2_1 AGE1_2_2 AGE2_3_1 AGE3_2_1 AGE2_3_2 AGE3_2_2
<int> <chr> <chr> <chr> <chr> <chr> <chr>
1 12345 23 35 <NA> <NA> <NA> <NA>
2 12346 <NA> <NA> 23 <NA> 28 <NA>
3 12346 <NA> <NA> 34 45 44 50
4 12347 17 22 <NA> 38 <NA> 45
data
ZX <- structure(list(PTNUM = 12345:12347, AGE1_2 = c("(23,35)", NA,
"(17,22)"), AGE2_3 = c(NA, "(34,40)", NA), AGE3_2 = c(NA, "(45,50)",
"(38,45)")), class = "data.frame", row.names = c(NA, -3L))
ZX2 <- structure(list(PTNUM = 12345:12347, AGE1_2 = c("(23,35)", NA,
"(17,22)"), AGE2_3 = c(NA, "(23,28,34,44)", NA), AGE3_2 = c(NA,
"(45,50)", "(38,45)")), class = "data.frame", row.names = c(NA,
-3L))

Related

How do I pivot_wider a char column?

I'm trying to pivot_wider a tibble of random alpha strings
stri_rand_strings(252, 5, '[a-z]') %>%
sort() %>%
as_tibble() %>%
mutate(id = row_number(),
col = rep(letters[1:4], each = length(value) / 4)) %>%
pivot_wider(names_from = col, values_from = value)
I get three columns of NA in a tibble (252 x 5):
# A tibble: 252 × 5
id a b c d
<int> <chr> <chr> <chr> <chr>
1 1 aarup NA NA NA
2 2 abhir NA NA NA
3 3 afpgt NA NA NA
4 4 apjts NA NA NA
5 5 arlst NA NA NA
6 6 awkjn NA NA NA
7 7 babro NA NA NA
8 8 bbrpn NA NA NA
9 9 bbrzt NA NA NA
10 10 bedzs NA NA NA
# … with 242 more rows
instead of the desired 63 x 5.
your id-column is messing everything up. rownumbers are unique, so casting to wide does not make sense, since you have got unique identifiers.
try something like
stringi::stri_rand_strings(252, 5, '[a-z]') %>%
sort() %>%
as_tibble() %>%
mutate(id = rep(1:(length(value) / 4), 4), # !! <-- !!
col = rep(letters[1:4], each = length(value) / 4)) %>%
pivot_wider(names_from = col, values_from = value)
# A tibble: 63 x 5
id a b c d
<int> <chr> <chr> <chr> <chr>
1 1 ababk glynv mottj tqcbv
2 2 abysq gmfhc mujcw twjix
3 3 aerkp godcs mycak tzqny
4 4 agtoa gpler naetp ucuvg
5 5 ahebl grqgz nfali ufbqv
6 6 amdvv gswwu nhmnu ulgup
7 7 apgut gvkwh nkcks umwih
8 8 atgxy gynef nkklm uojxc
9 9 bcklx hcdup nngfz upfhx
10 10 bcnxz hcpzy nnvpd uqlgs
# ... with 53 more rows

Pivot data frame to longer twice in R then back to original shape

Let's say I want to know which of four basketball players is the best and I set up a little tournament where two players play each other 1 vs 1 and I record a set of stats
#rm(list=ls())
set.seed(1234)
# some made up scores from my tournament
df <- data.frame(
player1 = c("a", "a", "b", "c", "d", "d"),
player2 = c("b", "c", "d", "b", "a", "c"),
date = c("2021-01-01", "2021-01-02", "2021-01-04", "2021-01-05", "2021-01-06", "2021-01-08"),
p1_dunks = sample(c(4:11), 6, replace = TRUE),
p2_dunks = sample(c(3:12), 6, replace = TRUE),
p1_blocks = sample(c(8:10), 6, replace = TRUE),
p2_blocks = sample(c(10:12), 6, replace = TRUE),
p1_threepointers = sample(c(2:7), 6, replace = TRUE),
p2_threepointers = sample(c(1:5), 6, replace = TRUE)
)
In order to calculate how well a player has been doing by any point of the tournament, I can pivot this to longer twice and replace the count of each stat with the cumulative sum of each count
# cast to long and get cumulative stats per player
melted_df <- df %>%
pivot_longer(cols = starts_with(c("p1", "p2")), names_to = "stat", values_to = "number") %>%
pivot_longer(cols = starts_with("player"), names_to = "player", values_to = "name") %>%
filter(
(player == "player1" & grepl("^p1", stat)) |
(player == "player2" & grepl("^p2", stat))
) %>%
arrange(date) %>%
group_by(player, stat) %>%
mutate(number = cumsum(number))
then I can query against this easily enough
melted_df %>%
filter(date < "2021-01-05") %>%
filter(!duplicated(name)) %>%
filter(grepl("dunks$", stat))
but let's say for my use case I need to then coerce this long format data back to its original form (with player1, player2, then the stats for each player 1 and player 2). I can try
# try to cast back to original format
back_to_wider_df <- melted_df %>%
pivot_wider(names_from = "player", values_from = "name") %>%
pivot_wider(names_from = "stat", values_from = "number")
but this instead gives a data frame which is 'offset' per match with a row half full of NA values:
> head(back_to_wider_df)
# A tibble: 6 × 9
date player1 player2 p1_dunks p1_blocks p1_threepointers p2_dunks p2_blocks p2_threepointers
<chr> <chr> <chr> <int> <int> <int> <int> <int> <int>
1 2021-01-01 a NA 7 9 6 NA NA NA
2 2021-01-01 NA b NA NA NA 11 11 4
3 2021-01-02 a NA 18 18 9 NA NA NA
4 2021-01-02 NA c NA NA NA 18 22 8
5 2021-01-04 b NA 23 27 15 NA NA NA
6 2021-01-04 NA d NA NA NA 26 32 11
Is there a simple way to fix this back to the original shape such that the first three rows should read:
> df
date player1 player2 p1_dunks p1_blocks p1_threepointers p2_dunks p2_blocks p2_three_pointers
1 2021-01-01 a b 7 9 6 11 11 4
2 2021-01-02 a c 18 18 9 18 22 8
3 2021-01-04 b d 23 27 15 26 32 11
thanks,
One way could be using lead function and the removing the NA
library(dplyr)
df %>%
mutate(across(c(player2, p2_dunks, p2_blocks, p2_threepointers), lead)) %>%
na.omit()
date player1 player2 p1_dunks p1_blocks p1_threepointers p2_dunks p2_blocks p2_threepointers
1 2021-01-01 a b 7 9 6 11 11 4
3 2021-01-02 a c 18 18 9 18 22 8
5 2021-01-04 b d 23 27 15 26 32 11

separate a string separated by ; into columns in R

I am trying to use dplyr to separate a column into multiple columns
here is the column:
name
1 tx_id=2;tx_name=XM_017927872.1;gene_id=LOC108564750;exon_id=4;exon_name=id8;exon_rank=1
2 tx_id=2;tx_name=XM_017927872.1;gene_id=LOC108564750;exon_id=13;exon_name=id17;exon_rank=10
3 tx_id=2;tx_name=XM_017927872.1;gene_id=LOC108564750;exon_id=14;exon_name=id18;exon_rank=11
4 tx_id=2;tx_name=XM_017927872.1;gene_id=LOC108564750;exon_id=15;exon_name=id19;exon_rank=12
5 tx_id=8;tx_name=XM_017919249.1;gene_id=LOC108560513;exon_id=70;exon_name=id25;exon_rank=1
6 tx_id=8,9;tx_name=XM_017919249.1,XM_017918469.1;gene_id=LOC108560513;exon_id=70,71;exon_name=id25,id20;exon_rank=1;zero_length_insertion=True
dput(x) [makes reproducible]
structure(list(name = structure(c(4L, 1L, 2L, 3L, 6L, 5L), .Label = c("tx_id=2;tx_name=XM_017927872.1;gene_id=LOC108564750;exon_id=13;exon_name=id17;exon_rank=10",
"tx_id=2;tx_name=XM_017927872.1;gene_id=LOC108564750;exon_id=14;exon_name=id18;exon_rank=11",
"tx_id=2;tx_name=XM_017927872.1;gene_id=LOC108564750;exon_id=15;exon_name=id19;exon_rank=12",
"tx_id=2;tx_name=XM_017927872.1;gene_id=LOC108564750;exon_id=4;exon_name=id8;exon_rank=1",
"tx_id=8,9;tx_name=XM_017919249.1,XM_017918469.1;gene_id=LOC108560513;exon_id=70,71;exon_name=id25,id20;exon_rank=1;zero_length_insertion=True",
"tx_id=8;tx_name=XM_017919249.1;gene_id=LOC108560513;exon_id=70;exon_name=id25;exon_rank=1"
), class = "factor")), class = "data.frame", row.names = c(NA,
-6L))
I want to get only exon_rank of 1 and have it turned into columns
What I would like to do is turn it into the following
tx_id tx_name gene_id exon_id exon_name exon_rank
1 1 XM_017916188.1 LOC108556273 3 id1 1
2 7 XM_017913854.1 LOC108557084 61 id6 1
3 2 XM_017927872.1 LOC108564750 4 id8 1
I've been trying to use
x %>%
separate()
but it gets stuck in situations where tx_id=8,9 vs tx_id=1
any help?
thank you
Here is an option with tidyverse
library(dplyr)
library(tidyr)
library(data.table)
x %>%
mutate(name = as.character(name)) %>%
separate_rows(name, sep=";") %>%
separate(name, into = c('key', 'value'), sep="=") %>%
mutate(rn = rowid(key)) %>%
pivot_wider(names_from = key, values_from = value) %>%
type.convert(as.is = TRUE) %>%
filter(exon_rank == 1)
You could write a function that reads that type of data:
read_data <- function(data){
read_data_row <- function(x){
u <- read.dcf(textConnection(x))
v <- read.csv(text=u, row.names = colnames(u), header=FALSE, na.strings = "")
tidyr::fill(data.frame(t(v), row.names = NULL), dplyr::all_of(colnames(u)),
.direction = 'downup')
}
plyr::rbind.fill(sapply(strsplit(gsub('=',':',data),';'), read_data_row))
}
Now read the data:
new_data <- read_data(x$name)
new_data
tx_id tx_name gene_id exon_id exon_name exon_rank zero_length_insertion
1 2 XM_017927872.1 LOC108564750 4 id8 1 <NA>
2 2 XM_017927872.1 LOC108564750 13 id17 10 <NA>
3 2 XM_017927872.1 LOC108564750 14 id18 11 <NA>
4 2 XM_017927872.1 LOC108564750 15 id19 12 <NA>
5 8 XM_017919249.1 LOC108560513 70 id25 1 <NA>
6 8 XM_017919249.1 LOC108560513 70 id25 1 True
7 9 XM_017918469.1 LOC108560513 71 id20 1 True
You can subset it the way you want:
subset(new_data, exon_rank==1)
tx_id tx_name gene_id exon_id exon_name exon_rank zero_length_insertion
1 2 XM_017927872.1 LOC108564750 4 id8 1 <NA>
5 8 XM_017919249.1 LOC108560513 70 id25 1 <NA>
6 8 XM_017919249.1 LOC108560513 70 id25 1 True
7 9 XM_017918469.1 LOC108560513 71 id20 1 True

Index multiple vectors into table in R

I have three vectors:
position <- c(13, 13, 24, 20, 24, 6, 13)
my_string_allele <- c("T>A", "T>A", "G>C", "C>A", "A>G", "A>G", "G>T")
position_ref <- c("12006", "1108", "13807", "1970", "9030", "2222", "4434")
I want to create a table (starting from the smallest position) as shown below. I want to account for the number of occurrence for each my_string_allele column for each position and have their corresponding position_ref in position_ref column. What would be the simplest way to do this?
position T>A position_ref G>C position_ref C>A position_ref A>G position_ref G>T position_ref
6 1 2222
13 2 12006, 1108 1 4434
20 1 1970
24 1 13807 1 9030
Here is a spread() method which stretches data to the wide format with mutate_all() to count the number of occurrences.
Data
library(tidyverse)
df <- data.frame(position, my_string_allele, position_ref, stringsAsFactors = F)
Code
df %>% group_by(position, my_string_allele) %>%
mutate(position_ref = paste(position_ref, collapse = ", ")) %>%
distinct() %>%
spread(my_string_allele, position_ref) %>%
mutate_all(funs(N = if_else(is.na(.), NA_integer_, lengths(str_split(., ", ")))))
Output
position `A>G` `C>A` `G>C` `G>T` `T>A` `A>G_N` `C>A_N` `G>C_N` `G>T_N` `T>A_N`
<dbl> <chr> <chr> <chr> <chr> <chr> <int> <int> <int> <int> <int>
1 6 2222 NA NA NA NA 1 NA NA NA NA
2 13 NA NA NA 4434 12006, 1108 NA NA NA 1 2
3 20 NA 1970 NA NA NA NA 1 NA NA NA
4 24 9030 NA 13807 NA NA 1 NA 1 NA NA
(You can sort the columns by their column names to get the output you show in the question.)
Full disclosure: I am adapting part of #DarrenTsai's answer with data.table to provide the number of occurrence as well (since it is missing from his answer). Using data.table:
library(data.table)
df <- data.frame(position, my_string_allele, position_ref, stringsAsFactors = F)
setDT(df)
df[, `:=`(position_ref = paste(.N, paste(position_ref, collapse = ", "))),
by = c("position", "my_string_allele")] %>%
unique(., by = c("position", "my_string_allele", "position_ref")) %>%
dcast(position ~ my_string_allele, value.var = "position_ref")
Result:
position A>G C>A G>C G>T T>A
1: 6 1 2222 <NA> <NA> <NA> <NA>
2: 13 <NA> <NA> <NA> 1 4434 2 12006, 1108
3: 20 <NA> 1 1970 <NA> <NA> <NA>
4: 24 1 9030 <NA> 1 13807 <NA> <NA>
With dplyr (largely based on #DarrenTsai's answer, should upvote his as well):
library(dplyr)
df %>% group_by(position, my_string_allele) %>%
mutate(position_ref = paste(n(), paste(position_ref, collapse = ", "))) %>%
distinct() %>%
tidyr::spread(my_string_allele, position_ref)

regex (in gathering multiple sets of columns with tidyr)

inspired by hadley's nifty gather approach in this answer I tried to use tidyr's gather() and spread() in combination with a regular expression, regex, but I seem to get it wrong on the regex.
I did study several regex questions; this one, this one, and also at regex101.com. I tried to circumvent the regex by using starts_with(), ends_with() and matches() inspired by this question, but with no luck.
I am asking here in the hope that someone can show where I get it wrong and I can solve it, preferably using, the select helpers from tidyselect.
I need to select 2 regex-groups one up to the last . and one consisting of what comes after the last ., I made this two example below, one where my code s working and one where I am stuck.
First the example that is working,
# install.packages(c("tidyverse"), dependencies = TRUE)
require(tidyverse)
The first data set, that work, looks like this,
myData1 <- tibble(
id = 1:10,
Wage.1997.1 = c(NA, 32:38, NA, NA),
Wage.1997.2 = c(NA, 12:18, NA, NA),
Wage.1998.1 = c(NA, 42:48, NA, NA),
Wage.1998.2 = c(NA, 2:8, NA, NA),
Wage.1998.3 = c(NA, 42:48, NA, NA),
Job.Type.1997.1 = NA,
Job.Type.1997.2 = c(NA, rep(c('A', 'B'), 4), NA),
Job.Type.1998.1 = c(NA, rep(c('A', 'B'), 4), NA),
Job.Type.1998.2 = c(NA, rep(c('A', 'B'), 4), NA)
)
and this is how I gather() it,
myData1 %>% gather(key, value, -id) %>%
extract(col = key, into = c("variable", "id.job"), regex = "(.*?\\..*?)\\.(.)$") %>%
spread(variable, value)
#> # A tibble: 30 x 6
#> id id.job Job.Type.1997 Job.Type.1998 Wage.1997 Wage.1998
#> <int> <chr> <chr> <chr> <chr> <chr>
#> 1 1 1 <NA> <NA> <NA> <NA>
#> 2 1 2 <NA> <NA> <NA> <NA>
#> 3 1 3 <NA> <NA> <NA> <NA>
#> 4 2 1 <NA> A 32 42
#> 5 2 2 A A 12 2
#> 6 2 3 <NA> <NA> <NA> 42
#> 7 3 1 <NA> B 33 43
#> 8 3 2 B B 13 3
#> 9 3 3 <NA> <NA> <NA> 43
#> 10 4 1 <NA> A 34 44
#> # ... with 20 more rows
It works, I suspect I overdoing it with the regex, but it works. However, my real data can have either one or two digest at the end, i.e.
The second data, where I get stuck,
myData2 <- tibble(
id = 1:10,
Wage.1997.1 = c(NA, 32:38, NA, NA),
Wage.1997.12 = c(NA, 12:18, NA, NA),
Wage.1998.1 = c(NA, 42:48, NA, NA),
Wage.1998.12 = c(NA, 2:8, NA, NA),
Wage.1998.13 = c(NA, 42:48, NA, NA),
Job.Type.1997.1 = NA,
Job.Type.1997.12 = c(NA, rep(c('A', 'B'), 4), NA),
Job.Type.1998.1 = c(NA, rep(c('A', 'B'), 4), NA),
Job.Type.1998.12 = c(NA, rep(c('A', 'B'), 4), NA)
)
Now, this is where I use (0[0-1]|1[0-9])$ for the second group, I also tried thing like \d{1}|\d{2}, but did that not work either.
myData2 %>% gather(key, value, -id) %>%
extract(col = key, into = c("variable", "id.job"),
regex = "(.*?\\..*?)\\.(0[0-1]|1[0-9])$") %>%
spread(variable, value)
The expected output would be something like this,
#> # A tibble: 30 x 6
#> id id.job Job.Type.1997 Job.Type.1998 Wage.1997 Wage.1998
#> <int> <chr> <chr> <chr> <chr> <chr>
#> 1 1 1 <NA> <NA> <NA> <NA>
#> 2 1 12 <NA> <NA> <NA> <NA>
#> 3 1 13 <NA> <NA> <NA> <NA>
#> 4 2 1 <NA> A 32 42
#> 5 2 12 A A 12 2
#> 6 2 13 <NA> <NA> <NA> 42
#> 7 3 1 <NA> B 33 43
#> 8 3 12 B B 13 3
#> 9 3 13 <NA> <NA> <NA> 43
#> 10 4 1 <NA> A 34 44
#> # ... with 20 more rows
A simply solution à la t this question using select helpers like starts_with(), ends_with(), matches(), etc. would be appreciated.
We can change the regex in extract to match characters and capture as group ((.*)) from the start (^) of the string followed by a dot (\\.) and one or more characters that are not a dot captured as a group (([^.]+)) till the end ($) of the string
myData2 %>%
gather(key, value, -id) %>%
extract(col = key, into = c("variable", "id.job"), "^(.*)\\.([^.]+)$") %>%
spread(variable, value)
# A tibble: 30 x 6
# id id.job Job.Type.1997 Job.Type.1998 Wage.1997 Wage.1998
# * <int> <chr> <chr> <chr> <chr> <chr>
# 1 1 1 <NA> <NA> <NA> <NA>
# 2 1 12 <NA> <NA> <NA> <NA>
# 3 1 13 <NA> <NA> <NA> <NA>
# 4 2 1 <NA> A 32 42
# 5 2 12 A A 12 2
# 6 2 13 <NA> <NA> <NA> 42
# 7 3 1 <NA> B 33 43
# 8 3 12 B B 13 3
# 9 3 13 <NA> <NA> <NA> 43
#10 4 1 <NA> A 34 44
# ... with 20 more rows

Resources