separate a string separated by ; into columns in R

separate a string separated by ; into columns in R - r

I am trying to use dplyr to separate a column into multiple columns
here is the column:
name
1 tx_id=2;tx_name=XM_017927872.1;gene_id=LOC108564750;exon_id=4;exon_name=id8;exon_rank=1
2 tx_id=2;tx_name=XM_017927872.1;gene_id=LOC108564750;exon_id=13;exon_name=id17;exon_rank=10
3 tx_id=2;tx_name=XM_017927872.1;gene_id=LOC108564750;exon_id=14;exon_name=id18;exon_rank=11
4 tx_id=2;tx_name=XM_017927872.1;gene_id=LOC108564750;exon_id=15;exon_name=id19;exon_rank=12
5 tx_id=8;tx_name=XM_017919249.1;gene_id=LOC108560513;exon_id=70;exon_name=id25;exon_rank=1
6 tx_id=8,9;tx_name=XM_017919249.1,XM_017918469.1;gene_id=LOC108560513;exon_id=70,71;exon_name=id25,id20;exon_rank=1;zero_length_insertion=True
dput(x) [makes reproducible]
structure(list(name = structure(c(4L, 1L, 2L, 3L, 6L, 5L), .Label = c("tx_id=2;tx_name=XM_017927872.1;gene_id=LOC108564750;exon_id=13;exon_name=id17;exon_rank=10",
"tx_id=2;tx_name=XM_017927872.1;gene_id=LOC108564750;exon_id=14;exon_name=id18;exon_rank=11",
"tx_id=2;tx_name=XM_017927872.1;gene_id=LOC108564750;exon_id=15;exon_name=id19;exon_rank=12",
"tx_id=2;tx_name=XM_017927872.1;gene_id=LOC108564750;exon_id=4;exon_name=id8;exon_rank=1",
"tx_id=8,9;tx_name=XM_017919249.1,XM_017918469.1;gene_id=LOC108560513;exon_id=70,71;exon_name=id25,id20;exon_rank=1;zero_length_insertion=True",
"tx_id=8;tx_name=XM_017919249.1;gene_id=LOC108560513;exon_id=70;exon_name=id25;exon_rank=1"
), class = "factor")), class = "data.frame", row.names = c(NA,
-6L))
I want to get only exon_rank of 1 and have it turned into columns
What I would like to do is turn it into the following
tx_id tx_name gene_id exon_id exon_name exon_rank
1 1 XM_017916188.1 LOC108556273 3 id1 1
2 7 XM_017913854.1 LOC108557084 61 id6 1
3 2 XM_017927872.1 LOC108564750 4 id8 1
I've been trying to use
x %>%
separate()
but it gets stuck in situations where tx_id=8,9 vs tx_id=1
any help?
thank you

Here is an option with tidyverse
library(dplyr)
library(tidyr)
library(data.table)
x %>%
mutate(name = as.character(name)) %>%
separate_rows(name, sep=";") %>%
separate(name, into = c('key', 'value'), sep="=") %>%
mutate(rn = rowid(key)) %>%
pivot_wider(names_from = key, values_from = value) %>%
type.convert(as.is = TRUE) %>%
filter(exon_rank == 1)

You could write a function that reads that type of data:
# Parse "key=value;key=value" records into a data frame, one column per key.
# A field holding several comma-separated values (e.g. "tx_id=8,9") is
# expanded into several rows; single-valued fields of the same record are
# filled down/up so every row is complete. Keys missing from a record end up
# as NA after the row-bind.
# `data`: character vector (or factor) with one record per element.
read_data <- function(data) {
  read_data_row <- function(x) {
    # x is one record as "key:value" lines, which read.dcf() can parse.
    con <- textConnection(x)
    # Close the text connection explicitly instead of leaving it open for the
    # garbage collector (which would warn about an unused connection).
    on.exit(close(con), add = TRUE)
    u <- read.dcf(con)
    # Split every field on commas: one row per key, one column per value.
    v <- read.csv(text = u, row.names = colnames(u), header = FALSE,
                  na.strings = "")
    # Transpose to one column per key, then fill the shorter fields.
    tidyr::fill(data.frame(t(v), row.names = NULL), dplyr::all_of(colnames(u)),
                .direction = "downup")
  }
  records <- strsplit(gsub("=", ":", data), ";")
  # lapply() always returns a list. sapply() would simplify the per-record
  # data frames into a list-matrix whenever all records have the same number
  # of fields, which rbind.fill() cannot combine.
  plyr::rbind.fill(lapply(records, read_data_row))
}
Now read the data:
new_data <- read_data(x$name)
new_data
tx_id tx_name gene_id exon_id exon_name exon_rank zero_length_insertion
1 2 XM_017927872.1 LOC108564750 4 id8 1 <NA>
2 2 XM_017927872.1 LOC108564750 13 id17 10 <NA>
3 2 XM_017927872.1 LOC108564750 14 id18 11 <NA>
4 2 XM_017927872.1 LOC108564750 15 id19 12 <NA>
5 8 XM_017919249.1 LOC108560513 70 id25 1 <NA>
6 8 XM_017919249.1 LOC108560513 70 id25 1 True
7 9 XM_017918469.1 LOC108560513 71 id20 1 True
You can subset it the way you want:
subset(new_data, exon_rank==1)
tx_id tx_name gene_id exon_id exon_name exon_rank zero_length_insertion
1 2 XM_017927872.1 LOC108564750 4 id8 1 <NA>
5 8 XM_017919249.1 LOC108560513 70 id25 1 <NA>
6 8 XM_017919249.1 LOC108560513 70 id25 1 True
7 9 XM_017918469.1 LOC108560513 71 id20 1 True

Related

Splitting the data in brackets in R

Hi I have a dataset that looks like this
PTNUM
AGE1_2
AGE2_3
AGE3_2
12345
(23,35)
NA
NA
12346
NA
(23,28,34,44)
(45,50)
12347
(17,22)
NA
(38,45)
I would like to have the output looking like this
PTNUM
AGE1_1
AGE1_2
AGE2_2
AGE2_3
AGE3_3
AGE3_2
12345
23
35
NA
NA
NA
NA
12346
NA
NA
23
28
NA
NA
12346
NA
NA
34
44
45
50
12347
17
22
NA
NA
38
45
I tried this code in R just to try splitting AGE1_2 to AGE1_1 and AGE1_2 but this resulted in all the rows of AGE1_1 and AGE1_2 being NA's.
ZX_1_2 <- extract(ZX, AGE1_2, into = c('AGE1_1', 'AGE1_2'),
regex = "(.?) \((.?)\)")
Could someone help me get the expected result?

We could use
library(dplyr)
library(purrr)
library(tidyr)
library(stringr)
# For every column except PTNUM, pull the two numbers out of "(a,b)" into
# <column>_1 and <column>_2, then bind the pieces next to PTNUM.
# Inside the magrittr pipe, `.` in names(.) is the one-column data frame
# produced by select().
map_dfc(names(ZX)[-1], ~ ZX %>%
  select(all_of(.x)) %>%
  extract(1, into = str_c(names(.), "_", 1:2),
          "\\((\\d+),(\\d+)\\)", convert = TRUE)) %>%
  bind_cols(ZX['PTNUM'], .)
-output
PTNUM AGE1_2_1 AGE1_2_2 AGE2_3_1 AGE2_3_2 AGE3_2_1 AGE3_2_2
1 12345 23 35 NA NA NA NA
2 12346 NA NA 34 40 45 50
3 12347 17 22 NA NA 38 45
Or another option is
ZX %>%
mutate(across(starts_with('AGE'),
~ read.csv(text = str_remove_all(.x, "\\(|\\)"),
header = FALSE, fill = TRUE))) %>%
unpack(where(is.data.frame), names_sep = "_")
-output
# A tibble: 3 × 7
PTNUM AGE1_2_V1 AGE1_2_V2 AGE2_3_V1 AGE2_3_V2 AGE3_2_V1 AGE3_2_V2
<int> <int> <int> <int> <int> <int> <int>
1 12345 23 35 NA NA NA NA
2 12346 NA NA 34 40 45 50
3 12347 17 22 NA NA 38 45
For the updated data
library(data.table)
ZX2 %>%
pivot_longer(cols = starts_with("AGE")) %>%
mutate(value = str_remove_all(value, "\\(|\\)")) %>%
separate_rows(value, sep = ",") %>%
group_by(PTNUM, name) %>%
mutate(rn = as.integer(gl(n(), 2, n()))) %>%
ungroup %>%
mutate(rn2 = rowid(PTNUM, name, rn)) %>%
unite(name, name, rn2) %>%
pivot_wider(names_from = name, values_from = value) %>%
select(-rn) %>%
group_by(PTNUM) %>%
mutate(across(everything(), ~ .x[order(!is.na(.x))])) %>%
ungroup
-output
# A tibble: 4 × 7
PTNUM AGE1_2_1 AGE1_2_2 AGE2_3_1 AGE3_2_1 AGE2_3_2 AGE3_2_2
<int> <chr> <chr> <chr> <chr> <chr> <chr>
1 12345 23 35 <NA> <NA> <NA> <NA>
2 12346 <NA> <NA> 23 <NA> 28 <NA>
3 12346 <NA> <NA> 34 45 44 50
4 12347 17 22 <NA> 38 <NA> 45
data
ZX <- structure(list(PTNUM = 12345:12347, AGE1_2 = c("(23,35)", NA,
"(17,22)"), AGE2_3 = c(NA, "(34,40)", NA), AGE3_2 = c(NA, "(45,50)",
"(38,45)")), class = "data.frame", row.names = c(NA, -3L))
ZX2 <- structure(list(PTNUM = 12345:12347, AGE1_2 = c("(23,35)", NA,
"(17,22)"), AGE2_3 = c(NA, "(23,28,34,44)", NA), AGE3_2 = c(NA,
"(45,50)", "(38,45)")), class = "data.frame", row.names = c(NA,
-3L))

how to delete one of two duplicates in each column and merge them in r

I have data consisting of two rows and several columns, with some duplicated values within the columns. I want to remove the duplicates within each column and then gather all the unique values, keeping the column names.
data<-structure(c(10L, 10L, 11L, 11L, 5L, 5L, 3L, 5L), .Dim = c(2L,
4L), .Dimnames = list(c("d1", "m1"), c("year2036", "year2037",
"year2038", "year2039")))
year2036 year2037 year2038 year2039
d1 10 11 5 3
m1 10 11 5 5
And the output will be like:
year2036 year2037 year2038 year2039 year2039
10 11 5 3 5
out<-structure(c(10, 11, 5, 3, 5), .Names = c("year2036", "year2037",
"year2038", "year2039", "year2039"))
I tried unique(r[c(1:8)]) but it is just giving unique numbers removing column names.

You can use unique in apply and stack the result.
stack(apply(data, 2, unique))
# values ind
#1 10 year2036
#2 11 year2037
#3 5 year2038
#4 3 year2039
#5 5 year2039
Or in the format you wanted:
x <- stack(apply(data, 2, unique))
setNames(x$values, x$ind)
#year2036 year2037 year2038 year2039 year2039
# 10 11 5 3 5

data %>%
as_tibble() %>%
pivot_longer(everything()) %>%
group_by(name) %>%
distinct(value)
# A tibble: 5 x 2
# Groups: name [4]
name value
<chr> <int>
1 year2036 10
2 year2037 11
3 year2038 5
4 year2039 3
5 year2039 5

It is not good practice to have data with duplicate column names. Here is a solution that gives the same structure as your expected output, but with modified
column names.
library(dplyr)
library(tidyr)
data %>%
as.data.frame() %>%
pivot_longer(cols = everything()) %>%
distinct() %>%
mutate(row = data.table::rowid(name)) %>%
pivot_wider(names_from = c(name, row), values_from = value)
# year2036_1 year2037_1 year2038_1 year2039_1 year2039_2
# <int> <int> <int> <int> <int>
#1 10 11 5 3 5

Using dapply from collapse
library(collapse)
stack(dapply(data, MARGIN = 2, FUN = funique))
values ind
1 10 year2036
2 11 year2037
3 5 year2038
4 3 year2039
5 5 <NA>

Subset dataframe based on Values in second dataframe

I have one dataframe, df1, that has two columns as such:
> head(df1[,c(10,11)])
ColA ColB
1 12 20
2 7 5
3 32 38
4 37 46
5 15 15
6 4 4
I have a second dataframe, also with 2 columns with matching names. Instead, there are only two numbers, as such:
> head(df2)
ColA ColB
1 50 30
I want to subset values from df1 based on the value in the corresponding column from df2 . Doing this manually would look like this:
# Column names are case-sensitive: the column is `ColA`, not `colA`.
colA_vector <- df1[df1$ColA < 50,]
colB_vector <- df1[df1$ColB < 30,]
How can I do so in a more general purpose way? I do not want to hardcode anything. The column name "ColA" or "ColB" could be anything (so solutions requiring those column names won't really work).
Thank you.

In base R we could do:
nms <- intersect(names(df1), names(df2))
df1[do.call(`&`, Map(`<`, df1[nms], df2[nms])),]
# ColA ColB
# 1 12 20
# 2 7 5
# 5 15 15
# 6 4 4
Or just df1[do.call('&', Map('<', df1, df2)),] if both data.frames have the same order of columns and same names.
Using the package fuzzyjoin might be more readable however:
# The package is called fuzzyjoin; fuzzy_semi_join() is one of its functions.
library(fuzzyjoin)
fuzzy_semi_join(df1, df2, match_fun = `<`)
# ColA ColB
# 1 12 20
# 2 7 5
# 5 15 15
# 6 4 4
data
# Sample data. The header line has one field fewer than the data lines, so
# read.table() uses the first column as row names.
df1 <- read.table(text="
ColA ColB
1 12 20
2 7 5
3 32 38
4 37 46
5 15 15
6 4 4", header = TRUE, stringsAsFactors = FALSE)
df2 <- read.table(text="ColA ColB
1 50 30", header = TRUE, stringsAsFactors = FALSE)

Create a function if we want to do the same task repeatedly
# Return the rows of `dat1` whose value in column `colName` is smaller than
# the value stored under the same column name in `dat2`.
f1 <- function(dat1, dat2, colName) {
  threshold <- dat2[[colName]]
  keep <- dat1[[colName]] < threshold
  dat1[keep, ]
}
f1(df1, df2, "ColA")
# ColA ColB
#1 12 20
#2 7 5
#3 32 38
#4 37 46
#5 15 15
#6 4 4
f1(df1, df2, "ColB")
# ColA ColB
#1 12 20
#2 7 5
#5 15 15
#6 4 4
data
df1 <- structure(list(ColA = c(12L, 7L, 32L, 37L, 15L, 4L), ColB = c(20L,
5L, 38L, 46L, 15L, 4L)), class = "data.frame", row.names = c(NA,
-6L))
df2 <- structure(list(ColA = 50L, ColB = 30L),
class = "data.frame", row.names = "1")

Using dplyr:
df1 %>%
filter(df1[,1] < df2[,1])
ColA ColB
1 12 20
2 7 5
3 32 38
4 37 46
5 15 15
6 4 4
df1 %>%
filter(df1[,2] < df2[,2])
ColA ColB
1 12 20
2 7 5
3 15 15
4 4 4
Subsetting based on both columns simultaneously:
df1 %>%
filter(df1[,1] < df2[,1] & df1[,2] < df2[,2])
ColA ColB
1 12 20
2 7 5
3 15 15
4 4 4

If you don't want to use the fuzzyjoin package or write your own function, you can just repeat the second dataframe.
df1 <- data.frame("ColA" = c(12, 7, 32),
"ColB" = c(20, 5, 38))
df2 <- data.frame("ColA" = 50,
"ColB" = 30)
n <- nrow(df1)
df2_new <- do.call("rbind", replicate(n, df2, simplify = FALSE))
df1_which <- as.data.frame(df1 < df2_new)
colA_vector <- df1[df1_which$ColA, "ColA"]
colB_vector <- df1[df1_which$ColB, "ColB"]

You can try a tidyverse function. The result is a list of the filtered data.frames.
# Filter `x` against the thresholds in `y` for two columns at once.
# `ColA` and `ColB` are column names given as strings; both must exist in
# `x` and `y`. Returns a named list with elements `colA_vector` and
# `colB_vector`, each holding the rows of `x` (columns renamed to `a` and
# `b`) whose value in the respective column is below the value in `y`.
foo <- function(x, y, ColA, ColB){
  # library() fails loudly if the tidyverse is missing; require() would only
  # return FALSE and let the pipeline fail later with an obscure error.
  library(tidyverse)
  x %>%
    # all_of() selects by the string held in the argument.
    select(a = all_of(ColA), b = all_of(ColB)) %>%
    mutate(colA_vector = a < y[[ColA]]) %>%
    mutate(colB_vector = b < y[[ColB]]) %>%
    # One row per (original row, comparison) pair; k names the comparison,
    # v holds its logical result.
    gather(k, v, -a, -b) %>%
    filter(v) %>%
    split(.$k) %>%
    map(~select(., -v, -k))
}
foo(df1, df2, "ColA", "ColB")
$colA_vector
a b
1 12 20
2 7 5
3 32 38
4 37 46
5 15 15
6 4 4
$colB_vector
a b
7 12 20
8 7 5
9 15 15
10 4 4

Reshaping complicating data-set in R

I have a strange dataset format where a simple reshape function won't work. Assume I have three time periods (1-3), two ID names (A-B), and three variables (X, Y and Z) in the following format, where the ID names and variable names are separated by -:
Time A-X A-Y A-Z B-X B-Y B-Z
1 2 4 5 6 1 2
2 2 3 2 3 2 3
3 4 4 4 4 4 4
Ideally, I would like to produce the dataset in the following format:
ID Time X Y Z
A 1 2 4 5
A 2 2 3 2
A 3 4 4 4
B 1 6 1 2
B 2 3 2 3
B 3 4 4 4
Which functions to use?

library(dplyr)
library(tidyr)
library(splitstackshape)
df %>%
gather(key, value, -Time) %>%
cSplit("key", sep="_") %>%
spread(key_2, value) %>%
rename(ID = key_1) %>%
arrange(ID, Time)
Output is:
Time ID X Y Z
1 1 A 2 4 5
2 2 A 2 3 2
3 3 A 4 4 4
4 1 B 6 1 2
5 2 B 3 2 3
6 3 B 4 4 4
Sample data:
df <- structure(list(Time = 1:3, A_X = c(2L, 2L, 4L), A_Y = c(4L, 3L,
4L), A_Z = c(5L, 2L, 4L), B_X = c(6L, 3L, 4L), B_Y = c(1L, 2L,
4L), B_Z = 2:4), .Names = c("Time", "A_X", "A_Y", "A_Z", "B_X",
"B_Y", "B_Z"), class = "data.frame", row.names = c(NA, -3L))

Here is another dplyr and tidyr solution.
df %>%
gather(ID, value, -Time) %>%
separate(ID, into = c("ID", "var")) %>%
spread(var, value) %>%
arrange(ID) %>%
select(ID, Time, X, Y, Z)
# ID Time X Y Z
# 1 A 1 2 4 5
# 2 A 2 2 3 2
# 3 A 3 4 4 4
# 4 B 1 6 1 2
# 5 B 2 3 2 3
# 6 B 3 4 4 4

find only the group members in r

I'm stuck with defining group members to an individual. I was working in excel but that is failing since the number of individuals in a group varies between groups. I used this formula
=IFERROR(INDEX($A$1:$A$10727;SMALL(IF($S$1:$S$10727=$S2;ROW($S$1:$S$10727);"");Nth);1);"NA")
This returns the Nth individual in a group. It does not work for me because it returns all the individuals in the group, whereas I only want the other group members, not the individual itself. So I was thinking of moving to R, but I don't know where to start.
My data looks like this:
group ID
1 1
1 2
1 3
2 4
2 5
3 6
3 7
3 8
3 9
3 10
I would like this:
group ID gm1 gm2 gm3 gm4
1 1 2 3 NA NA
1 2 1 3 NA NA
1 3 1 2 NA NA
2 4 5 NA NA NA
2 5 4 NA NA NA
3 6 7 8 9 10
3 7 6 8 9 10
3 8 6 7 9 10
3 9 6 7 8 10
3 10 6 7 8 9
Is there a formula in R that gives me the group members?

We can do this with combn and cSplit
library(splitstackshape)
df1$gm <- unlist(unsplit(lapply(split(df1$ID, df1$group), function(x)
lapply(x, function(y) {
i1 <- x[y!= x]
if(length(i1) >1) combn(i1, length(i1), FUN = paste, collapse=", ") else i1
})), df1$group))
cSplit(df1, 'gm', ', ')
# group ID gm_1 gm_2 gm_3 gm_4
# 1: 1 1 2 3 NA NA
# 2: 1 2 1 3 NA NA
# 3: 1 3 1 2 NA NA
# 4: 2 4 5 NA NA NA
# 5: 2 5 4 NA NA NA
# 6: 3 6 7 8 9 10
# 7: 3 7 6 8 9 10
# 8: 3 8 6 7 9 10
# 9: 3 9 6 7 8 10
#10: 3 10 6 7 8 9
Or the same can be implemented with data.table and cSplit
library(data.table)
cSplit(setDT(df1)[, gm := unlist(lapply(seq_len(.N), function(i) {
i1 <- ID[i != seq_len(.N)]
if(length(i1) > 1) combn(i1, length(i1), FUN =paste, collapse=", ")
else as.character(i1)})), group], 'gm', ', ')
data
df1 <- structure(list(group = c(1L, 1L, 1L, 2L, 2L, 3L, 3L, 3L, 3L,
3L), ID = 1:10), .Names = c("group", "ID"), class = "data.frame", row.names = c(NA,
-10L))

Using dplyr and tidyr you could solve this in the following way. First we define a function that solves the problem for a single group, then we simply apply this function to all the groups using do.
library(dplyr)
df <- data.frame(group = rep(1:3, c(3, 2, 5)), ID = 1:10)
# For the rows of a single group, add one column per other member of the
# group (gm1, gm2, ...), holding that member's ID.
# `df` is expected to have a `group` column and, as its second column, `ID`.
add_group_members <- function(df) {
  # Copy of the input whose second column is renamed, so the self-join
  # yields every (ID, gm_id) pair within the group.
  partners <- df
  names(partners)[2] <- "gm_id"
  paired <- inner_join(df, partners, by = c("group" = "group"))
  # Drop the pairs that match an individual with itself.
  others <- filter(paired, ID != gm_id)
  # Number the remaining partners per individual: gm1, gm2, ...
  labelled <- mutate(group_by(others, ID), gm = paste0("gm", row_number()))
  ungroup(tidyr::spread(labelled, key = gm, value = gm_id))
}
df %>% group_by(group) %>% do(add_group_members(.)) %>% ungroup

Another tidyverse solution:
df <- data.frame(x = rep(1:3, c(3, 2, 5)), id = 1:10)
library(tidyverse)
df2 <-
df %>%
group_by(x) %>%
mutate(unique = paste(unique(id), collapse = ","))
df2$group_unique <- map_chr(seq_len(nrow(df2)), function(index) {
row_unique <- as.numeric(strsplit(df2[[index, "unique"]], ",")[[1]])
paste0(setdiff(row_unique, df2[[index, "id"]]), collapse = ",")
})
# Split the comma-separated list of other group members into gm_1, gm_2, ...
# paste0() avoids the space that paste() would put into the column names
# ("gm_ 1"); seq_len() is the safe form of 1:n. fill = "right" pads smaller
# groups with NA without a warning.
df2 %>%
  select(-unique) %>%
  separate(group_unique, paste0("gm_", seq_len(max(table(df$x)) - 1)),
           fill = "right")

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

separate a string separated by ; into columns in R - r

Related

Splitting the data in brackets in R

how to delete one of two duplicates in each column and merge them in r

Subset dataframe based on Values in second dataframe

Reshaping complicating data-set in R

find only the group members in r

Categories

Resources