Coerce data.frame to list by row - r

Starting with a data.frame such as:
df = read.table(text = "ref1 code1,code2
ref2 code3,code4,code5
ref3 code6", stringsAsFactors=F)
names(df) = c('id', 'codes')
print(df)
id codes
1 ref1 code1,code2
2 ref2 code3,code4,code5
3 ref3 code6
wishing for an outcome something like this:
lst = list()
for(i in 1:3) lst[[df[i,1]]] = strsplit(df[i,2], ',')[[1]]
print(lst)
$ref1
[1] "code1" "code2"
$ref2
[1] "code3" "code4" "code5"
$ref3
[1] "code6"
How might it be possible to get to this point without (slow) iteration? as.list(df) only works by column:
$id
[1] "ref1" "ref2" "ref3"
$codes
[1] "code1,code2" "code3,code4,code5" "code6"
Thanks in advance.

Something like this, perhaps:
lapply(split(df$codes,df$id),function(x) strsplit(x,split = ",")[[1]])
$ref1
[1] "code1" "code2"
$ref2
[1] "code3" "code4" "code5"
$ref3
[1] "code6"
Ananda's solution mentioned below is IMHO far superior:
setNames(strsplit(df$codes, ","), df$id)

You may also try this
library(splitstackshape)
ll <- concat.split.list(data = df,
split.col = "codes",
drop = TRUE)[[2]]
names(ll) <- df$id
ll
# $ref1
# [1] "code1" "code2"
#
# $ref2
# [1] "code3" "code4" "code5"
#
# $ref3
# [1] "code6
Update following #Ananda Mahto's comment. Thanks!
setNames(concat.split.list(df, "codes")[["codes_list"]], df$id)

Here's another approach.
> lst <- unlist(apply(df[,2, drop=FALSE], 1, strsplit, ","), recursive=FALSE)
> names(lst) <- df[,1]
$ref1
[1] "code1" "code2"
$ref2
[1] "code3" "code4" "code5"
$ref3
[1] "code6"
Also using setNames for naming the list as in #Henrik's answer
> setNames(unlist(apply(df[,2, drop=FALSE], 1, strsplit, ","), recursive=FALSE), df$id)

Related

Using attribute tables to apply a function between specific elements in lists

I have two list objects. l1 contains information that has been read in through path files. l2 is a list of values that have similar name components as those in l1. I have assigned attributes to both list based on the names of the elements in the list. I would like to reach my expected results using the attributes that I have assigned to my list.
For example: I would like to apply a function mean() between the elements with the attribute id that are "2013_mean" in l1 to those with the attribute year that are also "2013" in l2. I would like to do the similar thing with those when the attribute for year is "2016".
# File List
oldl1 <- list(2,3,4,5)
names(oldl1) <- c("C:/Users/2013_mean.csv",
"C:/Users/2013_median.csv",
"C:/Users/2016_mean.csv",
"C:/Users/2016_median.csv"
)
newl1 <- list(2,3,4,5,8,9)
names(newl1) <- c("C:/Users/2013_mean.csv",
"C:/Users/2013_median.csv",
"C:/Users/2016_mean.csv",
"C:/Users/2016_median.csv",
"C:/Users/2017_mean.csv",
"C:/Users/2017_median.csv"
)
attributes(l1) <- data.frame(id = sub("\\.csv", "", basename(names(l1))),
year = trimws(basename(names(l1)), whitespace = "_.*"))
# Other List
l2 <- list(8,9,10,15,1)
names(l2) <- c("2013_A",
"2013_B",
"2013_C",
"2016_D",
"2016_E")
attributes(l2) <- data.frame(year = trimws(names(l2), whitespace = "_.*"))
expected <- list(mean(c(l1[[1]], l2[[1]])),
mean(c(l1[[1]], l2[[2]])),
mean(c(l1[[1]], l2[[3]])),
mean(c(l1[[3]], l2[[4]])),
mean(c(l1[[3]], l2[[5]]))
)
We may use the attributes to split and match and get the mean
yrs <- intersect(attr(l1, "year"), attr(l2, "year"))
i1 <- grepl("mean", attr(l1, "id"))
i12 <- attr(l1, "year") %in% yrs
i1 <- i1 & i12
i2 <- attr(l2, "year") %in% yrs
l2new <- l2[i2]
l1new <- l1[i1]
attr(l1new, "year") <- attr(l1, "year")[i1]
out <- do.call(c, Map(function(x, y) lapply(x, function(z)
mean(c(z, y))), split(l2new, attr(l2, 'year')[i2]), l1new))
names(out) <- NULL
-checking with OP's expected
> identical(out, expected)
[1] TRUE
Or another option is to convert the list with attributes to a data.frame, do a merge and use rowMeans and then convert to list with as.list
as.list(rowMeans(merge(transform(data.frame(attributes(l2)),
l2 = unlist(l2)),
subset(transform(data.frame(attributes(l1)), l1 = unlist(l1)),
grepl("mean", id), select = c(year, l1)), all.x = TRUE)[-1]))
-output
[[1]]
[1] 5
[[2]]
[1] 5.5
[[3]]
[1] 6
[[4]]
[1] 9.5
[[5]]
[1] 2.5

How to iterate entries in a function to create two new character vectors

I am struggling to separate a single string input into a series of inputs. The user gives a list of FASTA formatted sequences (see example below). I'm able to separate the inputs into their own
ex:
">Rosalind_6404
CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCC
TCCCACTAATAATTCTGAGG
.>Rosalind_5959
CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCT
ATATCCATTTGTCAGCAGACACGC
"
[1] "Rosalind_6404CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCCTCCCACTAATAATTCTGAGG"
[2] "Rosalind_5959CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCTATATCCATTTGTCAGCAGACACGC"
But I am struggling to find a way to create a function that splits the "Rosalind_6404" from the gene sequence to the unknown amount of FASTA sequences while creating new vectors for the split elements.
Ultimately, the result would look something such as:
.> "Rosalind_6404" "Rosalind5959"
.> "CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCCTCCCACTAATAATTCTGAGG","CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCTATATCCATTTGTCAGCAGACACGC"
I was hoping the convert_entries function would allow me to iterate over all the elements of the prepped_s character vector and split the elements into two new vectors with the same index number.
s <- ">Rosalind_6404
CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCC
TCCCACTAATAATTCTGAGG
>Rosalind_5959
CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCT
ATATCCATTTGTCAGCAGACACGC"
split_s <- strsplit(s, ">")
ul_split_s<- unlist(split_s)
fixed_s <- gsub("\n","", ul_split_s)
prepped_s <- fixed_s[-1]
prepped_s
nchar(prepped_s[2])
print(prepped_s[2])
entry_tags <- list()
entry_seqs <- list()
entries <- length(prepped_s)
unlist(entries)
first <- prepped_s[1]
convert_entries <- function() {
for (i in entries) {
tag <- substr(prepped_s[i], start = 1, stop = 13)
entry_tags <- append(entry_tags, tag)
return(entry_tags)
}
}
entry_tags <- convert_entries()
print(entry_tags)
Please help in any way you can, thanks!
One option with tidyverse
library(dplyr)
library(tidyr)
library(stringr)
tibble(col1 = s) %>%
separate_rows(col1, sep="\n") %>%
group_by(grp = cumsum(str_detect(col1, '^>'))) %>%
summarise(prefix = first(col1),
col1 = str_c(col1[-1], collapse=""), .groups = 'drop') %>%
select(-grp)
-output
# A tibble: 2 x 2
prefix col1
<chr> <chr>
1 >Rosalind_6404 CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCCTCCCACTAATAATTCTGAGG
2 >Rosalind_5959 CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCTATATCCATTTGTCAGCAGACACGC
Using seqinr package:
library(seqinr)
# example fasta file
write(">Rosalind_6404
CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCC
TCCCACTAATAATTCTGAGG
>Rosalind_5959
CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCT
ATATCCATTTGTCAGCAGACACGC", "myFile.fasta")
# read the fasta file
x <- read.fasta("myFile.fasta", as.string = TRUE, forceDNAtolower = FALSE)
# get the names
names(x)
# [1] "Rosalind_6404" "Rosalind_5959"
# get the seq
x$Rosalind_6404
# [1] "CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCCTCCCACTAATAATTCTGAGG"
# attr(,"name")
# [1] "Rosalind_6404"
# attr(,"Annot")
# [1] ">Rosalind_6404"
# attr(,"class")
# [1] "SeqFastadna"
In base R you could do:
t(gsub('\n', '', regmatches(s, gregexec("([A-Z][a-z_0-9]+)\n([A-Z\n]+)", s))[[1]][-1,]))
[,1] [,2]
[1,] "Rosalind_6404" "CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCCTCCCACTAATAATTCTGAGG"
[2,] "Rosalind_5959" "CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCTATATCCATTTGTCAGCAGACACGC"
NOTE: I transposed the matrix so that you may vie the results. Ignore the use of t function
Another base R solution:
read.table(text=sub('\n', ' ', gsub('(\\D)\n', '\\1', unlist(strsplit(s, '>')))))
V1 V2
1 Rosalind_6404 CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCCTCCCACTAATAATTCTGAGG
2 Rosalind_5959 CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCTATATCCATTTGTCAGCAGACACGC
or even
proto <- data.frame(name = character(), value = character())
new_s <- gsub('\n', '', unlist(strsplit(s, '>')))
strcapture("([A-Z][a-z_0-9]+)([A-Z]+)", grep('\\w', new_s, value = T), proto)
name value
1 Rosalind_6404 CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCCTCCCACTAATAATTCTGAGG
2 Rosalind_5959 CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCTATATCCATTTGTCAGCAGACACGC

How to elegantly split array of strings in list elements by substring?

How can i elegantly split an Array of strings in subgroups, based on their first character?
Sample data:
c("1_autoa", "1_autob", "1_autoc","2_bier", "3_hundx", "3_hundy")
Desired Output:
[[1]]
[1] "1_autoa" "1_autob" "1_autoc"
[[2]]
[1] "2_bier"
[[3]]
[1] "3_hundx" "3_hundy"
list(
c("1_autoa", "1_autob", "1_autoc"), c("2_bier"), c("3_hundx", "3_hundy"))
What i tried: (Working example, but seems unnessary Long)
library(dplyr)
library(purrr)
library(magrittr)
data <- data.frame(
id = 1:6,
name = c("1_autoa", "1_autob", "1_autoc", "2_bier", "3_hundx", "3_hundy")
)
data$start <- substr(x = data$name, start = 1, stop = 1)
spread(data, start, name) %>%
apply(MARGIN = 2, list) %>%
lapply(FUN = function(x) x[[1]][!is.na(x[[1]])])
Simply
split(x, gsub('\\D+', '', x))
where,
x <- c("1_autoa", "1_autob", "1_autoc","2_bier", "3_hundx", "3_hundy")
v <- c("1_autoa", "1_autob", "1_autoc","2_bier", "3_hundx", "3_hundy")
sapply(1:3, function(i) v[which(sapply(v, function(x) grepl(as.character(i), x)))])
[[1]]
[1] "1_autoa" "1_autob" "1_autoc"
[[2]]
[1] "2_bier"
[[3]]
[1] "3_hundx" "3_hundy"

Extract subset of string in dataframe column

I have one of the columns in the data frame as follows. Need to get the output as shown.
Data :
NM_001104633|0|Sema3d|-
NM_0011042|0|XYZ|-
NM_0956|0|ghd|+
Required output :
Sema3d
XYZ
ghd
x = c("NM_001104633|0|Sema3d|-", "NM_0011042|0|XYZ|-", "NM_0956|0|ghd|+")
sub(".*0\\|(.*)\\|[+|-]", "\\1", x)
#[1] "Sema3d" "XYZ" "ghd"
#OR
sapply(strsplit(x, "\\|"), function(s) s[3])
#[1] "Sema3d" "XYZ" "ghd"
#OR
sapply(x, function(s){
inds = gregexpr("\\|", s)[[1]]
substring(s, inds[2] + 1, inds[3] - 1)
},
USE.NAMES = FALSE)
#[1] "Sema3d" "XYZ" "ghd"
We can use read.table to separate them in different columns and then select only the one which we are interested in.
read.table(text = df$V1, sep = "|")
# V1 V2 V3 V4
#1 NM_001104633 0 Sema3d -
#2 NM_0011042 0 XYZ -
#3 NM_0956 0 ghd +
We can also use tidyr::separate for this
tidyr::separate(df, V1, into = paste0("col1", 1:4), sep = "\\|")
Or cSplit from splitstackshape
splitstackshape::cSplit(df, "V1", sep = "|")
data
df <- structure(list(V1 = c("NM_001104633|0|Sema3d|-", "NM_0011042|0|XYZ|-",
"NM_0956|0|ghd|+")), class = "data.frame", row.names = c(NA, -3L))
The following regex takes all text between the last pair of | followed by a + or a -.
([^\|]*)(?=\|(\+|-))
Demo
We can use sub from base R
sub(".*\\|(\\w+)\\|[-+]$", "\\1", x)
#[1] "Sema3d" "XYZ" "ghd"
Or using gsub
gsub(".*\\d+\\||\\|.*", "", x)
#[1] "Sema3d" "XYZ" "ghd"
data
x <- c("NM_001104633|0|Sema3d|-", "NM_0011042|0|XYZ|-", "NM_0956|0|ghd|+")
The package unglue offers a readable alternative, if not as efficient :
x = c("NM_001104633|0|Sema3d|-", "NM_0011042|0|XYZ|-", "NM_0956|0|ghd|+")
unglue::unglue_vec(x, "{drop1}|0|{keep}|{drop2}",var = "keep")
#> [1] "Sema3d" "XYZ" "ghd"
# or
unglue::unglue_vec(x, "{=.*?}|0|{keep}|{=.*?}")
#> [1] "Sema3d" "XYZ" "ghd"
Or in the data frame directly :
df <- data.frame(col = x)
unglue::unglue_unnest(df, col, "{=.*?}|0|{new_col}|{=.*?}")
#> new_col
#> 1 Sema3d
#> 2 XYZ
#> 3 ghd

Convert data.frame to list of lists

I am trying to figure out how to convert a data.frame to a list of lists. Suppose I had (feel free to modify this if you need to capture more attributes for later):
v <- list(
row1=list(col1 = as.Date("2011-01-23"), col2="A"),
row2=list(col1 = as.Date("2012-03-03"), col2="B"))
d <- do.call(rbind, lapply(v, as.data.frame))
d$col3 <- 2
How do I get d back to a list of lists (similar to v). The end result should be equivalent to the result of:
vr <- list(
row1=list(col1 = as.Date("2011-01-23"), col2="A", col3=2),
row2=list(col1 = as.Date("2012-03-03"), col2="B", col3=2))
You can do
out <- lapply(split(d, rownames(d)), as.list)
out
#$row1
#$row1$col1
#[1] "2011-01-23"
#$row1$col2
#[1] "A"
#$row1$col3
#[1] 2
#$row2
#$row2$col1
#[1] "2012-03-03"
#$row2$col2
#[1] "B"
#$row2$col3
#[1] 2
If you add stringsAsFactors = FALSE when creating d, i.e.
d <- do.call(rbind, lapply(v, as.data.frame, stringsAsFactors = FALSE))
d$col3 <- 2
then
identical(out, vr)
returns TRUE.
You have to go through the columns again making them lists before you pass them as values of the element of the main list. I hope the below code helps:
apply(d,MARGIN = 1, FUN=function(x){as.list(x)})

Resources