removing sublists from a list - r

I have a list of 155 elements, each containing 3 lists.
Below I made a small example. I am only interested in keeping the values in gene, and I am trying in R to remove the first and second list of each element all at once, leaving me only the values in gene.
test <- list(name="Adipose", desc= "Roche", gene = c("KRT14", "RPE65"))
test1 <- list(name="muscle", desc= "Roche", gene = c("THRSP", "KRT14"))
test2 <- list(name="WBC" , desc= "Roche", gene = c("RBP4", "CCDC80"))
x <- c(test,test1, test2)
How to achieve that?

As shown by the dput you posted in the comments, your actual data structure is a list of lists. In this case, you can use an lapply to get what you want:
# Example data: a named list of records, each holding `name`, `desc` and `genes`.
# Stored as `tissues` rather than `list` so that base::list() is not masked.
tissues <- structure(
  list(
    Adipose = structure(
      list(name = "Adipose", desc = "Roche", genes = c("ACACB", "ACP5", "ACTA1")),
      .Names = c("name", "desc", "genes")
    ),
    WBC = structure(
      list(name = "WBC ", desc = "Roche", genes = c("THRSP", "KRT14", "APOB", "LEP")),
      .Names = c("name", "desc", "genes")
    )
  ),
  .Names = c("Adipose ", "WBC ")
)
# Keep only the element called "genes" in every record. `[` (not `[[`) keeps
# each result as a one-element list, which is what the printed output shows.
lapply(tissues, function(x) x[names(x) == "genes"])
#$`Adipose `
#$`Adipose `$genes
#[1] "ACACB" "ACP5" "ACTA1"
#
#$`WBC `
#$`WBC `$genes
#[1] "THRSP" "KRT14" "APOB" "LEP"

Related

Find a row whose string contains a certain string, then take the row above it, the matching row and the row below it, and move them to a new dataframe

So i have a table that looks like this:
I want to search through the first column and, every time I see nl.audio, take the row above it, the nl.audio row itself and the row right under it, and move them into a new data frame (one column each) so it looks like this:
not sure how to go about doing this.
the table comes from trying to get nested json values into a dataframe. like this
library(jsonlite)
library(tidyverse)
files <- list.files(path=".", pattern=".json", all.files=FALSE,
full.names=FALSE)
data <- fromJSON(files[1])
dat2 <- unlist(data$translation_map)
dat2 <- as.data.frame(dat2)
dput:
structure(list(dat2 = c("Iraat.",
" _1645805605.mp3",
"Ie.", "wn", "", "Wdis.",
"ewdewf.mp3",
"wedew.", "[k]ws.[/k]",
" _1645805740.mp3",
"edwedwedw.", "Ik ewwewe[/k].",
"we45805760.mp3",
"I h89.", "ewd3n", "", "ad23dt", "",
"Ik d2. ", "I d2d3.",
"Ha3d3d/k] 20.", "H3d20.",
"id3n", "", "straat")), row.names = c("str-5e854867d9c6.nl.value",
"str_f15f7751-227dc6.nl.audio", "str_f15f7751.en.value",
"str.nl.value", "str_172a516ca.en.value",
"str_4567f686.nl.value", "str_4.nl.audio",
"stcb0ca14.en.value", "str_622f99395.nl.value",
"str_622f9395.nl.audio", "str_622f90de9395.en.value",
"str_f25afe16.nl.value", "str_f2fad09045afe16.nl.audio",
"str_f2fad89045afe16.en.value", "s9e844c432e80.nl.value",
"str_b0c1b42e80.en.value", "str_e6d847f3-60b7-.nl.value",
"str_.en.value", "str_b61f9404-.nl.value",
"str_ b.en.value", "str_76e28ea6.nl.value",
"str-61a1b83bf1ba.en.value", "str_6280d5a49c42a24.nl.value",
"str5-0d5a49c42a24.en.value", "str_5e6b2202e748.nl.value"
), class = "data.frame")
Something like this:
library(dplyr)
library(stringr)
# For every row whose key (V1) contains "nl.audio", build one output row:
#   A = value of the row above, B = the mp3 file name, C = value of the row below.
df %>%
  # Trim and collapse whitespace in every column (the raw values carry tabs).
  # `.cols` is given explicitly: calling across() without it is deprecated.
  mutate(across(everything(), str_squish)) %>%
  mutate(
    # fixed() makes the "." in "nl.audio" a literal dot, not "any character"
    A = ifelse(str_detect(V1, fixed("nl.audio")), lag(V2), NA_character_),
    # B = str_extract(V2, "\\d+\\.mp3"),
    # The dot before "mp3" is escaped so only a real ".mp3" suffix matches
    B = str_extract(V2, ".*\\.mp3$"),
    C = ifelse(str_detect(V1, fixed("nl.audio")), lead(V2), NA_character_),
    .keep = "unused"
  ) %>%
  # Rows that are not audio rows have NA in A, B and C and are dropped here
  na.omit()
A B C
2 nstraat. 1645805605.mp3 constraat.
7 tihdhis. 645805622.mp3 use.
df <- structure(list(V1 = c("str_f15d9c6.nl.value", "47c-5e854867d9c6.nl.audio",
"5e854867d9c6.en.value", "92bd-91b8f180bd3a.nl.value", "4-92bd-91b8f180bd3a.en.value",
"40a8-88ef-5890ecbOca14.nl.value", "890ecbOca14.nl.audio", "ca14.en.value"
), V2 = c("\tnstraat.", "\t1645805605.mp3", "\tconstraat.", "\tlemons",
" \t", "\ttihdhis.", "\t645805622.mp3", "\tuse.")), class = "data.frame", row.names = c(NA,
-8L))
We may need grep to find the index. Then add and subtract 1 to the index and extract the values from the second column based on that index (assuming data.frame columns)
# Rows whose key (first column) contains the literal text "nl.audio"
i1 <- grep("nl.audio", df1[[1]], fixed = TRUE)
# Row directly above each match. A match in row 1 has no row above it; use NA
# there, because an index of 0 would be dropped silently and leave the three
# columns with different lengths.
prev_ind <- i1 - 1L
prev_ind[prev_ind < 1L] <- NA_integer_
# Row directly below each match (indexing past the end already yields NA)
next_ind <- i1 + 1L
# One output row per match: value above, the audio value itself, value below
data.frame(col1 = df1[[2]][prev_ind],
           col2 = df1[[2]][i1],
           col3 = df1[[2]][next_ind])

How to rbind multiple dataframes with a while-loop?

I'm trying to rbind multiple loaded datasets (all of them have the same num. of columns, named "num", "source" and "target"). In case, I have ten dataframes, which names are "test1", "test2", "test3" and so on...
I thought that the solution below (creating an empty dataframe and looping through the others) would solve my problem, but I guess that I'm missing something in the second argument of the rbind function. I don't know if using paste0("test", i) to increment the variable (changing the name of the dataframe) is correct... I'm afraid that I'm just trying to rbind a dataframe with a string object (and getting an error), is that right?
# Asker's attempt: start from an empty 0-row data frame with the three
# expected columns and try to append test1 ... test10 to it in a loop.
test = as.data.frame(matrix(ncol = 3, nrow = 0)) %>%
setNames(c("num", "source", "target"))
i=1
# Runs for i = 1, ..., 10
while (i < 11) {
# NOTE(review): paste0("test", i) evaluates to the character string "test1",
# "test2", ... and not to the data frame of that name, so a string is what is
# passed to rbind() here.
test = rbind(test, paste0("test", i))
i = i + 1
}
We need replicate to return as a list
out <- setNames(replicate(10, test, simplify = FALSE),
paste0("test", seq_len(10)))
If there are multiple datasets already created in the global env, get those in to a list and rbind within do.call
out <- do.call(rbind, mget(paste0("test", 1:10)))
We could bind test1:test10 using the common pattern in the name:
library(dplyr)
result <- mget(ls(pattern="^test\\d+")) %>%
bind_rows()
If I understood correctly, this might help you
Libraries
library(dplyr)
Example data
list_of_df <-
list(
df1 = data.frame(a = "1"),
df2 = data.frame(a = "2"),
df3 = data.frame(a = "1"),
df4 = data.frame(a = "2")
)
Code
bind_rows(list_of_df,.id = "dataset")
Result
dataset a
1 df1 1
2 df2 2
3 df3 1
4 df4 2

subset df according nested list while there is a white space

I have a data frame and I would like to subset it according to specific values. When I tried to do it, there was a problem because of the white space inside the values in sample_df$mentions.
I used this script for subsetting the data frame:
sample_list <- list()
for (i in colnames(sample_name)){
sample_list <- sapply(sample_df$mentions, function(x)any(x %in% sample_name[[i]]))
new_sample_df <- sample_df[sample_list,]
}
I have tried strsplit function to get rid of the space but it has created other problems.
sample_df$mentions <- strsplit(as.charater(sample_df$mentions),"[[:space:]]")
Thank you for your help in advance.
My expected outcome should be like this:
mentions screen_name
5 islambey1453, hamzayerlikaya, tahaayhan, hidoturkoglu15 ak_Furkan54
10 nurhandnci, SSSBBL777, serkanacar007, Chequevera06, kubilayy81 tanrica_gaia
sample_name reproducible data:
sample_name <- structure(list(Name = structure(2:1, .Label = c("hamzayerlikaya",
"SSSBBL777"), class = "factor")), row.names = c(NA, -2L), class = "data.frame")
sample_df reproducible data:
sample_df <- structure(list(mentions = list(character(0), "srgnsnmz92", character(0),
"Berivan_Aslan_", c("islambey1453", " hamzayerlikaya", " tahaayhan",
" hidoturkoglu15"), character(0), "themarginale", character(0),
character(0), c("nurhandnci", " SSSBBL777", " serkanacar007",
" Chequevera06", " kubilayy81")), screen_name = c("SaadetYakar",
"beraydogru", "EL_Turco_DLC", "hebunagel", "ak_Furkan54", "zaferakyol011",
"melmitem", "mobbingabla", "BekarKronik", "tanrica_gaia")), row.names = c(NA,
10L), class = "data.frame")
We can loop through the 'Name' and use that in grepl, Reduce it to a single logical vector and subset the rows of 'sample_df'
sample_df[Reduce(`|`, lapply(as.character(sample_name$Name),
grepl, x = sample_df$mentions)),]
# mentions screen_name
#5 islambey1453, hamzayerlikaya, tahaayhan, hidoturkoglu15 ak_Furkan54
#10 nurhandnci, SSSBBL777, serkanacar007, Chequevera06, kubilayy81 tanrica_gaia
NOTE: This would work with any length of 'Name' column
Another option is regex_inner_join
library(fuzzyjoin)
library(tidyverse)
regex_inner_join(sample_df, sample_name, by = c("mentions" = "Name")) %>%
select(mentions, screen_name)
# mentions screen_name
#1 islambey1453, hamzayerlikaya, tahaayhan, hidoturkoglu15 ak_Furkan54
#2 nurhandnci, SSSBBL777, serkanacar007, Chequevera06, kubilayy81 tanrica_gaia
Since mentions is a list we can use sapply and select only those rows in sample_df where any of the mentions has Name in it.
# Keep the rows where at least one mention matches `pattern`. vapply() pins the
# result to a logical vector (sapply() would return list() for an empty
# `mentions` column, which cannot be used as a row index).
sample_df[vapply(sample_df$mentions, function(x) any(grepl(pattern, x)), logical(1)), ]
# mentions screen_name
#5 islambey1453, hamzayerlikaya, tahaayhan, hidoturkoglu15 ak_Furkan54
#10 nurhandnci, SSSBBL777, serkanacar007, Chequevera06, kubilayy81 tanrica_gaia
where pattern is
pattern = paste0("\\b", sample_name$Name, "\\b", collapse = "|")

Removing the special symbols in data.frame column values

I have two data frame each with a column Name
df1:
name
#one2
!iftwo
there_2_go
come&go
df1 = structure(list(name = c("#one2", "!iftwo", "there_2_go", "come&go")),.Names = c("name"), row.names = c(NA, -4L), class = "data.frame")
df2:
name
One2
IfTwo#
there-2-go
come.go
df2 = structure(list(name = c("One2", "IfTwo#", "there-2-go", "come.go")),.Names = c("name"), row.names = c(NA, -4L), class = "data.frame")
Comparing the two data frames for inequality with %in% is cumbersome because of the special symbols. Removing the special symbols with stringr could be useful, but how exactly can we use stringr functions together with %in% and display the mismatches between the two?
I have already used mutate() to convert everything to lowercase with tolower(), as follows
df1<-mutate(df1,name=tolower(df1$name))
df2<-mutate(df2,name=tolower(df2$name))
Current output of comparison:
df2[!(df2 %in% df1),]
[1] "one2" "iftwo#" "there-2-go" "come.go"
Expected output as essentially the contents are same but with special symbols:
df2[!(df2 %in% df1),]
character(0)
Question : How do we ignore the symbols in the contents of the Frame
Here it is in a function,
#' Flag names in `df1` that have no counterpart in `df2`
#'
#' Both `name` columns are normalised by removing punctuation and converting
#' to lower case. A name in `df1` counts as matched when any normalised name
#' from `df2` occurs inside it (regex alternation, i.e. a substring match).
#'
#' @param df1 Data frame (or list) with a character column `name`.
#' @param df2 Data frame (or list) with a character column `name`.
#' @return A logical vector, one element per row of `df1`, named by the
#'   normalised `df1` names: `TRUE` where no `df2` name matched.
f1 <- function(df1, df2) {
  i1 <- tolower(gsub("[[:punct:]]", "", df1$name))
  i2 <- tolower(gsub("[[:punct:]]", "", df2$name))
  # A name made only of punctuation normalises to "". An empty alternative in
  # the pattern would match every string, so such keys are dropped.
  i2 <- i2[nzchar(i2)]
  if (length(i2) == 0L) {
    # Nothing to compare against: no name can be matched
    d1 <- rep(FALSE, length(i1))
  } else {
    # The pattern is built once; grepl() is vectorised over i1
    d1 <- grepl(paste(i2, collapse = "|"), i1)
  }
  names(d1) <- i1
  !d1
}
# Pass df1, the first data frame from the question; a bare `df` is not defined
# in this example and would resolve to the function stats::df.
f1(df1, df2)
# one2 iftwo there2go comego
# FALSE FALSE FALSE FALSE
#or use it for indexing,
df2[f1(df1, df2),]
#character(0)

R: Building a list from matching values in a data.frame

I have a 3 column data frame which looks a little like this:
id name links
1 134235 dave "34657","34563","23459"
2 23459 mary "134235","45868","45677"
3 165432 jane "134235","23459","44657"
where id and name values are unique, and links is a string of ids which indicate an association with some of the names in each row. So for example dave includes the links id 23459 which is mary so dave is connected to mary. What I need to produce is a pair list of all the connections in the data so with the example data I would output something like:
dave,mary
mary,dave
jane,dave
jane,mary
Very new to R and seen amazing things done with methods like apply and before going off and trying to replicate a solution which would look more like a javascript routine and be very inefficient I wondered if anyone could help.
One solution, using Matt's dput():
tab <- structure(list(
id = c("134235", "23459", "165432"),
name = c("dave", "mary", "jane"),
links = c("'34657', '34563', '23459'",
"'134235', '45868', '45677'",
"'134235', '23459', '44657'")),
.Names = c("id", "name", "links"),
row.names = c(NA, -3L), class = "data.frame")
#' Build "name,linked_name" pairs for one row of `tab`
#'
#' Reads the lookup table `tab` (columns `id` and `name`) from the enclosing
#' environment.
#'
#' @param name Name of the person whose links are being resolved.
#' @param links A single string of comma-separated ids, optionally wrapped in
#'   single or double quotes, e.g. "'34657', '34563'".
#' @return Character vector with one "name,linked_name" entry per id in
#'   `links` that is present in `tab$id`; `character(0)` when none is.
conns <- function(name, links) {
  # Strip the quotes, split on commas and trim the blanks left after each comma.
  # The ids stay character: converting them to numeric makes %in% compare
  # against strings such as "1e+05", so an id like "100000" is never found.
  # NOTE(review): assumes tab$id is a character column, as in the dput above.
  link_ids <- trimws(unlist(strsplit(gsub("'|\"", "", links), ",", fixed = TRUE)))
  linked_names <- tab$name[tab$id %in% link_ids]
  if (length(linked_names) == 0L) {
    # paste() on a zero-length vector would return the dangling string "name,"
    return(character(0))
  }
  paste(name, linked_names, sep = ",")
}
connections <- unname(unlist(mapply(conns, tab$name, tab$links,
SIMPLIFY=FALSE)))
The first step should be to normalize the data, in particular, parse the strings.
You can use ddply: it applies a function
that takes a chunk of a data.frame (a row, in our case)
and transforms it in some way. You just have to write a function
that works on one row, i.e., on one string.
# Sample data
n <- 10
k <- 3
ids <- as.character(unique(round(1e5*runif(n))))
n <- length(ids)
names <- LETTERS[1:n]
links <- lapply( ids, function(u)
sample(setdiff(ids,u),k,replace=FALSE) )
links <- sapply( links, function(u)
paste( '"', paste(u,collapse='","'), '"', sep="" ) )
d <- data.frame(
id=ids,
name=names,
links=links,
stringsAsFactors=FALSE
)
library(plyr)
library(stringr)
dd <- ddply(
d,
c("id", "name"),
function(u) data.frame(
id=u$id,
name=u$name,
link=unlist(str_split( str_replace_all( u$links, '"', '' ), "," ))
))
You can then join the data, either with merge or sqldf.
library(sqldf)
sqldf("
SELECT A.name, B.name
FROM dd AS A, d AS B
WHERE A.link = B.id
")
dat<- structure(list(
id = c("134235", "23459", "165432"),
name = c("dave", "mary", "jane"),
links = c("'34657', '34563', '23459'",
"'134235', '45868', '45677'",
"'134235', '23459', '44657'")),
.Names = c("id", "name", "links"),
row.names = c(NA, -3L), class = "data.frame")
# It can all be done in base, of course...
library(stringr)
library(reshape2)
# This would be easy to do if links weren't in that format -
# one record per id-link pair would be preferable.
# Split dat$links and remove any quotes
# Split dat$links into three columns (one per linked id) after removing the
# quote characters. Argument names are spelled out in full instead of relying
# on partial matching (`replace` -> `replacement`).
dat.wider <- data.frame(
  dat[, c("id", "name")],
  str_split_fixed(
    string = gsub(pattern = "['|\"]", replacement = "", x = dat$links),
    pattern = ", ",
    n = 3
  )
)
# Reshape to long form: one row per (id, name, linked id)
dat.long <- melt(dat.wider, id.vars = c("id", "name"))
# Look every linked id up in the original id/name table. Joining against `dat`
# (one row per id) rather than against dat.long avoids the duplicated rows a
# self-join produces. unique() still collapses a link listed twice for a person.
dat.joined <- unique(merge(
  x = dat.long[, c("name", "value")],
  y = dat[, c("id", "name")],
  by.x = "value",
  by.y = "id"
))
# And, finally, if you wanted vector output...
res <- with(dat.joined, paste(name.x, name.y, sep = ", "))

Resources