Loop through columns and split fields automatically to new column - r

I have been comparing two data frames in R using a package called daff and this is the final table I get:
dput(df)
structure(list(v1 = c("Silva->Silva/Mark", "Brandon->Brandon/Livo", "Mango->Mango or Apple"),
v2 = c("James->James=Jacy","NA->Na/Jane", "Egg->Egg and Orange")),
class = "data.frame", row.names = c(NA, -3L))
The cell values contain -> (an arrow) to show that the data in that cell was modified from the value in the previous data frame to the value in the current data frame. From here I had to split each column on the -> separator so that I have an old column and a new (changed) column. This means I added the suffixes _Old and _New to the new columns. I used this code; see the output:
setDT(df)
df1<- lapply(names(df), function(x) {
mDT <- df[, tstrsplit(get(x), " *-> *")]
if (ncol(mDT) == 2L) setnames(mDT, paste0(x, c("_Old", "_New")))
}) %>% as.data.table()
OUTPUT
dput(df)
structure(list(v1_Old = c("Silva", "Brandon", "Mango"),
v1_New = c("Silva/Mark", "Brandon/Livo", "Mango or Apple"),
v2_Old = c("James","NA", "Egg"),
v2_New = c("James=Jacy","Na/Jane", "Egg and Orange")),
class = "data.frame", row.names = c(NA, -3L))
Now my next step is to compare every pair of columns that have the _Old and _New suffixes to identify what was modified, then split and store the difference in new columns called diff_v1 and diff_v2. I did this using the code below (note that I have to do this manually by writing a separate splitting line for each pair, which is tedious with over 20 separated columns):
# Manual approach, one statement per Old/New pair: split both values on the
# same delimiters and keep the tokens of the new value that are absent from
# the old one. Column names must match the split output exactly (v1_Old,
# v1_New, ...): `$` is case-sensitive, so a misspelled name yields NULL and
# strsplit() fails with "non-character argument".
df$diff_v1 <- mapply(function(x, y) paste(setdiff(y, x), collapse = '| '), strsplit(df$v1_Old, '\\||, | | -| \\+'), strsplit(df$v1_New, '\\||, | | -| \\+'))
df$diff_v2 <- mapply(function(x, y) paste(setdiff(y, x), collapse = '| '), strsplit(df$v2_Old, '\\||, | | -| \\+'), strsplit(df$v2_New, '\\||, | | -| \\+'))
OUTPUT
dput(df)
structure(list(v1_Old = c("Silva", "Brandon", "Mango"),
v1_New = c("Silva/Mark", "Brandon/Livo", "Mango or Apple"),
diff_v1 = c("/Mark", "/Livo", "or Apple"),
v2_Old = c("James","NA", "Egg"),
v2_New = c("James=Jacy","Na/Jane", "Egg and Orange"),
diff_v2 = c("=Jacy","/Jane", "and Orange")),
class = "data.frame", row.names = c(NA, -3L))
My question is: can I loop through the columns with the _Old and _New suffixes and create new columns called diff_v1 and diff_v2 respectively, without running the code line by line? I have multiple columns, and they keep changing depending on the data frames I am comparing. I want to know how I can use code to automatically identify the columns with the _Old and _New suffixes, split them, and then create the new column after the two, for each pair of columns.
Currently I have to go to the data frame, check columns with old and new then manually change in the code that is splitting and creating diff column

We could identify "Old" and "New" columns based on their name using grep. We can use str_remove which is vectorized over string and pattern to remove part of "Old" col which is present in "New" col to create new columns.
# Find the "Old" and "New" columns by name suffix. The element-wise pairing
# below assumes both sets appear in the same relative order in names(df).
old_cols <- grep("Old$", names(df), value = TRUE)
new_cols <- grep("New$", names(df), value = TRUE)
# Map() would silently recycle the shorter set, so require a one-to-one match.
stopifnot(length(old_cols) == length(new_cols))
# Strip the old value out of the new value, matching it literally: fixed()
# keeps characters such as "+", "(" or "." in the data from being read as
# regular-expression syntax. Results are stored as <prefix>_diff.
df[sub("New$", "diff", new_cols)] <- Map(
  function(new, old) stringr::str_remove(new, stringr::fixed(old)),
  df[new_cols], df[old_cols]
)
To get the names in order, we can do
df <- df[order(sub("_.*", "", names(df)))]
df
# v1_Old v1_New v1_diff v2_Old v2_New v2_diff
#1 Silva Silva/Mark /Mark James James=Jacy =Jacy
#2 Brandon Brandon/Livo /Livo NA Na/Jane Na/Jane
#3 Mango Mango or Apple or Apple Egg Egg and Orange and Orange
Using tidyverse, we can do
library(tidyverse)
df %>%
  bind_cols(
    # For each New/Old pair, remove the old value from the new value.
    map2(
      df %>% select(ends_with("New")),
      df %>% select(ends_with("Old")),
      stringr::str_remove
    ) %>%
      # map2() keeps the names of the "New" columns; rename the results to
      # <prefix>_diff so they do not duplicate existing column names (which
      # bind_cols() would otherwise repair into names like `v1_New...5`).
      set_names(~ str_replace(.x, "New$", "diff"))
  )

Related

find a row that has a string that contains a certain string, then take the row on top, the string row and the row under, and move them to a new dataframe

So i have a table that looks like this:
I want to search through the first column, and every time I see nl.audio, take the row on top, the nl.audio row and the row right under it, and move them to a new column so it looks like this:
not sure how to go about doing this.
the table comes from trying to get nested json values into a dataframe. like this
library(jsonlite)
library(tidyverse)
files <- list.files(path=".", pattern=".json", all.files=FALSE,
full.names=FALSE)
data <- fromJSON(files[1])
dat2 <- unlist(data$translation_map)
dat2 <- as.data.frame(dat2)
dput:
structure(list(dat2 = c("Iraat.",
" _1645805605.mp3",
"Ie.", "wn", "", "Wdis.",
"ewdewf.mp3",
"wedew.", "[k]ws.[/k]",
" _1645805740.mp3",
"edwedwedw.", "Ik ewwewe[/k].",
"we45805760.mp3",
"I h89.", "ewd3n", "", "ad23dt", "",
"Ik d2. ", "I d2d3.",
"Ha3d3d/k] 20.", "H3d20.",
"id3n", "", "straat")), row.names = c("str-5e854867d9c6.nl.value",
"str_f15f7751-227dc6.nl.audio", "str_f15f7751.en.value",
"str.nl.value", "str_172a516ca.en.value",
"str_4567f686.nl.value", "str_4.nl.audio",
"stcb0ca14.en.value", "str_622f99395.nl.value",
"str_622f9395.nl.audio", "str_622f90de9395.en.value",
"str_f25afe16.nl.value", "str_f2fad09045afe16.nl.audio",
"str_f2fad89045afe16.en.value", "s9e844c432e80.nl.value",
"str_b0c1b42e80.en.value", "str_e6d847f3-60b7-.nl.value",
"str_.en.value", "str_b61f9404-.nl.value",
"str_ b.en.value", "str_76e28ea6.nl.value",
"str-61a1b83bf1ba.en.value", "str_6280d5a49c42a24.nl.value",
"str5-0d5a49c42a24.en.value", "str_5e6b2202e748.nl.value"
), class = "data.frame")
Something like this:
library(dplyr)
library(stringr)
df %>%
  # Trim and collapse whitespace in every column (the raw values carry tabs).
  # `everything()` is spelled out because across() without `.cols` is deprecated.
  mutate(across(everything(), str_squish)) %>%
  mutate(
    # On an "nl.audio" row, A is the value one row above and C the value one
    # row below; fixed() matches the dot in "nl.audio" literally.
    A = ifelse(str_detect(V1, fixed('nl.audio')), lag(V2), NA_character_),
    # B = str_extract(V2, '\\d+.mp3'),
    # B is the value itself when it ends in a literal ".mp3".
    B = str_extract(V2, '.*\\.mp3$'),
    C = ifelse(str_detect(V1, fixed('nl.audio')), lead(V2), NA_character_),
    .keep = "unused"
  ) %>%
  # Rows that are not audio rows have NA in A/B/C and are dropped here.
  na.omit()
A B C
2 nstraat. 1645805605.mp3 constraat.
7 tihdhis. 645805622.mp3 use.
df <- structure(list(V1 = c("str_f15d9c6.nl.value", "47c-5e854867d9c6.nl.audio",
"5e854867d9c6.en.value", "92bd-91b8f180bd3a.nl.value", "4-92bd-91b8f180bd3a.en.value",
"40a8-88ef-5890ecbOca14.nl.value", "890ecbOca14.nl.audio", "ca14.en.value"
), V2 = c("\tnstraat.", "\t1645805605.mp3", "\tconstraat.", "\tlemons",
" \t", "\ttihdhis.", "\t645805622.mp3", "\tuse.")), class = "data.frame", row.names = c(NA,
-8L))
We may need grep to find the index. Then add and subtract 1 to the index and extract the values from the second column based on that index (assuming data.frame columns)
# Rows whose first column contains the literal text "nl.audio".
i1 <- grep("nl.audio", df1[[1]], fixed = TRUE)
# Neighbouring rows: directly above and directly below each match.
prev_ind <- i1 - 1
# A match on the first row has no row above; use NA rather than index 0,
# which R would drop silently and leave the columns with unequal lengths.
prev_ind[prev_ind < 1] <- NA
# An index past the last row already yields NA when used for extraction.
next_ind <- i1 + 1
# One output row per match: value above, the matching audio value, value below.
data.frame(col1 = df1[[2]][prev_ind],
           col2 = df1[[2]][i1],
           col3 = df1[[2]][next_ind])

Joining two dataframes in R

Sorry if this is a super basic question, but I've run into an issue while working on my R project. Basically I have two data frame objects: one which is a master list of genes and their level of expression in various patients, and one which is only a single column in size. The one with only a single column is a list of genes that fall under a specific subcategory of genes, all of which are in the master list. I am trying to create a data frame where I have my specific subset of genes AND their expression across the different patients, which is contained in the master list. I tried using the merge() function but only an empty dataframe was created.
Basically the code goes something like: new_dataframe <- merge(master_list, specific_gene_list, by = "gene"). I thought this code should look at my master list, find all the genes in the specific list, and then take only those genes and add the columns for patient expression. However, my data frame is empty: it creates a dataframe with all of the columns of the master list but no values filled in. Any help is greatly appreciated.
A visual example:
Master data frame
x: 1
y: 3
z : 4
w: 6
Specific data frame:
x
y
Desired data frame:
x: 1
y: 3
We can use regex_inner_join from fuzzyjoin
library(fuzzyjoin)
df3 <- regex_inner_join(df1, df2, by = 'gene') %>%
transmute(gene = gene.x)
df3
# gene
#1 x: 1
#2 y: 3
data
df1 <- structure(list(gene = c("x: 1", "y: 3", "z: 4", "w: 6")),
class = "data.frame", row.names = c(NA,
-4L))
df2 <- structure(list(gene = c("x", "y")), class = "data.frame", row.names = c(NA,
-2L))
You could also split the column by the colon and add a new column to merge the dataframes.
mergecol <- c("x: 1",
"y: 3",
"z: 4",
"w: 6")
df <- cbind(mergecol, as.data.frame(do.call(rbind, strsplit(mergecol, ':'))))
df2 <- data.frame(V1 = c('x', 'y'))
mergedf <- merge(df, df2, by="V1")
result <- c('x: 1', 'y: 3')
assertthat::are_equal(result, mergedf$mergecol)
#[1] TRUE
You can separate the columns in master_list using separate, join with specific_gene_list and again combine the columns with unite.
library(dplyr)
library(tidyr)
master_list %>%
separate(gene, c('gene', 'value'), sep = ':\\s*') %>%
inner_join(specific_gene_list, by = 'gene') %>%
unite(gene, gene, value, sep = " : ")
# gene
#1 x : 1
#2 y : 3

subset df according nested list while there is a white space

I have a data frame and I would like to subset it according to specific values. When I tried to do it, there was a problem because of the white space inside the values in sample_df$mentions.
I used this script for subsetting the data frame:
sample_list <- list()
for (i in colnames(sample_name)){
sample_list <- sapply(sample_df$mentions, function(x)any(x %in% sample_name[[i]]))
new_sample_df <- sample_df[sample_list,]
}
I have tried strsplit function to get rid of the space but it has created other problems.
# Attempted fix: coerce `mentions` to character and split on whitespace.
# NOTE: `mentions` is a list column, so as.character() deparses multi-element
# entries into text such as 'c("a", " b")' before the split is applied.
sample_df$mentions <- strsplit(as.character(sample_df$mentions), "[[:space:]]")
Thank you for your help in advance.
My expected outcome should be like this:
mentions screen_name
5 islambey1453, hamzayerlikaya, tahaayhan, hidoturkoglu15 ak_Furkan54
10 nurhandnci, SSSBBL777, serkanacar007, Chequevera06, kubilayy81 tanrica_gaia
sample_name reproducible data:
sample_name <- structure(list(Name = structure(2:1, .Label = c("hamzayerlikaya",
"SSSBBL777"), class = "factor")), row.names = c(NA, -2L), class = "data.frame")
sample_df reproducible data:
sample_df <- structure(list(mentions = list(character(0), "srgnsnmz92", character(0),
"Berivan_Aslan_", c("islambey1453", " hamzayerlikaya", " tahaayhan",
" hidoturkoglu15"), character(0), "themarginale", character(0),
character(0), c("nurhandnci", " SSSBBL777", " serkanacar007",
" Chequevera06", " kubilayy81")), screen_name = c("SaadetYakar",
"beraydogru", "EL_Turco_DLC", "hebunagel", "ak_Furkan54", "zaferakyol011",
"melmitem", "mobbingabla", "BekarKronik", "tanrica_gaia")), row.names = c(NA,
10L), class = "data.frame")
We can loop through the 'Name' and use that in grepl, Reduce it to a single logical vector and subset the rows of 'sample_df'
sample_df[Reduce(`|`, lapply(as.character(sample_name$Name),
grepl, x = sample_df$mentions)),]
# mentions screen_name
#5 islambey1453, hamzayerlikaya, tahaayhan, hidoturkoglu15 ak_Furkan54
#10 nurhandnci, SSSBBL777, serkanacar007, Chequevera06, kubilayy81 tanrica_gaia
NOTE: This would work with any length of 'Name' column
Another option is regex_inner_join
library(fuzzyjoin)
library(tidyverse)
regex_inner_join(sample_df, sample_name, by = c("mentions" = "Name")) %>%
select(mentions, screen_name)
# mentions screen_name
#1 islambey1453, hamzayerlikaya, tahaayhan, hidoturkoglu15 ak_Furkan54
#2 nurhandnci, SSSBBL777, serkanacar007, Chequevera06, kubilayy81 tanrica_gaia
Since mentions is a list we can use sapply and select only those rows in sample_df where any of the mentions has Name in it.
sample_df[sapply(sample_df$mentions, function(x) any(grepl(pattern, x))), ]
# mentions screen_name
#5 islambey1453, hamzayerlikaya, tahaayhan, hidoturkoglu15 ak_Furkan54
#10 nurhandnci, SSSBBL777, serkanacar007, Chequevera06, kubilayy81 tanrica_gaia
where pattern is
pattern = paste0("\\b", sample_name$Name, "\\b", collapse = "|")

removing sublists from a list

I have a list of 155 elements, each containing 3 lists.
Below I made a small example. I am only interested in keeping the values in gene, and am trying in R to remove the first and second list of each element all at once, leaving me only the values in gene.
test <- list(name="Adipose", desc= "Roche", gene = c("KRT14", "RPE65"))
test1 <- list(name="muscle", desc= "Roche", gene = c("THRSP", "KRT14"))
test2 <- list(name="WBC" , desc= "Roche", gene = c("RBP4", "CCDC80"))
x <- c(test,test1, test2)
How to achieve that?
As shown by the dput you posted in the comments, your actual data structure is a list of lists. In this case, you can use an lapply to get what you want:
list <- structure(list(Adipose = structure(list(name = "Adipose", desc = "Roche", genes = c("ACACB", "ACP5", "ACTA1")), .Names = c("name", "desc", "genes")), WBC = structure(list( name = "WBC ", desc = "Roche", genes = c("THRSP", "KRT14", "APOB", "LEP")), .Names = c("name", "desc", "genes"))), .Names = c("Adipose ", "WBC "))
lapply(list, function(x) x[names(x)=="genes"])
#$`Adipose `
#$`Adipose `$genes
#[1] "ACACB" "ACP5" "ACTA1"
#
#$`WBC `
#$`WBC `$genes
#[1] "THRSP" "KRT14" "APOB" "LEP"

select elements within one column according to a value from another column

I am a new learner of R. Currently, I am working on some infinium 450k data. I have some data like this:
IlmnID | RefGene_Location | RefGene_Name
------------- | ---------------------------------------------------------------| ----------------------------------------------
cg27656579 | Body;5'UTR;5'UTR;5'UTR | MIR5096;GNG4;GNG4;GNG4
cg03503114 | TSS1500;1stExon;1stExon;5'UTR;1stExon;5'UTR;5'UTR;5'UTR;1stExon| CAPZA1;ST7L;ST7L;ST7L;ST7L;ST7L;ST7L;ST7L;ST7L
The elements in RefGene_Location columns are corresponding to the genes in RefGene_Name columns. What I want is retaining genes that correspond to "5'UTR", "1stExon" and "TSS1500", but not "Body". Finally, compile them into
results like this:
IlmnID | RefGene_Name
----------- | ------------
cg27656579 | GNG4
cg03503114 | CAPZA1
cg03503114 | ST7L
You can use tidyverse package. First split and unnest the columns. Filter and remove duplicates.
library(tidyverse)
df %>%
mutate_at(vars(-ID), funs(strsplit(., ';'))) %>%
unnest() %>%
filter(loc1 %in% v1) %>%
select(-loc1) %>%
unique()
Which gives,
ID name1
1 A GNG4
4 B CAPZA1
5 B ST7L
Where,
v1 <- c("5'UTR", "1stExon" ,"TSS1500")
df <- structure(list(ID = c("A", "B"), loc1 = c("Body;5'UTR;5'UTR;5'UTR",
"TSS1500;1stExon;1stExon;5'UTR;1stExon;5'UTR;5'UTR;5'UTR;1stExon"
), name1 = c("MIR5096;GNG4;GNG4;GNG4", "CAPZA1;ST7L;ST7L;ST7L;ST7L;ST7L;ST7L;ST7L;ST7L"
)), .Names = c("ID", "loc1", "name1"), row.names = c(NA, -2L), class = "data.frame")
To do this, we're going to try three simple stages.
Stage 1: row selection
First, we'll select rows in the dataframe where RefGene_Location contains all three phrases 5'UTR, 1stExon and TSS1500, but not the phrase Body. I'll suppose the dataframe you're working with is called `df`.
# Keep the rows whose RefGene_Location mentions every required region and
# never mentions "Body". Each test is a literal substring match evaluated per
# row; `%in%` would instead compare whole cell values and, wrapped in all(),
# collapse to a single TRUE/FALSE for the entire data frame.
required <- c("5'UTR", "1stExon", "TSS1500")
has_all <- Reduce(`&`, lapply(required, grepl, x = df$RefGene_Location, fixed = TRUE))
has_body <- grepl("Body", df$RefGene_Location, fixed = TRUE)
df <- df[has_all & !has_body, ]
Stage 2: gene names
Now we want to grab the gene names from the RefGene_Name column. I'm going to assume that the name always occurs in the text after the last ; in that column, and replace the RefGene_Name column with this entry.
# Split df$RefGene_Name into chunks separated by ";", and keep the last chunk.
# vapply() guarantees a character vector result; an entry that splits into no
# chunks (an empty string) becomes NA instead of turning the column into a list.
df$RefGene_Name <- vapply(
  strsplit(as.character(df$RefGene_Name), ";"),
  function(x) if (length(x) > 0) x[length(x)] else NA_character_,
  character(1)
)
Stage 3: get rid of excess columns
This is just so the data is in the same format as you gave in your question. There are other columns in your dataset we no longer need, like RefGene_Location, so we'll remove these.
# Keep the two columns "IlmnID" and "RefGene_Name"
df <- df[, c("IlmnID", "RefGene_Name")]

Resources