Select unique values - r

I need to change this function because it does not match gene names exactly. For example, if I want MAPK4, the function also matches MAPK41, AMAPK4, etc. The function must select only exact (whole-name) matches.
Function:
library(dplyr)
# Attempt from the question: for every row, collect the entries of
# `mutated_genes` that are found in that row's comma-separated `genes` string.
df2 <- df %>%
rowwise() %>%
# grepl() searches for each symbol anywhere inside the whole `genes` string
# (case-insensitively), so a symbol also matches longer symbols containing it.
mutate(mutated = paste(mutated_genes[unlist(
lapply(mutated_genes, function(x) grepl(x,genes, ignore.case = T)))], collapse=","),
# NOTE(review): gsub("", "", ...) replaces nothing with nothing, so this looks
# like a no-op -- confirm the intended pattern.
circuit_name = gsub("", "", circuit_name)) %>%
select(-genes) %>%
data.frame()
data:
df <-structure(list(circuit_name = c("hsa04010__117", "hsa04014__118" ), genes = c("MAP4K4,DUSP10*,DUSP10*,DUSP10*,DUSP10*,DUSP10*,DUSP10*,DUSP10*,DUSP10*,DUSP10*,DUSP10*,DUSP3*,DUSP3*,DUSP3*,DUSP3*,PPM1A,AKT3,AKT3,AKT3,ZAK,MAP3K12,MAP3K13,TRAF2,CASP3,IL1R1,IL1R1,TNFRSF1A,IL1A,IL1A,TNF,RAC1,RAC1,RAC1,RAC1,MAP2K7,MAPK8,MAPK8,MAPK8,MECOM,HSPA1A,HSPA1A,HSPA1A,HSPA1A,HSPA1A,HSPA1A,MAP4K3,MAPK8IP2,MAP4K1", "MAP4K4,DUSP10*,DUSP10*,DUSP10*,DUSP10*,DUSP10*")), class = "data.frame", row.names = c(NA, -2L))
mutated_genes <- c("MAP4K4", "MAP3K12","TRAF2", "CACNG3")
output:
circuit_name mutated
1 hsa04010__117 MAP4K4,TRAF2
2 hsa04014__118 MAP4K4

A base R approach would be to split genes on "," and return those strings which match mutated_genes.
# Split each comma-separated gene list and keep only the symbols that are
# exactly equal to an entry of `mutated_genes`. `%in%` compares whole strings,
# so "MAPK4" can no longer pick up "MAPK41" or "AMAPK4" the way a regex
# search for the symbols does.
df$mutated <- vapply(strsplit(df$genes, ","), function(x)
toString(x[x %in% mutated_genes]), character(1))
# Show the circuit name next to the matched genes (columns 1 and 3).
df[c(1, 3)]
# circuit_name mutated
#1 hsa04010__117 MAP4K4, MAP3K12, TRAF2
#2 hsa04014__118 MAP4K4

Please note that based on the mutated_genes vector, your expected output is missing MAP3K12 for hsa04010__117.
Here is a tidyverse possibility
# One row per gene symbol, keep the symbols listed in `mutated_genes`,
# then glue the survivors back together for every circuit.
df %>%
  separate_rows(genes) %>%
  filter(is.element(genes, mutated_genes)) %>%
  group_by(circuit_name) %>%
  summarise(mutated = paste(genes, collapse = ", "))
## A tibble: 2 x 2
# circuit_name mutated
# <chr> <chr>
#1 hsa04010__117 MAP4K4, MAP3K12, TRAF2
#2 hsa04014__118 MAP4K4
Explanation: We separate comma-separated entries into different rows, then select only those rows where genes %in% mutated_genes and summarise results per circuit_name by concatenating genes entries.
PS. Personally I'd recommend keeping the data in a tidy long format (i.e. don't concatenate entries with toString); that way you have one row per gene, which will make any post-processing of the data much more straightforward.

We can use str_extract
library(stringr)
# Join all symbols into one alternation and wrap it in word boundaries so that
# only complete gene symbols are extracted; without `\\b` the pattern would
# also pull "MAPK4" out of longer symbols such as "MAPK41" or "AMAPK4".
# NOTE(review): the symbols are used as regex fragments -- escape them first
# if any of them can contain regex metacharacters.
df$mutated <- vapply(str_extract_all(df$genes, paste0("\\b(?:",
paste(mutated_genes, collapse = "|"), ")\\b")), toString, character(1))

Related

R code to merge 2 data frames by whether values in the first "by" variable contain string values in the second "by" variable

I have 2 data frames: one with a list of medications, the other with a different but highly overlapping list of medications along with corresponding medication ID codes. I want to merge these two data frames to apply the medication codes to the first data frame's medication list. I have a lot of partial string matches, and I want to detect strings in a case-insensitive manner.
library(tidyverse)
library(stringr)
# Medication names that still need a code.
label <- c("0.4% Lidocaine Hydrochloride", "10% Dextrose", "Act Raloxifene")
# as.DataFrame() is not a base R / tidyverse function; data.frame() builds the
# one-column data frame with column `label`.
df1 <- data.frame(label)
# Dictionary of medication names and their ID codes.
label2 <- c("LIDOCAINE", "RALOXIFENE", "JANUMET", "ESOMEPRAZOLE", "METFORMIN")
# NOTE: numeric literals drop leading zeros (0003 is stored as 3); use strings
# such as "0003" if the zero padding is part of the code.
code <- c(0003, 0005, 0006, 0001, 0011)
# Name the dictionary column `label` directly instead of renaming afterwards.
df2 <- data.frame(label = label2, code = code)
I try to use str_detect from stringr package
# Attempted merge from the question. It stops inside str_detect() with the
# error quoted below: df1$label holds 3 strings while df2$label supplies 5
# patterns, and those two lengths cannot be recycled against each other.
merge_df <- merge(df1, df2,
by.x=c("label" = ifelse(str_detect(df1$label, regex(df2$label, ignore_case = T)),
df1$label, NA)),
by.y=c("label" = ifelse(str_detect(df1$label, regex(df2$label, ignore_case = T)),
df2$label, NA)),
ignore.case=T,all.x=T,all.y=T,
suffixes = c("_list", "_dict"),
nomatch=0)
And I get the error:
Error in str_detect():
! Can't recycle string (size 3) to match pattern (size 5).
An approach using left_join.
First, add a helper variable l_label to both data sets that contains the lower-cased label, split into single words with strsplit so that every word can be matched.
After joining and arranging by the y-labels, remove the duplicated entries and the helper column.
library(dplyr)
library(tidyr)
# Match every word of a df1 label against every word of a df2 label, ignoring
# case, and keep at most one dictionary entry per df1 label.
left_join(
  # One row per lower-cased word of each df1 label.
  df1 %>%
    mutate(l_label = strsplit(tolower(label), " ")) %>%
    unnest(l_label),
  # Same treatment for the dictionary, so that multi-word labels in df2 are
  # split into words as well instead of failing inside mutate().
  df2 %>%
    mutate(l_label = strsplit(tolower(label), " ")) %>%
    unnest(l_label),
  "l_label"
) %>%
  # Matched rows first (NA sorts last), then keep the first row per df1 label.
  arrange(label.y) %>%
  group_by(label.x) %>%
  filter(!duplicated(label.x)) %>%
  select(-l_label) %>%
  ungroup()
# A tibble: 3 × 3
label.x label.y code
<chr> <chr> <dbl>
1 0.4% Lidocaine Hydrochloride LIDOCAINE 3
2 Act Raloxifene RALOXIFENE 5
3 10% Dextrose NA NA
Data
df1 <- structure(list(label = c("0.4% Lidocaine Hydrochloride", "10% Dextrose",
"Act Raloxifene")), class = "data.frame", row.names = c(NA, -3L
))
df2 <- structure(list(label = c("LIDOCAINE", "RALOXIFENE", "JANUMET",
"ESOMEPRAZOLE", "METFORMIN"), code = c(3, 5, 6, 1, 11)),
class = "data.frame", row.names = c(NA,
-5L))

R Subsetting text from a comma separated column in a data-frame

I have a data.frame with a column that looks like that:
diagnosis
F.31.2,A.43.2,R.45.2,F.43.1
I want to split this column into two columns, one containing all the values that start with F and one containing all the other values, resulting in a df with two columns that looks like this:
F other
F.31.2,F.43.1 A.43.2,R.45.2
Thanks in advance
Try the following tidyverse approach. You can separate the rows at each "," and then create a group according to the pattern, in order to reshape to wide format and obtain the expected result:
library(dplyr)
library(tidyr)
# Data
df <- data.frame(diagnosis = "F.31.2,A.43.2,R.45.2,F.43.1",
                 stringsAsFactors = FALSE)
# Code
# One row per diagnosis code, label each code by whether it *starts with* "F"
# (an unanchored grepl("F", ...) would also catch an "F" later in the code),
# then collapse each group back into a single comma-separated cell.
new <- df %>%
  separate_rows(diagnosis, sep = ",") %>%
  mutate(Group = ifelse(startsWith(diagnosis, "F"), "F", "Other")) %>%
  pivot_wider(values_fn = toString, names_from = Group,
              values_from = diagnosis)
Output:
# A tibble: 1 x 2
F Other
<chr> <chr>
1 F.31.2, F.43.1 A.43.2, R.45.2
First, use strsplit at the commas. Then, using grep find indexes of F, and select/antiselect them by multiplying by 1 or -1 and paste them.
# Split the single diagnosis string into its individual codes
# (as.character() guards against the column having been read as a factor).
tmp <- strsplit(as.character(d$diagnosis), ",")[[1]]
# Positions of the codes that start with "F" (anchored, so an "F" further
# inside a code does not count).
f_idx <- grep("^F", tmp)
# Multiplying the positions by 1 selects them, by -1 drops them. With no "F"
# code at all the positions are empty and tmp[-integer(0)] would select
# nothing, so that case is handled explicitly: "F" is empty, "other" is all.
res <- lapply(c(1, -1), function(x) {
  picked <- if (length(f_idx) > 0) {
    tmp[f_idx * x]
  } else if (x > 0) {
    character(0)
  } else {
    tmp
  }
  paste(picked, collapse = ",")
})
res <- setNames(as.data.frame(res), c("F", "other"))
res
# F other
# 1 F.31.2,F.43.1 A.43.2,R.45.2
Data:
d <- setNames(read.table(text="F.31.2,A.43.2,R.45.2,F.43.1"), "diagnosis")

How to replace with only the part before the ":" in every row of a column in R

so in a dataset, I have a column named "Interventions", and each row looks like this:
row1: "Drug: Rituximab|Drug: Utomilumab|Drug: Avelumab|Drug: PF04518600"
row2: "Biological: alemtuzumab|Biological: donor lymphocytes|Drug: carmustine|Drug: cytarabine|Drug: etoposide|Drug: melphalan|Procedure: allogeneic bone marroow"
I want to extract only the intervention type, such as "Drug", "Biological" or "Procedure", and keep that in the column. Even better would be to keep each intervention type only once, instead of "Drug" four times as in the first row.
The expected output would look like this:
row1: "Drug"
row2: "Biological, Drug, Procedure"
I am just getting started with r, I have tidyverse installed and kinda used to playing with the %>%. If anyone can help me with this, much appreciated !
If we want to extract only the prefix part before the :
library(dplyr)
library(stringr)
library(tidyr)
library(purrr)
# Pull out every word that sits directly before a ":" (the intervention
# type), then keep each type once, alphabetically, as one string per row.
df1 %>%
  mutate(
    Interventions = str_extract_all(Interventions, "\\w+(?=:)") %>%
      map_chr(function(types) toString(sort(unique(types))))
  )
# Interventions
#1 Drug
#2 Biological, Drug, Procedure
Or another option is to separate the rows based on the delimiters, slice the alternate rows and paste together the sorted unique values in 'Interventions'
# Tag each original row, split on both ":" and "|" so that types and names
# alternate, keep the odd positions (the types) within every original row and
# collapse the distinct, sorted types back into one string per row.
df1 %>%
  mutate(rn = seq_len(n())) %>%
  separate_rows(Interventions, sep = "[:|]") %>%
  group_by(rn) %>%
  slice(which(seq_len(n()) %% 2 == 1)) %>%
  distinct() %>%
  summarise(
    Interventions = paste(sort(unique(Interventions)), collapse = ", ")
  ) %>%
  ungroup() %>%
  select(-rn)
# A tibble: 2 x 1
# Interventions
# <chr>
#1 Drug
#2 Biological, Drug, Procedure
data
df1 <- structure(list(Interventions = c("Drug: Rituximab|Drug: Utomilumab|Drug: Avelumab|Drug: PF04518600",
"Biological: alemtuzumab|Biological: donor lymphocytes|Drug: carmustine|Drug: cytarabine|Drug: etoposide|Drug: melphalan|Procedure: allogeneic bone marroow"
)), class = "data.frame", row.names = c(NA, -2L))
Not as concise, and the same logic as akrun's answer, but in base R:
# Create df:
df1 <- structure(list(Interventions = c("Drug: Rituximab|Drug: Utomilumab|Drug: Avelumab|Drug: PF04518600",
"Biological: alemtuzumab|Biological: donor lymphocytes|Drug: carmustine|Drug: cytarabine|Drug: etoposide|Drug: melphalan|Procedure: allogeneic bone marroow"
)), class = "data.frame", row.names = c(NA, -2L))
# Assign a row id vec (seq_len() stays empty for a zero-row data frame,
# whereas 1:nrow() would yield c(1, 0)):
df1$row_num <- seq_len(nrow(df1))
# Split string on | delim:
split_up <- strsplit(df1$Interventions, split = "[|]")
# Roll down the dataframe - one row per (row id, intervention type), with
# everything from the ":" onwards removed - and keep uniques.
# lengths() returns the number of pieces per row as an integer vector.
rolled_out <- unique(data.frame(row_num = rep(df1$row_num, lengths(split_up)),
Interventions = gsub("[:].*","", unlist(split_up))))
# Stack the dataframe (one comma-separated string of types per row id):
df2 <- aggregate(Interventions~row_num, rolled_out, paste0, collapse = ", ")
# Drop id vec:
df2$row_num <- NULL

How to delete rows in R, which contains the same "set" of data within a column

I have a dataframe with two columns, lift and skill.set. Each skill.set value is a string of skills separated by commas. For example, there might be one observation with the string "Excel, PowerPoint" and another with "PowerPoint, Excel". These two observations capture the same data, and I only want to keep one of them (the one with the highest lift). I can't think of an efficient way of doing this other than naively turning each string into a vector of comma-separated elements and then writing a for-loop that compares each skill.set value with every other skill.set value.
Example dataframe:
df = structure(list(lift = c(5.71421247789905, 4.65329289252856, 5.87820023244231,
21.1815668998877), skill.set = c("JavaScript,Microsoft.Excel..MS.Excel.,Microsoft.Word,Python,Microsoft.PowerPoint",
"Microsoft.PowerPoint,Microsoft.Word,Python,SQL,Microsoft.Excel..MS.Excel.",
"Microsoft.Excel..MS.Excel.,Microsoft.Word,Python,SQL,Microsoft.PowerPoint",
"Analytics...Text.Mining,Natural.Language.Processing,Python")), .Names = c("lift",
"skill.set"), row.names = 239:242, class = "data.frame")
Desired dataframe:
structure(list(lift = c(5.71421247789905, 5.87820023244231, 21.1815668998877
), skill.set = c("JavaScript,Microsoft.Excel..MS.Excel.,Microsoft.Word,Python,Microsoft.PowerPoint",
"Microsoft.Excel..MS.Excel.,Microsoft.Word,Python,SQL,Microsoft.PowerPoint",
"Analytics...Text.Mining,Natural.Language.Processing,Python")), .Names = c("lift",
"skill.set"), row.names = c(239L, 241L, 242L), class = "data.frame")
We can do the following:
# Build an order-independent key per row (skills sorted, then re-joined) and
# keep the first row seen for every key.
df[!duplicated(vapply(
  strsplit(df$skill.set, ","),
  function(skills) paste(sort(skills), collapse = ","),
  character(1)
)), ]
# lift
#239 5.714212
#240 4.653293
#242 21.181567
# skill.set
#239 JavaScript,Microsoft.Excel..MS.Excel.,Microsoft.Word,Python,Microsoft.PowerPoint
#240 Microsoft.PowerPoint,Microsoft.Word,Python,SQL,Microsoft.Excel..MS.Excel.
#242 Analytics...Text.Mining,Natural.Language.Processing,Python
Explanation: Split entries in df$skill.set on ",", then sort entries and concatenate; keep only non-duplicated entries.
Update
To only retain the row with the largest lift value we can use aggregate:
# Group the rows by an order-independent skill key (skills sorted and
# re-joined) and keep the maximum lift per key; setNames() then labels the
# two result columns with df's column names in reverse order.
setNames(aggregate(
lift ~ sapply(strsplit(skill.set, ","), function(x) paste0(sort(x), collapse = ",")),
df,
max), rev(names(df)))
#1 Analytics...Text.Mining,Natural.Language.Processing,Python
#2 JavaScript,Microsoft.Excel..MS.Excel.,Microsoft.PowerPoint,Microsoft.Word,Python
#3 Microsoft.Excel..MS.Excel.,Microsoft.PowerPoint,Microsoft.Word,Python,SQL
# lift
#1 21.181567
#2 5.714212
#3 5.878200
Here is another solution using dplyr and tidyr.
# One row per skill, number the skills of each original row alphabetically,
# spread them into one column per position and glue them back into a sorted
# key; finally keep the maximum lift per key.
# NOTE(review): rows are identified by `lift` here, so two rows with an
# identical lift would be mixed together -- confirm lift values are unique.
df %>%
  separate_rows(skill.set, sep = ",") %>%
  group_by(lift) %>%
  arrange(skill.set) %>%
  mutate(id = row_number()) %>%
  spread(id, skill.set) %>%
  # na.rm = TRUE: rows with fewer than five skills have NA in the trailing
  # position columns, which would otherwise be pasted in as literal "NA".
  unite(skill.set, 2:6, sep = ",", na.rm = TRUE) %>%
  group_by(skill.set) %>%
  summarise_at(vars(lift), max)
You decide which lift you want to keep by changing max to whatever you prefer. Also, change 2:6 based on the column numbers that were produced by the preceding spread.

Sum by aggregating complex paired names in R

In R, I'm trying to aggregate a dataframe based on unique IDs, BUT I need to use some kind of wild card value for the IDs. Meaning I have paired names like this:
lion_tiger
elephant_lion
tiger_lion
And I need the lion_tiger and tiger_lion IDs to be summed together, because the order in the pair does not matter.
Using this dataframe as an example:
df <- data.frame(pair = c("1_3","2_4","2_2","1_2","2_1","4_2","3_1","4_3","3_2"),
value = c("12","10","19","2","34","29","13","3","14"))
So the values for pair IDs, "1_2" and "2_1" need to be summed in a new table. That new row would then read:
1_2 36
Any suggestions? While my example has numbers as the pair IDs, in reality I would need this to work with text IDs (like the "lion_tiger" example above).
We can split the 'pair' column by _, then sort and paste it back, use it in a group by function to get the sum
# Build an order-independent key for every pair ("2_1" and "1_2" both become
# "1_2") and sum the values per key. The parts are sorted as text rather than
# converted with as.numeric(), so named pairs such as "tiger_lion" work too.
tapply(as.numeric(as.character(df$value)),
vapply(strsplit(as.character(df$pair), "_"), function(x)
paste(sort(x), collapse = "_"), character(1)), FUN = sum)
Or another option is gsubfn
library(gsubfn)
# Rewrite every "a_b" pair with its two parts in sorted order so that "2_1"
# and "1_2" share one key. The parts are captured as "anything but an
# underscore" and sorted as text, so non-numeric IDs such as "tiger_lion" are
# handled as well.
df$pair <- gsubfn("([^_]+)_([^_]+)", ~ paste(sort(c(x, y)), collapse = "_"),
as.character(df$pair))
# `value` was created as text/factor; convert it before summing.
df$value <- as.numeric(as.character(df$value))
# Sum the values per normalised pair.
aggregate(value ~ pair, df, sum)
Using tidyverse and purrrlyr
df <- data.frame(name = c("lion_tiger", "elephant_lion", "tiger_lion"),
                 value = c(1, 2, 3), stringsAsFactors = FALSE)
# library() stops with an error when a package is missing; require() would
# only return FALSE and let the script fail later with a less clear message.
library(tidyverse)
library(purrrlyr)
# Split each name into its two parts, rebuild it with the parts in sorted
# order (so "tiger_lion" and "lion_tiger" share one key) and sum per key.
df %>%
  separate(col = name, into = c("A", "B"), sep = "_") %>%
  by_row(.collate = "rows",
         ..f = function(this_row) {
           paste0(sort(c(this_row$A, this_row$B)), collapse = "_")
         }) %>%
  rename(sorted = ".out") %>%
  group_by(sorted) %>%
  summarize(sum(value)) %>%
  show()
## A tibble: 2 x 2
# sorted `sum(value)`
# <chr> <dbl>
#1 elephant_lion 2
#2 lion_tiger 4

Resources