R count matched strings

I am trying to count matched items between strings:
target_str = "a,b,c"
table1 = data.frame(name = c("p1","p2","p3","p4"),
                    str = c("a,b","a","d,e,f","a,a"))
Based on target_str, I want to count how many items match. My output table should look like this:
name matches
p1 2 #matches a and b
p2 1 #matches a
p3 0 #no matches
p4 1 #if has duplicate, count only once
I have about 1 million target_strs for which I need to calculate the matches, so speed is very important. I appreciate any suggestions. Thanks in advance!

target_str = "a,b,c"
split_str <- strsplit(target_str, split = ",")[[1]]
table1 = data.frame(name = c("p1","p2","p3","p4"),
                    str = c("a,b","a","d,e,f","a,a"))
data.frame(name = table1$name,
           matches = rowSums(sapply(split_str, grepl, x = table1$str)))
# name matches
# 1 p1 2
# 2 p2 1
# 3 p3 0
# 4 p4 1
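Since speed matters for around a million target strings, one hedged tweak (an assumption, not part of the original answer): passing fixed = TRUE to grepl treats each item literally and skips regex interpretation, which is usually a bit faster and avoids surprises if items ever contain regex metacharacters:
data.frame(name = table1$name,
           matches = rowSums(sapply(split_str, grepl, x = table1$str, fixed = TRUE)))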

This should be fairly fast:
# target string modified to be a character vector:
target_str <- unlist(strsplit(c("a,b,c"), split=","))
# separate each observation's strings:
stringList <- strsplit(as.character(table1$str), split=",")
# get counts of unique matches (so duplicates count only once), put into data.frame
table1$Counts <- sapply(stringList, function(i) sum(unique(i) %in% target_str))
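For the example data this should give:
table1
#   name   str Counts
# 1   p1   a,b      2
# 2   p2     a      1
# 3   p3 d,e,f      0
# 4   p4   a,a      1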

This cbinds the counts onto the first column, which is kept as a data frame via drop=FALSE. The counts come from rowSums over successive grepl tests, one per target item:
cbind(table1[, 1, drop = FALSE],
      counts = rowSums(sapply(scan(text = target_str, sep = ",", what = ""),
                              function(t) grepl(t, table1$str))))
Read 3 items
  name counts
1   p1      2
2   p2      1
3   p3      0
4   p4      1

Counting the frequency of differing patterns in a character string

I currently have a string in R that looks like this:
a <- "BMMBMMMMBMMMBMMBBMMM"
First, I need to determine the frequency of different patterns of "M" that appear in the string.
In this example it would be:
MM = 2
MMM = 2
MMMM = 1
Secondly, I then need to designate a numerical value/score for each different pattern.
i.e:
MM = 1
MMM = 2
MMMM = 3
This would mean that the total value/score of M's in a would equal 9.
If anyone knows a script that would allow me to do this for multiple strings like this in a data frame, that would be great.
Thank you.
a <- "BMMBMMMMBMMMBMMBBMMM"
tbl <- table(strsplit(a, "B"), exclude="")
tbl
# MM MMM MMMM
# 2 2 1
score <- sum(tbl * 1:3)
score
# 9
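If the scoring rule generalizes as "run length minus one" (which is what the hard-coded 1:3 encodes for MM/MMM/MMMM), a hedged variation avoids fixing the weights by hand:
# assumes the score of a run of n M's is n - 1
score <- sum(tbl * (nchar(names(tbl)) - 1))
score
# 9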
You could also use the table function.
a_list<-unlist(strsplit(a,"B"))
a_list<-a_list[!a_list==""] #remove cases when 2 B are together
a_list<-table(a_list)
# MM MMM MMMM
# 2 2 1
Here's a solution that uses the dplyr package. First, I load the library and define my string.
library(dplyr)
a <- "BMMBMMMMBMMMBMMBBMMM"
Next, I define a function that counts the occurrences of character x in string y.
char_count <- function(x, y){
  # Get runs of same character
  tmp <- rle(strsplit(y, split = "")[[1]])
  # Count runs of character stored in `x`
  tmp <- data.frame(table(tmp$lengths[tmp$values == x]))
  # Return strings and frequencies
  tmp %>%
    mutate(String = strrep(x, Var1)) %>%
    select(String, Freq)
}
Then, I run the function.
# Run the function
res <- char_count("M", a)
# String Freq
# 1 M 2
# 2 MM 2
# 3 MMM 1
Finally, I define my value vector and calculate the total value of vector a.
# My value vector
value_vec <- c(M = 1, MM = 2, MMM = 3)
# Total `value` of vector `a`
sum(value_vec * res$Freq)
#[1] 9
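The question also asks about doing this for multiple strings in a data frame. A minimal sketch of one way, assuming a hypothetical column df$strings and assuming the intended score of a run of n M's is n - 1 (which reproduces the 9 above):
# hypothetical input: one string per row
df <- data.frame(strings = c("BMMBMMMMBMMMBMMBBMMM", "BMMBMMM"),
                 stringsAsFactors = FALSE)
# score one string: sum of (run length - 1) over all runs of "M"
score_M_runs <- function(s) {
  r <- rle(strsplit(s, split = "")[[1]])
  sum(r$lengths[r$values == "M"] - 1)
}
df$score <- sapply(df$strings, score_M_runs)
df$score
# [1] 9 3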
If it's acceptable to skip the first step you could do:
nchar(gsub("(B+M)|(^M)","",a))
# [1] 9
This works because each run of B's is removed together with the first M that follows it, so every run of n M's leaves n - 1 characters behind, and the total number of remaining M's is exactly the score.
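Since gsub and nchar are vectorized, the same one-liner should also work directly on a column (reusing the hypothetical df$strings from the sketch above):
df$score2 <- nchar(gsub("(B+M)|(^M)", "", df$strings))
df$score2
# [1] 9 3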
First, compute all different patterns that appear in your string:
a <- "BMMBMMMMBMMMBMMBBMMM"
chars = unlist(strsplit(a, ""))
pat = c()
for (i in 1:length(chars)){
  for (j in 1:(length(chars) - i + 1)){ pat = c(pat, paste(chars[j:(j+i-1)], collapse = "")) }
}
pat = sort(unique(pat))
pat[1:5]
# [1] "B"     "BB"    "BBM"   "BBMM"  "BBMMM"
Next, count the occurrence of each pattern:
counts = sapply(pat, function(w) length(gregexpr(w, a, fixed = TRUE)[[1]]))
Finally, build a data frame to summarize everything:
df = data.frame(counts = counts, num = 1:length(pat))
head(df, 10)
counts num
B 6 1
BB 1 2
BBM 1 3
BBMM 1 4
BBMMM 1 5
BM 5 6
BMM 5 7
BMMB 2 8
BMMBB 1 9
BMMBBM 1 10
library(stringr)
str_count(a, "MMMM")
# gives 1
str_count(gsub("MMMM", "", a), "MMM") # now count how many times "MMM" occurs, but first delete the "MMMM"
# gives 2
str_count(gsub("MMM", "", a), "MM")   # now count how many times "MM" occurs, but first delete the "MMM"s
# gives 2
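Combining the three counts with the question's values should reproduce the total score:
n4 <- str_count(a, "MMMM")
n3 <- str_count(gsub("MMMM", "", a), "MMM")
n2 <- str_count(gsub("MMM", "", a), "MM")
3 * n4 + 2 * n3 + 1 * n2
# [1] 9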

String manipulation in r where contents of interest are in a different order

I have a data frame where I am attempting to extract the content of a column and then append it to the data frame as a new column.
For example my dataframe looks like:
> head(df)
id event_params
1 {"type":"L","maximumangle":-87.618,"duration":25}
2 {"type":"L","maximumangle":1.62,"duration":25}
3 {"maximumangle":-29.661,"type":"L","duration":20}
I wish to extract the maximum angle, and then append this to the existing dataframe as a new column titled maximumangle. My initial thought was to use the grep function. However, since maximumangle does not appear in the same order in each row, this will not work.
What can I do to achieve what I want?
1) Parse the last column using fromJSON in the rjson package. This adds all the JSON data.
library(rjson)
L <- lapply(as.character(DF$event_params), fromJSON)
cbind(DF, do.call("rbind", lapply(L, as.data.frame, stringsAsFactors = FALSE)))
giving:
id event_params type maximumangle duration
1 1 {"type":"L","maximumangle":-87.618,"duration":25} L -87.618 25
2 2 {"type":"L","maximumangle":1.62,"duration":25} L 1.620 25
3 3 {"maximumangle":-29.661,"type":"L","duration":20} L -29.661 20
2) We can simplify this slightly if you really only need maximumangle:
maximumangle <- function(x) fromJSON(as.character(x))$maximumangle
transform(DF, maximumangle = sapply(DF$event_params, maximumangle, USE.NAMES = FALSE))
giving:
id event_params maximumangle
1 1 {"type":"L","maximumangle":-87.618,"duration":25} -87.618
2 2 {"type":"L","maximumangle":1.62,"duration":25} 1.620
3 3 {"maximumangle":-29.661,"type":"L","duration":20} -29.661
Note
We assumed that the input in reproducible form is given by:
Lines <- '
id event_params
1 {"type":"L","maximumangle":-87.618,"duration":25}
2 {"type":"L","maximumangle":1.62,"duration":25}
3 {"maximumangle":-29.661,"type":"L","duration":20}'
DF <- read.table(text = Lines, header = TRUE, as.is = TRUE)
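As a hedged alternative (an assumption about another package, not part of the original answer), jsonlite could be used in place of rjson; its fromJSON also parses one JSON object per row:
library(jsonlite)
# parse each row's JSON and pull out just the maximumangle field
DF$maximumangle <- vapply(DF$event_params,
                          function(s) jsonlite::fromJSON(s)$maximumangle,
                          numeric(1), USE.NAMES = FALSE)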
1) We can use str_extract from stringr with a regex lookaround: match the string 'maximumangle' followed by a quote (") and a colon (:), then extract the pattern that follows it, i.e. zero or more - (-*) followed by digits and dots ([0-9.]+).
library(dplyr)
library(stringr)
df %>%
  mutate(maximumangle = as.numeric(str_extract(event_params,
                                               '(?<=maximumangle":)-*[0-9.]+')))
# id event_params maximumangle
#1 1 {"type":"L","maximumangle":-87.618,"duration":25} -87.618
#2 2 {"type":"L","maximumangle":1.62,"duration":25} 1.620
#3 3 {"maximumangle":-29.661,"type":"L","duration":20} -29.661
2) Or the same can be done with base R using regexpr/regmatches
df$maximumangle <- as.numeric(regmatches(df$event_params,
regexpr('(?<=maximumangle":)-*[0-9.]+', df$event_params, perl = TRUE)))
data
df <- structure(list(id = 1:3, event_params = c("{\"type\":\"L\",\"maximumangle\":-87.618,\"duration\":25}",
"{\"type\":\"L\",\"maximumangle\":1.62,\"duration\":25}", "{\"maximumangle\":-29.661,\"type\":\"L\",\"duration\":20}"
)), .Names = c("id", "event_params"), class = "data.frame", row.names = c(NA,
-3L))

Improvement and parallelization of a code that produces combinations of specified string characters

First of all, let me try to explain what the following code does.
From a list like the one below, it takes the contained string "1MAKK" and finds all possible combinations of the positions of the characters specified in chars.
Here is an example of the initial configuration:
# Initial list
lst1 = list("P1"=list("1MAKK") )
chars = c("M","K")
classes = c("class.1","class.35")
# Get the P name
p_name = names(lst1[1])
# Get the string sequence
p_seq = unlist(lst1[[1]][1])
The classes vector is nothing more than a set of labels corresponding to the chars vector, used only for naming.
The main code takes the variables p_name and p_seq and produces a data frame with all possible combinations of the positions of the specified characters.
This is the code:
library(stringr) # str_locate
library(purrr) # map2
# Functions
move_one <- function(seq){
if(grepl("1" , seq))
seq = paste0(substring(seq,2),1)
else
seq
}
# Move the number one from the first to last position
seq = move_one(p_seq)
# Get the positions of each character in the string
pos = unlist( map2(
.f=function(a ,p) str_locate_all(p, a) ,
.x=chars ,
.y=seq),
recursive = F
)
# Check if there is a letter that didn't exist in the sequence and add zeros at the respective list item
for( x in 1:length(pos)){
ifelse(is.na(pos[[x]][1]) , pos[[x]] <- rbind(pos[[x]] , c(0,0)) , pos[[x]] <- pos[[x]] )
}
# Calculate all possible combinations and transpose the arrays inside the list
ind1 = pmap(
.f = function(x) lapply(1:nrow(pos[[x]]), combn, x=as.list(pos[[x]][,1])),
.l = list( 1:length(pos) )
)
ind1 = pmap(
.f = function(x) lapply(ind1[[x]], t) ,
.l = list( 1:length(ind1) )
)
# Add Zero at each first element
z = pmap(
.f = function(x) lapply(ind1[[x]][1] , rbind , 0 ) ,
.l = list( 1:length(ind1) )
)
# Merge the list with the zeros and the complete one
ind1 = map2(
.f = function(a,b) {a[1]<-b[1]; a},
.x = ind1,
.y = z)
# Create a vector for each letter combination
ind1 = pmap(
.f = function (x) unlist( lapply(ind1[[x]], function(i) do.call(paste, c(as.data.frame(i), sep = ':'))) ),
.l = list ( 1:length(ind1) )
)
# Get the position of the class.1
isClass1 = grep("class.1", classes)
# Check if the seq is the first one
isFirst = grepl("1",seq)
# Set only 1 and 0 in the vector of UNIMOD.1 if is the first peptide
ifelse(isFirst , ind1[[isClass1]] <- c("1","0") , ind1[[isClass1]] <- c("0") )
# expand.grid for all these vectors inside ind1
ind2 = expand.grid(ind1)
# Apply column names in ind2
colnames(ind2) = classes
# Add a column with the p_name and seq
ind3 = cbind( "p_name"=rep(p_name, nrow(ind2) ) , "seq"=rep( gsub('.{1}$','',seq) , nrow(ind2) ) , ind2 )
The result for that specific input will be
> ind3
p_name seq class.1 class.35
1 P1 MAKK 1 3
2 P1 MAKK 0 3
3 P1 MAKK 1 4
4 P1 MAKK 0 4
5 P1 MAKK 1 0
6 P1 MAKK 0 0
7 P1 MAKK 1 3:4
8 P1 MAKK 0 3:4
As you can see, I tried to use lapply, map2, and pmap instead of for loops, in order to make it faster and to give it a chance to run on more than one CPU core in the final version.
So somewhere here is where I need your help and your opinion.
My actual initial list does not contain only one string per element; it looks like the following one, with the difference that there are thousands of inner lists (Px, where x = {1,2,3,4,...,2000}), and each Px could have around a hundred sequences.
p_list = list( "P1" = list( c("1MAK","ERTD","FTRWDSE" )) , "P2" = list( c("1MERTDF","DFRGRSDFG","DFFF")) )
The first question, and probably the easiest one to answer, is how I can run (apply) the above code on such a list.
Secondly, how can I implement this so that it runs in parallel, using more than one CPU core of a server that has 24 of them, in order to save some time?
P.S.: The final result is expected to be the combination of all the individual results (using rbind maybe), like the one previously shown, into a data frame.
Any improvement, idea or suggestion is welcomed.
Thank you.
First part
This is basically the code I would use for one string. At the end, you get list-columns (which is nice if you know what they are).
library(purrr)
x <- "MAKK"
chars <- set_names(c("M", "K"), c("class.1", "class.35"))
get_0_and_all_combn <- function(x) {
map(seq_along(x), function(i) combn(x, i, simplify = FALSE)) %>%
unlist(recursive = FALSE) %>%
c(0L, .)
}
get_0_and_all_combn(3:4)
[[1]]
[1] 0
[[2]]
[1] 3
[[3]]
[1] 4
[[4]]
[1] 3 4
get_pos_combn <- function(x, chars) {
x.spl <- strsplit(x, "")[[1]]
map(chars, function(chr) {
which(x.spl == chr) %>%
get_0_and_all_combn()
}) %>%
expand.grid()
}
get_pos_combn(x, chars)
class.1 class.35
1 0 0
2 1 0
3 0 3
4 1 3
5 0 4
6 1 4
7 0 3, 4
8 1 3, 4
get_pos_combn_with_infos <- function(seq, chars, p_name) {
cbind.data.frame(p_name, seq, get_pos_combn(seq, chars))
}
get_pos_combn_with_infos(x, chars, p_name)
p_name seq class.1 class.35
1 P1 MAKK 0 0
2 P1 MAKK 1 0
3 P1 MAKK 0 3
4 P1 MAKK 1 3
5 P1 MAKK 0 4
6 P1 MAKK 1 4
7 P1 MAKK 0 3, 4
8 P1 MAKK 1 3, 4
So now, if you want me to finish my answer, I would need to know the chars and classes corresponding to your complete example:
p_list = list( "P1" = list( c("1MAK","ERTD","FTRWDSE" )) ,
"P2" = list( c("1MERTDF","DFRGRSDFG","DFFF")) )
Also, are you sure you want to make "P1" and "P2" lists of length 1?
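For the parallelization part of the question, here is a minimal sketch of the general pattern only: it assumes the per-sequence work is wrapped in one function (the get_pos_combn_with_infos() helper above) and it ignores the special handling of the leading "1". parallel::mclapply is fork-based, so it runs on Linux/macOS but not on Windows:
library(parallel)
# flatten the nested list into one (p_name, seq) pair per task
tasks <- do.call(rbind, lapply(names(p_list), function(nm)
  data.frame(p_name = nm, seq = unlist(p_list[[nm]]), stringsAsFactors = FALSE)))
# run one task per core slot, then row-bind the small data frames
res_list <- mclapply(seq_len(nrow(tasks)), function(i)
  get_pos_combn_with_infos(tasks$seq[i], chars, tasks$p_name[i]),
  mc.cores = 4)
# combining may need care because some columns can be list-columns
final <- do.call(rbind, res_list)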

How can I insert values into a data frame dynamically using R

After scraping some review data from a website, I am having difficulty organizing the data into a useful structure for analysis. The problem is that the data is dynamic, in that each reviewer gave ratings on anywhere between 0 and 3 subcategories (denoted as subcategories "a", "b" and "c"). I would like to organize the reviews so that each row is a different reviewer, and each column is a subcategory that was rated. Where reviewers chose not to rate a subcategory, I would like that missing data to be 'NA'. Here is a simplified sample of the data:
vec <- c("a","b","c","stop", "a","b","stop", "stop", "c","stop")
ratings <- c(2,5,1, 1,3, 2)
vec contains the subcategories that were scored, and "stop" marks the end of each reviewer's ratings. As such, I would like to organize the result into a data frame with this structure (expected output):
   a  b  c
1  2  5  1
2  1  3 NA
3 NA NA NA
4 NA NA  2
I would greatly appreciate any help on this, because I've been working on this issue for far longer than it should take me.
@alexis_laz provided what I believe is the best answer:
vec <- c("a","b","c","stop", "a","b","stop", "stop", "c","stop")
ratings <- c(2,5,1, 1,3, 2)
stops <- vec == "stop"
i = cumsum(stops)[!stops] + 1L
j = vec[!stops]
tapply(ratings, list(factor(i, 1:max(i)), factor(j)), identity) # although mean/sum work
# a b c
#[1,] 2 5 1
#[2,] 1 3 NA
#[3,] NA NA NA
#[4,] NA NA 2
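If a data frame rather than a matrix is wanted, the tapply result should convert directly; a small follow-up sketch:
out <- as.data.frame(tapply(ratings, list(factor(i, 1:max(i)), factor(j)), identity))
out
#    a  b  c
# 1  2  5  1
# 2  1  3 NA
# 3 NA NA NA
# 4 NA NA  2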
base R, but I'm using a for loop...
vec <- c("a","b","c","stop", "a","b","stop", "stop", "c","stop")
ratings <- c(2,5,1, 1,3, 2)
categories <- unique(vec)[unique(vec)!="stop"]
row = 1
df = data.frame(lapply(categories, function(x) NA_integer_))
colnames(df) <- categories
rating = 1
for (i in vec) {
  if (i == 'stop') {
    row <- row + 1
  } else {
    df[row, i] <- ratings[[rating]]
    rating <- rating + 1
  }
}
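For the example data, the loop should fill df like this:
df
#    a  b  c
# 1  2  5  1
# 2  1  3 NA
# 3 NA NA NA
# 4 NA NA  2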
Here is one option
library(data.table)
library(reshape2)
d1 <- as.data.table(melt(split(vec, c(1, head(cumsum(vec == "stop")+1,
-1)))))[value != 'stop', ratings := ratings
][value != 'stop'][, value := as.character(value)][, L1 := as.integer(L1)]
dcast( d1[CJ(value = value, L1 = seq_len(max(L1)), unique = TRUE), on = .(value, L1)],
L1 ~value, value.var = 'ratings')[, L1 := NULL][]
# a b c
#1: 2 5 1
#2: 1 3 NA
#3: NA NA NA
#4: NA NA 2
Using base R functions and rbind.fill from plyr or rbindlist from data.table to produce the final object, we can do
# convert vec into a list, split by "stop", dropping final element
temp <- head(strsplit(readLines(textConnection(paste(gsub("stop", "\n", vec, fixed=TRUE),
collapse=" "))), split=" "), -1)
# remove empty strings, but maintain empty list elements
temp <- lapply(temp, function(x) x[nchar(x) > 0])
# match up appropriate names to the individual elements in the list with setNames
# convert vectors to single row data.frames
temp <- Map(function(x, y) setNames(as.data.frame.list(x), y),
relist(ratings, skeleton = temp), temp)
# add silly data.frame (single row, single column) for any empty data.frames in list
temp <- lapply(temp, function(x) if(nrow(x) > 0) x else setNames(data.frame(NA), vec[1]))
Now, you can produce the single data.frame (data.table) with either plyr or data.table
# with plyr, returns data.frame
library(plyr)
do.call(rbind.fill, temp)
a b c
1 2 5 1
2 1 3 NA
3 NA NA NA
4 NA NA 2
# with data.table, returns data.table
rbindlist(temp, fill=TRUE)
a b c
1: 2 5 1
2: 1 3 NA
3: NA NA NA
4: NA NA 2
Note that the line prior to the rbinding can be replaced with
temp[lengths(temp) == 0] <- replicate(sum(lengths(temp) == 0),
setNames(data.frame(NA), vec[1]), simplify=FALSE)
where the list items that are empty data frames are replaced using subsetting instead of an lapply over the entire list.

Adding a new column to each element in a list of tables or data frames

I have a list of files. I also have a list of "names" which I substr() from the actual filenames of these files. I would like to add a new column to each of the files in the list. This column will contain the corresponding element of "names", repeated as many times as there are rows in the file.
For example:
df1 <- data.frame(x = 1:3, y=letters[1:3])
df2 <- data.frame(x = 4:6, y=letters[4:6])
filelist <- list(df1,df2)
ID <- c("1A","IB")
Pseudocode
for( i in length(filelist)){
filelist[i]$SampleID <- rep(ID[i],nrow(filelist[i])
}
// basically, create a new column in each of the data frames in filelist, and fill the column with the repeated corresponding value of ID
my output should be like:
filelist[1] should be:
x y SampleID
1 1 a 1A
2 2 b 1A
3 3 c 1A
filelist[2]
x y SampleID
1 4 d IB
2 5 e IB
3 6 f IB
and so on.....
Any idea how this could be done?
An alternative solution is to use cbind, taking advantage of the fact that R will recycle the values of a shorter vector.
For Example
x <- df2 # from above
cbind(x, NewColumn="Singleton")
# x y NewColumn
# 1 4 d Singleton
# 2 5 e Singleton
# 3 6 f Singleton
There is no need to use rep; R does that for you.
Therefore, you could put cbind(filelist[[i]], ID[[i]]) in your for loop or, as @Sven pointed out, you can use the cleaner mapply:
filelist <- mapply(cbind, filelist, "SampleID"=ID, SIMPLIFY=F)
This is a corrected version of your loop:
for( i in seq_along(filelist)){
filelist[[i]]$SampleID <- rep(ID[i],nrow(filelist[[i]]))
}
There were 3 problems:
A final ) was missing after the command in the body.
Elements of lists are accessed by [[, not by [. [ returns a list of length one. [[ returns the element only.
length(filelist) is just one value, so the loop runs for the last element of the list only. I replaced it with seq_along(filelist).
A more efficient approach is to use mapply for the task:
mapply(function(x, y) "[<-"(x, "SampleID", value = y) ,
filelist, ID, SIMPLIFY = FALSE)
This one worked for me. It creates a new column for every data frame in a list and fills the values of the new column based on an existing column (in your case, the IDs).
Example:
# Create dummy data
df1<-data.frame(a = c(1,2,3))
df2<-data.frame(a = c(5,6,7))
# Create a list
l<-list(df1, df2)
> l
[[1]]
a
1 1
2 2
3 3
[[2]]
a
1 5
2 6
3 7
# add new column 'b'
# create 'b' values based on column 'a'
l2<-lapply(l, function(x)
cbind(x, b = x$a*4))
Results in:
> l2
[[1]]
a b
1 1 4
2 2 8
3 3 12
[[2]]
a b
1 5 20
2 6 24
3 7 28
In your case something like:
filelist<-lapply(filelist, function(x)
cbind(x, b = x$SampleID))
The purrr way, using map2
library(dplyr)
library(purrr)
map2(filelist, ID, ~cbind(.x, SampleID = .y))
#[[1]]
# x y SampleID
#1 1 a 1A
#2 2 b 1A
#3 3 c 1A
#[[2]]
# x y SampleID
#1 4 d IB
#2 5 e IB
#3 6 f IB
Or you can also use
map2(filelist, ID, ~.x %>% mutate(SampleId = .y))
If you name the list, we can use imap and add the new column based on its name.
names(filelist) <- c("1A","IB")
imap(filelist, ~cbind(.x, SampleID = .y))
#OR
#imap(filelist, ~.x %>% mutate(SampleId = .y))
which is similar to using Map
Map(cbind, filelist, SampleID = names(filelist))
A tricky way:
library(plyr)
names(filelist) <- ID
result <- ldply(filelist, data.frame)
Another option is to define a small helper function and use Map over a named list:
data_lst <- list(
data_1 = data.frame(c1 = 1:3, c2 = 3:1),
data_2 = data.frame(c1 = 1:3, c2 = 3:1)
)
f <- function (data, name){
data$name <- name
data
}
Map(f, data_lst , names(data_lst))
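For the example input, this should return the named list with the new column filled in:
# $data_1
#   c1 c2   name
# 1  1  3 data_1
# 2  2  2 data_1
# 3  3  1 data_1
#
# $data_2
#   c1 c2   name
# 1  1  3 data_2
# 2  2  2 data_2
# 3  3  1 data_2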
