For loop with an index inside a string in R - r

I have a rather simple problem but somehow I cannot solve it.
So I have a dataset with a column cycle with rows cycle1, cycle2, cycle3. I want to replace e.g. the word cycle1 with just the number 1. How to somehow separate the index i from the string cycle?
for (i in 1:3){
data$cycle[data$cycle=="cyclei"]<-i
}

Replace "cycle" with the empty string and convert to numeric:
data <- data.frame(cycle = c("cycle2", "cycle1", "cycle3")) # sample input
transform(data, cycle = as.numeric(sub("cycle", "", cycle)))
giving:
cycle
1 2
2 1
3 3

Use gsub()
# load data
df <- data.frame( cycle = c( "cycle1", "cycle2", "cycle3" ), stringsAsFactors = FALSE )
# Identify the pattern and replace with nothing
# and cast the values as numeric
df$cycle <- as.numeric( gsub( pattern = "cycle", replacement = "", x = df$cycle ) )
# view results
df
# cycle
# 1 1
# 2 2
# 3 3
# end of script #

Related

R - split dataset by row position and save in different files

I have a huge dataset in which several mini dataset were merged. I want to split them in different dataframes and save them. The mini datasets are identified by a variable name (which always include the string "-gram") on a given row.
I have been trying to construct a for loop, but with no luck.
grams <- read.delim("grams.tsv", header=FALSE) #read dataset
index <- which(grepl("-gram", grams$V1), arr.ind=TRUE) # identify the row positions where each mini dataset starts
index[10] <- nrow(grams) # add the total number of rows as last variable of the vector
start <- c() # initialize vector
end <- c() # initialize vector
for (i in 1:length(index)-1) for ( k in 2:length(index)) {
start[i] <- index[i] # add value to the vector start
if (k != 10) { end[k-1] <- index[k]-1 } else { end[k-1] <- index[k] } # add value to the vector end
gram <- grams[start[i]:end[i],] #subset the dataset grams so that the split mini dataset has start and end that correspond to the index in the vector
write.csv(gram, file=paste0("grams_", i, ".csv"), row.names=FALSE) # save dataset
}
I get an error when I try to subset the dataset:
Error in start[i]:end[i] : argument of length 0
...and I do not understand why! Can anyone help me?
Thanks!
You can cumsum and split:
dat <- data.frame(V1 = c("foo", "bar", "quux-gram", "bar-gram", "something", "nothing"),
V2 = 1:6, stringsAsFactors = FALSE)
dat
# V1 V2
# 1 foo 1
# 2 bar 2
# 3 quux-gram 3
# 4 bar-gram 4
# 5 something 5
# 6 nothing 6
grepl("-gram$", dat$V1)
# [1] FALSE FALSE TRUE TRUE FALSE FALSE
cumsum(grepl("-gram$", dat$V1))
# [1] 0 0 1 2 2 2
spl_dat <- split(dat, cumsum(grepl("-gram$", dat$V1)))
spl_dat
# $`0`
# V1 V2
# 1 foo 1
# 2 bar 2
# $`1`
# V1 V2
# 3 quux-gram 3
# $`2`
# V1 V2
# 4 bar-gram 4
# 5 something 5
# 6 nothing 6
With that, you can write them to files with:
ign <- Map(write.csv, spl_dat, sprintf("gram-%03d.csv", seq_along(spl_dat)),
list(row.names=FALSE))
An option with group_split and endsWith
library(dplyr)
library(stringr)
dat %>%
group_split(grp = cumsum(endsWith(V1, '-gram')), keep = FALSE)

Using lapply with multiple function inputs without nesting

I have a data frame with four columns: two columns indicate participation in a sport, while the other two columns indicate whether the player passed each of their two fitness exams.
dat <- data.frame(SOCCER = sample(0:1, 10, replace = T),
BASEBALL = sample(0:1, 10, replace = T),
TEST_1_PASS = sample(0:1, 10, replace = T),
TEST_2_PASS = sample(0:1, 10, replace = T))
I would like to obtain a list containing contingency tables for each sport and exam. I know that I can accomplish this using the following code, which uses nested lapply statements, but this strikes me as inefficient. Can anyone propose a more elegant solution that doesn't use nesting?
results <- lapply(c("SOCCER", "BASEBALL"), function(x) {
lapply(c("TEST_1_PASS", "TEST_2_PASS"), function(y){
table(sport = dat[[x]], pass = dat[[y]])
})
})
Thanks as always!
The double lapply gets all pairwise combinations of the columns in each of the columns' vectors, like #Gregor wrote in a comment
I don't think your nesting is inefficient... you need to call table 4
times. It doesn't really matter if that's inside one loop/lapply over
4 items or 2 nested loops/lapplys with 2 items each.
But here is another way, with one of the loops in disguise as expand.grid.
cols <- expand.grid(x = c("SOCCER", "BASEBALL"),
y = c("TEST_1_PASS", "TEST_2_PASS"),
stringsAsFactors = FALSE)
Map(function(.x, .y)table(dat[[.x]], dat[[.y]]), cols$x, cols$y)
Consider re-formatting your data into long format (i.e., tidy data), merging the two data pieces of sport and exam, then run by (a rarely used member of apply family as object-oriented wrapper to tapply) for all subset combinations between the two returning a named header report of results:
# RESHAPE EACH DATA SECTION (SPORT AND EXAM) INTO LONG FORMAT
df_list <- lapply(list(c("SOCCER", "BASEBALL"), c("TEST_1_PASS", "TEST_2_PASS")), function(cols)
reshape(cbind(PLAYER = row.names(dat), dat[cols]),
varying = cols, v.names = "VALUE",
times = cols, timevar = "INDICATOR",
idvar = "PLAYER", ids = NULL,
new.row.names = 1:1E4, direction = "long")
)
# CROSS JOIN (ALL COMBINATION PAIRINGS)
final_df <- Reduce(function(x,y) merge(x, y, by="PLAYER", suffixes=c("_SPORT", "_EXAM")), df_list)
final_df
# RUN TABLES FOR EACH SUBSET COMBINATION
tables_list <- with(final_df, by(final_df, list(INDICATOR_SPORT, INDICATOR_EXAM), function(sub)
table(sport = sub$VALUE_SPORT, pass = sub$VALUE_EXAM)
)
)
Output
tables_list
# : BASEBALL
# : TEST_1_PASS
# pass
# sport 0 1
# 0 3 4
# 1 2 1
# ------------------------------------------------------------
# : SOCCER
# : TEST_1_PASS
# pass
# sport 0 1
# 0 2 0
# 1 3 5
# ------------------------------------------------------------
# : BASEBALL
# : TEST_2_PASS
# pass
# sport 0 1
# 0 3 4
# 1 1 2
# ------------------------------------------------------------
# : SOCCER
# : TEST_2_PASS
# pass
# sport 0 1
# 0 2 0
# 1 2 6
Online Demo

Counting the frequency of differing patterns in a character string

I currently have a string in R that looks like this:
a <- "BMMBMMMMBMMMBMMBBMMM"
First, I need to determine the frequency of different patterns of "M" that appear in the string.
In this example it would be:
MM = 2
MMM = 2
MMMM = 1
Secondly, I then need to designate a numerical value/score for each different pattern.
i.e:
MM = 1
MMM = 2
MMMM = 3
This would mean that the total value/score of M's in a would equal 9.
If anyone knows any script that would allow me to do this for multiple strings like this in a dataframe that would be great?
Thank you.
a <- "BMMBMMMMBMMMBMMBBMMM"
tbl <- table(strsplit(a, "B"), exclude="")
tbl
# MM MMM MMMM
# 2 2 1
score <- sum(tbl * 1:3)
score
# 9
You could also use the table function.
a_list<-unlist(strsplit(a,"B"))
a_list<-a_list[!a_list==""] #remove cases when 2 B are together
a_list<-table(a_list)
# MM MMM MMMM
# 2 2 1
Here's a solution that uses the dplyr package. First, I load the library and define my string.
library(dplyr)
a <- "BMMBMMMMBMMMBMMBBMMM"
Next, I define a function that counts the occurrences of character x in string y.
char_count <- function(x, y){
# Get runs of same character
tmp <- rle(strsplit(y, split = "")[[1]])
# Count runs of character stored in `x`
tmp <- data.frame(table(tmp$lengths[tmp$values == x]))
# Return strings and frequencies
tmp %>%
mutate(String = strrep(x, Var1)) %>%
select(String, Freq)
}
Then, I run the function.
# Run the function
res <- char_count("M", a)
# String Freq
# 1 M 2
# 2 MM 2
# 3 MMM 1
Finally, I define my value vector and calculate the total value of vector a.
# My value vector
value_vec <- c(M = 1, MM = 2, MMM = 3)
# Total `value` of vector `a`
sum(value_vec * res$Freq)
#[1] 9
It it's acceptable to skip the first step you could do:
nchar(gsub("(B+M)|(^M)","",a))
# [1] 9
First compute all diffrent patterns that appear in your sting :
a <- "BMMBMMMMBMMMBMMBBMMM"
chars = unlist(strsplit(a, ""))
pat = c()
for ( i in 1:length(chars)){
for (j in 1:(length(chars) - i+1)){ pat = c(pat, paste(chars[j:(j+i-1)], collapse = ""))}}
pat =sort(unique(pat))
pat[1:5] : [1] "B" "BB" "BBM" "BBMM" "BBMMM"
Next, count the occurence of each pattern :
counts = sapply(pat, function(w) length(gregexpr(w, a, fixed = TRUE)[[1]]))
Finally build a nice dataframe to summary everything up :
df = data.frame(counts = counts, num = 1:length(pat))
head(df, 10)
counts num
B 6 1
BB 1 2
BBM 1 3
BBMM 1 4
BBMMM 1 5
BM 5 6
BMM 5 7
BMMB 2 8
BMMBB 1 9
BMMBBM 1 10
library(stringr)
str_count(a, "MMMM")
gives 1
str_count(gsub("MMMM", "", a), "MMM") # now count how many times "MMM" occurs, but first delete the "MMMM"
gives 2
str_count(gsub("MMM", "", a), "MM") #now count how many times "MM" occurs, but first delete the "MMM"'s
gives 2

String manipulation in r where contents of interest are in a different order

I have dataframe where I am attempting to extract content of a column and then append it to the dataframe as a new column.
For example my dataframe looks like:
> head(df)
id event_params
1 {"type":"L","maximumangle":-87.618,"duration":25}
2 {"type":"L","maximumangle":1.62,"duration":25}
3 {"maximumangle":-29.661,"type":"L","duration":20}
I wish to extract the maximum angle, and then append this to the existing dataframe as a new column titled maximumangle. My initial thought was to use the grep function. However, since maximumangle does not appear in the same order in each row, this will not work.
What can I do to achieve what I want?
1) Parse the last column using fromJSON in the rjson package. This adds all the JSON data.
library(rjson)
L <- lapply(as.character(DF$event_params), fromJSON)
cbind(DF, do.call("rbind", lapply(L, as.data.frame, stringsAsFactors = FALSE)))
giving:
id event_params type maximumangle duration
1 1 {"type":"L","maximumangle":-87.618,"duration":25} L -87.618 25
2 2 {"type":"L","maximumangle":1.62,"duration":25} L 1.620 25
3 3 {"maximumangle":-29.661,"type":"L","duration":20} L -29.661 20
2) We can simplify this slightly if you really only need maximumangle:
maximumangle <- function(x) fromJSON(as.character(x))$maximumangle
transform(DF, maximumangle = sapply(DF$event_params, maximumangle, USE.NAMES = FALSE))
giving:
id event_params maximumangle
1 1 {"type":"L","maximumangle":-87.618,"duration":25} -87.618
2 2 {"type":"L","maximumangle":1.62,"duration":25} 1.620
3 3 {"maximumangle":-29.661,"type":"L","duration":20} -29.661
Note
We assumed that the input in reproducible form is given by:
Lines <- '
id event_params
1 {"type":"L","maximumangle":-87.618,"duration":25}
2 {"type":"L","maximumangle":1.62,"duration":25}
3 {"maximumangle":-29.661,"type":"L","duration":20}'
DF <- read.table(text = Lines, header = TRUE, as.is = TRUE)
1) We can use str_extract from stringr by using a regex lookaround to match the string 'maximumangle' followed by a quote (") and colon (:) and extract the pattern the follows it i.e. zero or more - (-*) followed by numbers with digits ([0-9.]+)
library(dplyr)
library(stringr)
df %>%
mutate(maximumangle = as.numeric(str_extract(event_params,
'(?<=maximumangle":)-*[0-9.]+')))
# id event_params maximumangle
#1 1 {"type":"L","maximumangle":-87.618,"duration":25} -87.618
#2 2 {"type":"L","maximumangle":1.62,"duration":25} 1.620
#3 3 {"maximumangle":-29.661,"type":"L","duration":20} -29.661
2) Or the same can be done with base R using regexpr/regmatches
df$maximumangle <- as.numeric(regmatches(df$event_params,
regexpr('(?<=maximumangle":)-*[0-9.]+', df$event_params, perl = TRUE)))
data
df <- structure(list(id = 1:3, event_params = c("{\"type\":\"L\",\"maximumangle\":-87.618,\"duration\":25}",
"{\"type\":\"L\",\"maximumangle\":1.62,\"duration\":25}", "{\"maximumangle\":-29.661,\"type\":\"L\",\"duration\":20}"
)), .Names = c("id", "event_params"), class = "data.frame", row.names = c(NA,
-3L))

Improvement and parallelization of a code that produces combinations of specified string characters

First of all let me try explain you what the following code is doing.
From a list like the above it takes the containing string "1MAKK" and tries to find the possible combinations of the positions of the specified characters at chars
Here is an example of the initial conformation
# Initial list
lst1 = list("P1"=list("1MAKK") )
chars = c("M","K")
classes = c("class.1","class.35")
# Get the P name
p_name = names(lst1[1])
# Get the string sequence
p_seq = unlist(lst1[[1]][1])
The classes list is nothing more than some labels corresponding to the chars list and just used for some naming.
Now the main code is getting these variables p_name , p_seq and produces a data frame with all possible combinations of the position combinations of the specified characters.
This is the code:
library(stringr) # str_locate
library(purrr) # map2
# Functions
move_one <- function(seq){
if(grepl("1" , seq))
seq = paste0(substring(seq,2),1)
else
seq
}
# Move the number one from the first to last position
seq = move_one(p_seq)
# Get the positions of each character in the string
pos = unlist( map2(
.f=function(a ,p) str_locate_all(p, a) ,
.x=chars ,
.y=seq),
recursive = F
)
# Check if there is a letter that didn't exist in the sequence and add zeros at the respective list item
for( x in 1:length(pos)){
ifelse(is.na(pos[[x]][1]) , pos[[x]] <- rbind(pos[[x]] , c(0,0)) , pos[[x]] <- pos[[x]] )
}
# Calculate all possible combinations and transpose the arrays inside the list
ind1 = pmap(
.f = function(x) lapply(1:nrow(pos[[x]]), combn, x=as.list(pos[[x]][,1])),
.l = list( 1:length(pos) )
)
ind1 = pmap(
.f = function(x) lapply(ind1[[x]], t) ,
.l = list( 1:length(ind1) )
)
# Add Zero at each first element
z = pmap(
.f = function(x) lapply(ind1[[x]][1] , rbind , 0 ) ,
.l = list( 1:length(ind1) )
)
# Merge the list with the zeros and the complete one
ind1 = map2(
.f = function(a,b) {a[1]<-b[1]; a},
.x = ind1,
.y = z)
# Create a vector for each letter combination
ind1 = pmap(
.f = function (x) unlist( lapply(ind1[[x]], function(i) do.call(paste, c(as.data.frame(i), sep = ':'))) ),
.l = list ( 1:length(ind1) )
)
# Get the position of the class.1
isClass1 = grep("class.1", classes)
# Check if the seq is the first one
isFirst = grepl("1",seq)
# Set only 1 and 0 in the vector of UNIMOD.1 if is the first peptide
ifelse(isFirst , ind1[[isClass1]] <- c("1","0") , ind1[[isClass1]] <- c("0") )
# expand.grid for all these vectors inside ind1
ind2 = expand.grid(ind1)
# Apply column names in ind2
colnames(ind2) = classes
# Add a column with the p_name and seq
ind3 = cbind( "p_name"=rep(p_name, nrow(ind2) ) , "seq"=rep( gsub('.{1}$','',seq) , nrow(ind2) ) , ind2 )
The result for that specific input will be
> ind3
p_name seq class.1 class.35
1 P1 MAKK 1 3
2 P1 MAKK 0 3
3 P1 MAKK 1 4
4 P1 MAKK 0 4
5 P1 MAKK 1 0
6 P1 MAKK 0 0
7 P1 MAKK 1 3:4
8 P1 MAKK 0 3:4
As you can see I tried to use lapply, map2,pmap methods and not for loops in order to make it faster and to give it a chance to run in more than one CPU core in the final version.
So somewhere here is where I need your help and your opinion.
The actual initial list of mine does not have only one string character but it looks like the following one, but with the difference that there are thousands of inner lists (Px where x = {1,2,3,4,...2000} and each Px could have around a hundred of sequences.
p_list = list( "P1" = list( c("1MAK","ERTD","FTRWDSE" )) , "P2" = list( c("1MERTDF","DFRGRSDFG","DFFF")) )
The first question and probably the easiest one to answer, is how can I run (apply) the above code in such a list.
And secondly how can I implement this to be calculated in parallel and use more than one of CPU cores from a server that has 24 of them in order to save some time.
P.S: The final result expected to be the combination of all the individual results (using rbind maybe), (like the one previously showed) into a data frame.
Any improvement, idea or suggestion is welcomed.
Thank you.
First part
This is basically the code I would use for one string. At the end, you get column-lists (which is nice is you know what it is).
library(purrr)
x <- "MAKK"
chars <- set_names(c("M", "K"), c("class.1", "class.35"))
get_0_and_all_combn <- function(x) {
map(seq_along(x), function(i) combn(x, i, simplify = FALSE)) %>%
unlist(recursive = FALSE) %>%
c(0L, .)
}
get_0_and_all_combn(3:4)
[[1]]
[1] 0
[[2]]
[1] 3
[[3]]
[1] 4
[[4]]
[1] 3 4
get_pos_combn <- function(x, chars) {
x.spl <- strsplit(x, "")[[1]]
map(chars, function(chr) {
which(x.spl == chr) %>%
get_0_and_all_combn()
}) %>%
expand.grid()
}
get_pos_combn(x, chars)
class.1 class.35
1 0 0
2 1 0
3 0 3
4 1 3
5 0 4
6 1 4
7 0 3, 4
8 1 3, 4
get_pos_combn_with_infos <- function(seq, chars, p_name) {
cbind.data.frame(p_name, seq, get_pos_combn(seq, chars))
}
get_pos_combn_with_infos(x, chars, p_name)
p_name seq class.1 class.35
1 P1 MAKK 0 0
2 P1 MAKK 1 0
3 P1 MAKK 0 3
4 P1 MAKK 1 3
5 P1 MAKK 0 4
6 P1 MAKK 1 4
7 P1 MAKK 0 3, 4
8 P1 MAKK 1 3, 4
So, now if you want me to finish my answer, I would need to know which are the chars and classes corresponding to your complete example
p_list = list( "P1" = list( c("1MAK","ERTD","FTRWDSE" )) ,
"P2" = list( c("1MERTDF","DFRGRSDFG","DFFF")) )
Also, are you sure you want to make "P1" and "P2" lists of length 1?

Resources