If a row matches a criteria do paste in R - r

Let's imagine you have a dataframe with two columns ID and POSITION. I want to paste some text depending on the ID value.
I want to paste the ID value with GK0000 (when ID>10) or GK00000 (when ID<10) along with .2:, POSITION value, .. and the following POSITION value (POSITION+1)
For example if ID = 1 and POSITION = 10, the result would be GK000001.2:10..11 and if ID = 10 and POSITION = 10, the result would be GK000010.2:10..11
In Excel I can do this being A as ID and B as POSITION using =IF(A2<10,CONCATENATE("GK00000",A2,".2:",B2,"..",B2+1),CONCATENATE("GK0000",A2,".2:",B2,"..",B2+1)) but I want to add it to my R script line.
I give you a short example of my input data just ilustrative
ID <- c(1,5,9,10,12)
POSITION <- c(10,50,90,100,120)
df <- cbind(ID,POSITION)
and the result I'm expecting is
CONCAT <- c("GK000001.2:10..11","GK000005.2:50..51","GK000009.2:90..91",
"GK000010.2:100..101","GK000012.2:120..121")
dfResult <- cbind(ID,POSITION,CONCAT)

I believe the question asks for a string format given two arguments, A and B and a number of digits.
concat <- function(A, B, digits = 6){
fmt <- paste0("%0", digits, "d")
fmt <- paste0("GK", fmt, ".2:%d..%d")
sprintf(fmt, A, B, B + 1)
}
concat(df[, 'ID'], df[, 'POSITION'], 6)
# [1] "GK000001.2:10..11" "GK000002.2:20..21" "GK000003.2:30..31"
# [4] "GK000004.2:40..41" "GK000005.2:50..51" "GK000006.2:60..61"
# [7] "GK000007.2:70..71" "GK000008.2:80..81" "GK000009.2:90..91"
#[10] "GK000010.2:100..101" "GK000011.2:110..111" "GK000012.2:120..121"

Related

Change the value of a low frequency column to a desired value

In my data below, I want to replace any value in a column (excluding the first column) that occurs less than two times (ex. 'greek' in column L1, and 'german' in column L2) to "others".
I have tried the following, but don't get the desired output. Is there a short and efficient way to do this in R?
data <- data.frame(study=c('a','a','b','c','c','d'),
L1= c('arabic','turkish','greek','arabic','turkish','turkish'),
L2= c(rep('english',5),'german'))
# I tried the following without success:
dd[-1] <- lapply(names(dd)[-1], function(i) ifelse(table(dd[[i]]) < 2,"others",dd[[i]]))
forcats has specific function for this:
dd = data
dd[-1] = lapply(dd[-1], forcats::fct_lump_min, min = 2, other_level = "others")
dd
# study L1 L2
# 1 a arabic english
# 2 a turkish english
# 3 b others english
# 4 c arabic english
# 5 c turkish english
# 6 d turkish others
Your approach fails because ifelse() returns a vector the same length as the test, which in your case is the table, but the way you are using it you are assigning to the whole column so it needs to return something the same length as the whole column.
We can fix it like this:
dd[-1] <- lapply(names(dd)[-1], function(i) {
tt = table(dd[[i]])
drop = names(tt)[tt <= 2]
ifelse(dd[[i]] %in% drop, "others", dd[[i]])
})

Split string according to ambiguous delimiter in R

I have a pairs of strings included in a data frame:
df <- data.frame(str = c("L_V1_ROI-L_MST_ROI",
"L_V6_ROI-L_V2_ROI",
"L_V3_ROI-L_V4_ROI",
"L_V8_ROI-L_4_ROI",
"L_p9-46v_ROI-L_a9-46v_ROI"))
Each pair is separated by - symbol with the exception of the last pair which contains three - symbols and should be separated into substrings L_p9-46v_ROI and L_a9-46v_ROI.
A task is to split these pairs into substrings according to the separator. To do this I simply use:
library(tidyr)
df %>% separate(data = df, col = str, into = c("str1", "str2"), sep = "-")
which gives the following result:
str1 str2
1 L_V1_ROI L_MST_ROI
2 L_V6_ROI L_V2_ROI
3 L_V3_ROI L_V4_ROI
4 L_V8_ROI L_4_ROI
5 L_p9 46v_ROI
Warning message:
Too many values at 1 locations: 5
As expected, the problem lies in the 5th pair which has more than one - symbol.
Question: what is the regex to match the proper separator?
My partial solution is pasted below, but I hope that there should be more intelligent solution.
my_split <- function(string, pattern) {
## Match start end end position of the "_ROI-"
position <- str_locate(string = string, pattern = pattern)
start <- position[1]
end <- position[2]
## Extract substrings
substring1 <- substr(my_str, 1, start + 3)
substring2 <- substr(my_str, end + 1, nchar(string))
return(list(substring1, substring2))
}
## Toy example
my_str <- "L_p9-46v_ROI-L_a9-46v_ROI"
my_split(string = my_str, pattern = "_ROI-")
[[1]]
[1] "L_p9-46v_ROI"
[[2]]
[1] "L_a9-46v_ROI"

Extract only values with a decimal point in between from strings

I have a dataframe with strings such as:
id <- c(1,2)
x <- c("...14.....5.......................395.00.........................14.........1..",
"......114.99....................124.99................")
df <- data.frame(id,x)
df$x <- as.character(df$x)
How can I extract only values with a decimal point in between such as 395.00, 114.99 and 124.99 and not 14, 5, or 1 for each row, and put them in a new column separated by a comma?
The ideal result would be:
id x2
1 395.00
2 114.99,124.99
The amount of periods separating the values are random.
library(stringr)
df$x2 = str_extract_all(df$x, "[0-9]+\\.[0-9]+")
df[c(1, 3)]
# id x2
# 1 1 395.00
# 2 2 114.99, 124.99
Explanation: [0-9]+ matches one or more numbers, \\. matches a single decimal point. str_extract_all extracts all matches.
The new column is a list column, not a string with an inserted comma. This allows you access to the individual elements, if needed:
df$x2[2]
# [[1]]
# [1] "114.99" "124.99"
If you prefer a character vector as the column, do this:
df$x3 = sapply(str_extract_all(df$x, "[0-9]+\\.[0-9]+"), paste, collapse = ",")
df$x3[2]
#[1] "114.99,124.99"

r - how to use a variable in a variable

I have a data frame where the row names are words and I can call the first column of that row of data drame using something like
>df['rowB',1]
i know I can use paste to combine a variable and a string using paste to do something like
>paste("the value is ", df['rowB',1], "."]
and that will get me an output of the string with the value of the variable. what if rowname is a variable that equals 'rowB? I tried to do a first paste to put in the paste above, but the result of the first paste doesn't evaulate to the value, but rather is just a string that says
>rowname<-'rowB'
>type<-paste("relatype[\'", rowname, "\',1]", sep="")
'df['rowB',1]'
long story short, I want to input a value called 'rowname' as a parameter of a function and have it be evaluated for the value of rowname, so I can then put that value into a string within that same function.
I'm also open to a wholly different solution. any and all suggestions are welcome.
thanks
Not sure what the problem might be, not entirely clear from your description, but if rowname is a variable, you don't need anything special, because it will evaluate to it's value anyway. Let
mat <- matrix(1:10, nrow = 5)
rownames(mat) <- letters[1:5]
mat
## [,1] [,2]
##a 1 6
##b 2 7
##c 3 8
##d 4 9
##e 5 10
and rowname <- "b", then
rowname
##[1] "b"
so
mat[rowname, 1]
##b
##2
which is the same as mat["b", 1]. It only fails, if you use mat['rowname', 1].
If you want to put this in functions, you can do something like:
getElement <- function(mat, row.name, column.index) {
mat[row.name, column.index]
}
getElement(mat, "b", 1)
##b
##2
pasteSenstence <- function(mat, row.name, col.index) {
paste("The element of row", row.name, "and column", col.index, "is",
getElement(mat, row.name, col.index))
}
pasteSentence(mat, "b", 1)
##[1] "The element of row b and column 1 is 2"
which also works with rowname <- "b"
pasteSentence(mat, rowname, 1)
##[1] "The element of row b and column 1 is 2"
This should work:
paste("the value is ", get(df['rowname',1]), "."]
If you are not familiar, 'get' in r is similar to 'eval' in python.
x=c('a', 'c', 'b')
a=2
x[1]
'a'
get(x[1])
2
I'm afraid I don't understand the question; how is your function different from the following?
foo = function(rowname = "Species", d = t(iris)){
paste("I'm selecting", d[rowname, 1])
}
foo()
# [1] "I'm selecting setosa"

selecting n consequent grouped variables and apply the function in r

Here is example data:
myd <- data.frame (matrix (sample (c("AB", "BB", "AA"), 100*100,
replace = T), ncol = 100))
variablenames= paste (rep (paste ("MR.", 1:10,sep = ""),
each = 10), 1:100, sep = ".")
names(myd) <- variablenames
Each variable has a group, here we have ten groups. Thus the group index for the each variable in this data frame is as follows:
group <- rep(1:10, each = 10)
Thus Variable names and group
data.frame (group, variablenames)
group variablenames
1 1 MR.1.1
2 1 MR.1.2
3 1 MR.1.3
4 1 MR.1.4
5 1 MR.1.5
6 1 MR.1.6
7 1 MR.1.7
8 1 MR.1.8
9 1 MR.1.9
10 1 MR.1.10
11 2 MR.2.11
<<<<<<<<<<<<<<<<<<<<<<<<
100 10 MR.10.100
Each groups means that the following steps whould be applied to group of variables seperately.
I have longer function to work the following is short example:
function considering two variables at time
myfun <- function (x1, x2) {
out <- NULL
out <- paste(x1, x2, sep=":")
# for other steps to be performed here
return (out)
}
# group 1
myfun (myd[,1], myd[,2]); myfun (myd[,3], myd[,4]); myfun (myd[,5], myd[,6]);
myfun (myd[,7], myd[,8]); myfun (myd[,9], myd[,10]);
# group 2
myfun (myd[,11], myd[,12]); myfun (myd[,13], myd[,14]); .......so on to group 10 ;
In this way I need to walk for variables 1:10 (i.e. in first group to perform the above action), then 11:20 (the second group). The group doesnot matter in this case number of variables in each group are divisible with number of variables (10) taken (considered) at a time (2).
However in the following example where 3 variables taken at a time - number of total variable in each group (3), 10/3, you have one variable left over at the end.
function considering three variable at time.
myfun <- function (x1, x2, x3) {
out <- NULL
out <- paste(x1, x2, x3, sep=":")
# for other steps to be performed here
return (out)
}
# for group 1
myfun (myd[,1], myd[,2], myd[,3])
myfun (myd[,4], myd[,5], myd[,6])
myfun (myd[,7], myd[,8], myd[,9])
# As there one variable left before proceedomg to second group, the final group will
have 1 extra variable
myfun (myd[,7], myd[,8], myd[,9],myd[,10] )
# for group 2
myfun (myd[,11], myd[,12], myd[,13])
# and to the end all groups and to end of the file.
I want to loop this process by user defined n number of variables consered at time, where n may be 1 to maximum number of variables in each group.
Edit: Just illustration to show the process (just group 1 and 2 demostrated for example):
Create a function that will split your data up into appropriate lists, and apply whatever functions you want to your list.
This function will create your second grouping variable. (The first grouping variable (group) is provided in your question; if you change that value, you should also change DIM in the function below.)
myfun = function(LENGTH, DIM = 10) {
PATTERN = rep(1:(DIM %/% LENGTH), each=LENGTH)
c(PATTERN, rep(max(PATTERN), DIM %% LENGTH))
}
Here are the groups on which we will split myd. In this example, we are splitting myd first into 10-column groups, and each group into 3-column groups, except for the last group, which will have 4 columns (3+3+4 = 10).
NOTE: To change the number of columns you're grouping by, for example, grouping by two variables at a time, change group2 = rep(myfun(3), length.out=100) to group2 = rep(myfun(2), length.out=100).
group <- rep(1:10, each = 10)
# CHANGE THE FOLLOWING LINE ACCORDING
# TO THE NUMBER OF GROUPS THAT YOU WANT
group2 = rep(myfun(3), length.out=100)
This is the splitting process. We first split up just by names, and match those names with myd to create a list of data.frames.
# Extract group names for matching purposes
temp = split(names(myd), list(group, group2))
# Match the names to myd
temp = lapply(1:length(temp),
function(x) myd[, which(names(myd) %in% temp[[x]])])
# Extract the names from the list for future reference
NAMES = lapply(temp, function(x) paste(names(x), collapse="_"))
Now that we have a list, we can do lots of fun things. You wanted to paste your columns together separated by a colon. Here's how you'd do that.
# Do what you want with the list
# For example, to paste the columns together:
FINAL = lapply(temp, function(x) apply(x, 1, paste, collapse=":"))
names(FINAL) = NAMES
Here's a sample of the output:
lapply(FINAL, function(x) head(x, 5))
# $MR.1.1_MR.1.2_MR.1.3
# [1] "AA:AB:AB" "AB:BB:AA" "BB:AB:AA" "BB:AA:AB" "AA:AA:AA"
#
# $MR.2.11_MR.2.12_MR.2.13
# [1] "BB:AA:AB" "BB:AB:BB" "BB:AA:AA" "AB:BB:AA" "BB:BB:AA"
#
# $MR.3.21_MR.3.22_MR.3.23
# [1] "AA:AB:BB" "BB:AA:AA" "AA:AB:BB" "AB:AA:AA" "AB:BB:BB"
#
# <<<<<<<------SNIP------>>>>>>>>
#
# $MR.1.4_MR.1.5_MR.1.6
# [1] "AB:BB:AA" "BB:BB:BB" "AA:AA:AA" "BB:BB:AB" "AB:AA:AA"
#
# $MR.2.14_MR.2.15_MR.2.16
# [1] "AA:BB:AB" "BB:BB:BB" "BB:BB:AB" "AA:BB:AB" "BB:BB:BB"
#
# $MR.3.24_MR.3.25_MR.3.26
# [1] "AA:AB:BB" "BB:AA:BB" "BB:AB:BB" "AA:AB:AA" "AB:AA:AA"
#
# <<<<<<<------SNIP------>>>>>>>>
#
# $MR.1.7_MR.1.8_MR.1.9_MR.1.10
# [1] "AB:AB:AA:AB" "AB:AA:BB:AA" "BB:BB:AA:AA" "AB:BB:AB:AA" "AB:BB:AB:BB"
#
# $MR.2.17_MR.2.18_MR.2.19_MR.2.20
# [1] "AB:AB:BB:BB" "AB:AB:BB:BB" "AB:AA:BB:BB" "AA:AA:AB:AA" "AB:AB:AB:AB"
#
# $MR.3.27_MR.3.28_MR.3.29_MR.3.30
# [1] "BB:BB:AB:BB" "BB:BB:AA:AA" "AA:BB:AB:AA" "AA:BB:AB:AA" "AA:AB:AA:BB"
#
# $MR.4.37_MR.4.38_MR.4.39_MR.4.40
# [1] "BB:BB:AB:AA" "AA:BB:AA:BB" "AA:AA:AA:AB" "AB:AA:BB:AB" "BB:BB:BB:BB"
#
# $MR.5.47_MR.5.48_MR.5.49_MR.5.50
# [1] "AB:AA:AA:AB" "AB:AA:BB:AA" "AB:BB:AA:AA" "AB:BB:BB:BB" "BB:AA:AB:AA"
#
# $MR.6.57_MR.6.58_MR.6.59_MR.6.60
# [1] "BB:BB:AB:AA" "BB:AB:BB:AA" "AA:AB:AB:BB" "BB:AB:AA:AB" "AB:AA:AB:BB"
#
# $MR.7.67_MR.7.68_MR.7.69_MR.7.70
# [1] "BB:AB:BB:AA" "BB:AB:BB:AA" "BB:AB:BB:AB" "AB:AA:AA:AA" "AA:AA:AA:AB"
#
# $MR.8.77_MR.8.78_MR.8.79_MR.8.80
# [1] "AA:AB:AA:AB" "AB:AA:AB:BB" "BB:BB:AA:AB" "AB:BB:BB:BB" "AB:AA:BB:AB"
#
# $MR.9.87_MR.9.88_MR.9.89_MR.9.90
# [1] "AA:BB:AB:AA" "AA:AB:BB:BB" "AA:BB:AA:BB" "AB:AB:AA:BB" "AB:AA:AB:BB"
#
# $MR.10.97_MR.10.98_MR.10.99_MR.10.100
# [1] "AB:AA:BB:AB" "AB:AA:AB:BB" "BB:AB:AA:AA" "BB:BB:AA:AA" "AB:AB:BB:AB"
I suggest to recode myfun to take a matrix and use pasteCols from plotrix package.
library(plotrix)
myfun = function(x){
out = pasteCols(t(x), sep = ":")
# some code
return(out)
}
then, its very easy: for each group, compute the index of the first and of the last column you want to use when you call myfun, using modulus and integer division:
rubiques_solution = function(group, myd, num_to_group){
# loop over groups
for(g in unique(group)){
var_index = which(group == g)
num_var = length(var_index)
# test to make sure num_to_group is smaller than the number of variable
if(num_var < num_to_group){
stop("num_to_group > number of variable in at least one group")
}
# number of calls to myfun
num_calls = num_var %/% num_to_group
# the idea here is that we create the first and last column
# in which we are interested for each call
first = seq(from = var_index[1], by = num_to_group, length = num_calls)
last = first + num_to_group -1
# the last call will contain possibly more varialbe, we adjust here:
last[length(last)] = last[length(last)] + (num_var %% num_to_group)
for(i in num_calls){
# maybe do something with the return value of myfun ?
myfun(myd[,first[i]:last[i]])
}
}
}
group = rep(1:10, each = 10) # same than yours
myd = data.frame (matrix (sample (c("AB", "BB", "AA"), 100*100, replace = T), ncol = 100)) # same than yours
num_to_group = 2 # this is your first example
rubiques_solution(group, myd, num_to_group)
hope i understood the problem right.

Resources