Grepl first letters, two patterns to match - r

I have the following data frame. I want to create a column called A1B1 that is 1 if a row contains any string starting with A1 or B1 (or both), and 0 if not.
What am I doing wrong here:
# Fix the RNG seed so the sampled data frame is reproducible.
set.seed(1)
# Ten rows: an id column plus seven Diag columns, each sampled from three codes.
# Only Diag1 and Diag4 draw from codes that start with "A1" or "B1".
Data <- data.frame(id = seq(1, 10),
Diag1 = sample(c("A123", "B123", "C123"), 10, replace = TRUE),
Diag2 = sample(c("D123", "E123", "F123"), 10, replace = TRUE),
Diag3 = sample(c("G123", "H123", "I123"), 10, replace = TRUE),
Diag4 = sample(c("A123", "B123", "C123"), 10, replace = TRUE),
Diag5 = sample(c("J123", "K123", "L123"), 10, replace = TRUE),
Diag6 = sample(c("M123", "N123", "O123"), 10, replace = TRUE),
Diag7 = sample(c("P123", "Q123", "R123"), 10, replace = TRUE))
# Prefixes to look for; "^" anchors each pattern to the start of the string.
A1orB1 <- c("^A1", "^B1")
# For each row (first column dropped), count the values matching "^A1|^B1" and
# store 1 only when that count is exactly 1; a row with two matches gets 0.
Data$A1B1 <- apply(Data[-1],1,function(x)as.integer(sum(grepl(paste(A1orB1,collapse="|"), x))==1))
I would expect a 1 for IDs 1, 2, 3, 4, 5, 8, 9 and 10, but I only get a 1 for IDs 4, 5 and 9.
Thanks!

At present you are counting the A1... or B1... strings in each row and testing whether that count is exactly 1, so a row that contains two such strings gets a 0.
Perhaps replace sum() with any() (and consider breaking your code up into a few more intermediate chunks to make it more readable ...). Or you could just replace ==1 with >=1 in your code.
# Combine the prefixes into a single alternation pattern.
ss <- paste(A1orB1, collapse = "|")

# Return 1L when at least one element of x matches the pattern, else 0L.
ff <- function(x) {
  has_match <- any(grepl(ss, x))
  as.integer(has_match)
}

# Apply the row-wise flag to every column except the first.
Data$A1B1 <- apply(Data[-1], 1, ff)

Related

For loop list of tibbles R

I want to create a list of random tibbles using a for loop. I have a large data set where I will need to apply functions to lists of tibbles and create lists of tibbles as the outputs. I understand there might be better ways to do this and would also appreciate hearing those but am trying to wrap my head around how for loops work.
I can create a list of random tibbles with each tibble in the list named:
# Three tibbles written out by hand; each has 10 rows holding a random number,
# a random capital letter and a random "True"/"False" label.
tibble_random1 <- tibble(
  Number = sample(1:100, size = 10, replace = TRUE),
  Letter = sample(LETTERS, size = 10, replace = TRUE),
  Logical = sample(c("True", "False"), size = 10, replace = TRUE)
)
tibble_random2 <- tibble(
  Number = sample(1:100, size = 10, replace = TRUE),
  Letter = sample(LETTERS, size = 10, replace = TRUE),
  Logical = sample(c("True", "False"), size = 10, replace = TRUE)
)
tibble_random3 <- tibble(
  Number = sample(1:100, size = 10, replace = TRUE),
  Letter = sample(LETTERS, size = 10, replace = TRUE),
  Logical = sample(c("True", "False"), size = 10, replace = TRUE)
)

# Collect the three tibbles in one named list.
tibble_random <- list(
  tibble1 = tibble_random1,
  tibble2 = tibble_random2,
  tibble3 = tibble_random3
)
I cannot figure out how to do this with a for loop or if a for loop is completely inappropriate for this.
Thanks.
Initialise a list and fill in one tibble per iteration using a for loop.
# Pre-allocate a list with three empty slots, then fill one slot per iteration.
tibble_random <- vector("list", 3)
for (i in seq_along(tibble_random)) {
  tibble_random[[i]] <- tibble(
    Number = sample(1:100, size = 10, replace = TRUE),
    Letter = sample(LETTERS, size = 10, replace = TRUE),
    Logical = sample(c("True", "False"), size = 10, replace = TRUE)
  )
}
You can also use replicate or lapply to do this without a for loop.
# replicate() evaluates the tibble() expression three times;
# simplify = FALSE keeps the three results together as a list.
tibble_random <- replicate(
  n = 3,
  expr = tibble(
    Number = sample(1:100, size = 10, replace = TRUE),
    Letter = sample(LETTERS, size = 10, replace = TRUE),
    Logical = sample(c("True", "False"), size = 10, replace = TRUE)
  ),
  simplify = FALSE
)
To assign the names of the list, you can use:
# Name the elements "tibble1", "tibble2", ... according to their position.
names(tibble_random) <- paste0("tibble", seq_along(tibble_random))

Arguments must have same length when using tapply

# Simulate 200 answers (q1 is 1 or 5) with a gender label, then pipe into tapply().
# NOTE: %>% supplies the data frame as the first argument, so the call is
# evaluated as tapply(., .$q1, list(.$gender), FUN = sum), as the error shows.
data.frame(q1 = sample(c(1, 5), 200, replace = T, prob = c(1/2, 1/2)),
gender = sample(c("M", "F"), 200, replace = T, prob = c(2/3, 1/3))
) %>% tapply(.$q1,list(.$gender),FUN=sum)
I just want to use tapply to sum by gender, but I got the error below:
Error in tapply(., .$q1, list(.$gender), FUN = sum) :
arguments must have same length
Where's the problem?
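`%>%` passes the data frame as the first argument, so the call becomes tapply(., .$q1, list(.$gender), FUN = sum): the data frame itself (length 2, one element per column) is treated as the values and the 200-element q1 as the grouping index, hence the length mismatch. For the sum example, you can use data.table syntax: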
library(data.table)

# Simulated survey: 200 answers (1 or 5) and a gender label.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
df <- data.frame(
  q1 = sample(c(1, 5), size = 200, replace = TRUE, prob = c(1 / 2, 1 / 2)),
  gender = sample(c("M", "F"), size = 200, replace = TRUE, prob = c(2 / 3, 1 / 3))
)

# Sum q1 within each gender group.
as.data.table(df)[, sum(q1), by = gender]
This will also work with a function that has multiple return values, unlike my previous example with summarize:
# Run shapiro.test() on q1 separately for each gender group.
as.data.table(df)[, shapiro.test(q1), by = gender]

Column name formatting in KableExtra in R

LaTeX commands are not formatting the column names as I intended.
library(kableExtra)
# Column headers are passed as plain strings: \textit{N} is a LaTeX command and
# each "\n" marks where a header is meant to break onto a second line.
kable(test,"latex", col.names = c('Mean','\\textit{N}' ,'Strongly\nDisagree','Disagree','Neither Agree\norDisagree','Agree','Strongly\nAgree'))
The output I am getting is:
https://www.dropbox.com/s/xvl7lfh94bl2274/Kable%20Table.PNG?dl=0
I have tried both latex commands and R-markdown commands.
The N should be italicized, and Strongly Disagree, Neither Agree or Disagree, and Strongly Agree should each be broken across two lines.
# Example data: one column per header used in the table below.
test_data <- data.frame(
  Mean = runif(5, 3.71, 7.72),
  N = sample(57:59, 5, replace = TRUE),
  sd = c(1, rep(0, 4)),
  d = rep(1, 5),
  naod = sample(1:4, 5, replace = TRUE),
  a = sample(5:12, 5, replace = TRUE),
  sa = sample(37:44, 5, replace = TRUE)
)

# escape = FALSE passes the header strings through without escaping, and the
# headers are wrapped in linebreak() so that "\n" can act as a line break.
# TRUE/FALSE are spelled out because T and F are reassignable variables.
kable(
  test_data, "latex",
  booktabs = TRUE,
  align = "c",
  col.names = linebreak(c('Mean','\\textit{N}' ,'Strongly\n Disagree','Disagree','Neither Agree\n or Disagree','Agree','Strongly\n Agree')),
  row.names = TRUE,
  escape = FALSE
) %>%
  # Row 0 is the header row; centre it.
  row_spec(0, align = "c")

A lm() dynamic function - R

Let's assume I have this dataframe:
N <- 50

# Build the same 32 integer columns programmatically, in the original order:
# LA1-LA8, LAY, UF1-UF6, UFY, EK1-EK12, EKY, Z1-Z3. Each column is an
# independent draw of N values from 1:10 with replacement.
df <- local({
  column_names <- c(
    paste0("LA", 1:8), "LAY",
    paste0("UF", 1:6), "UFY",
    paste0("EK", 1:12), "EKY",
    paste0("Z", 1:3)
  )
  columns <- lapply(
    column_names,
    function(column) sample(1:10, size = N, replace = TRUE)
  )
  names(columns) <- column_names
  as.data.frame(columns)
})
I want to compute these models:
# One full model per group of predictors, each followed by stepwise selection
# in both directions.
m1 <- lm(
  formula = LAY ~ LA1 + LA2 + LA3 + LA4 + LA5 + LA6 + LA7 + LA8,
  data = df
)
m11 <- step(m1, direction = "both")

m2 <- lm(
  formula = UFY ~ UF1 + UF2 + UF3 + UF4 + UF5 + UF6,
  data = df
)
m22 <- step(m2, direction = "both")

m3 <- lm(
  formula = EKY ~ EK1 + EK2 + EK3 + EK4 + EK5 + EK6 + EK7 + EK8 + EK9 +
    EK10 + EK11 + EK12,
  data = df
)
m33 <- step(m3, direction = "both")

# Each Z outcome is regressed on the three group responses.
m8 <- lm(formula = Z1 ~ LAY + UFY + EKY, data = df)
m88 <- step(m8, direction = "both")

m9 <- lm(formula = Z2 ~ LAY + UFY + EKY, data = df)
m99 <- step(m9, direction = "both")

m10 <- lm(formula = Z3 ~ LAY + UFY + EKY, data = df)
m100 <- step(m10, direction = "both")
As you can see, if the dimensionality of the database increases (increasing the number of LA, UF, or EK independent variables), I will have to manually modify the input for the models. So, I'm looking for a way to:
Given a certain quantity of independent variables (could be 5, 10, 30 or more) for a given category (LA, UF, and EK), the input for the model changes automatically.
Even though I have found different syntax to compute the models (like X*Z = [(X+Z)^3]), I can't find a way to make this computation more dynamic.
Considerations:
The number of independent variables (LA, UF, EK) can change.
The number of dependent variables (LAY, UFY, EKY) never changes.
The coefficient vector is extracted from the output of these models (mentioned just in case it matters).

apply statement to sample columns, across rows of different lengths

I'm trying to write a simple R function to sample 5-element substrings across two columns of a single data frame. Within each row the two strings have the same length, but the lengths differ from row to row. The function works when I specify a row and column to act on, but I can't get the apply statement to work on each row and each column. As written, it only pulls random samples based on the length of the first instance, so if the first instance is shorter than any of the other strings, the output for the other rows is sometimes shorter than 5 elements.
example df:
BP TF
1 CGTCTCTATTCTAGGCAAGA TTTFFFFTFFFTFFFTFTTT
2 AAGTCACTCGAATTCGGATGCCCCCTAGGC TTFFFFFTFFFFTTFTFFTTTFTTTTFTFF
3 TGCTCATGACGGGAC FFFTFTFFFFTFTFT
'intended output:'
1 CTATT FFTFF
2 CCTAG TTTFT
3 TCATG TFTFF
'reproducible example code:'
# Build a small fake data frame of random strings.

# Paste `len` characters drawn with replacement from `alphabet` into one string.
random_string <- function(alphabet, len) {
  paste(sample(x = alphabet, size = len, replace = TRUE), collapse = "")
}

nucleotides <- c("A", "C", "T", "G")
flags <- c("T", "F")

# Three base strings of length 20, 30 and 15 ...
BaseP1 <- random_string(nucleotides, 20)
BaseP2 <- random_string(nucleotides, 30)
BaseP3 <- random_string(nucleotides, 15)

# ... and three T/F strings of the same lengths.
TrueFalse1 <- random_string(flags, 20)
TrueFalse2 <- random_string(flags, 30)
TrueFalse3 <- random_string(flags, 15)

# The column expressions are kept as-is because data.frame() derives the
# column names from them.
my_df <- data.frame(c(BaseP1, BaseP2, BaseP3), c(TrueFalse1, TrueFalse2, TrueFalse3))
# Intended to return a 5-character substring of `string` that starts at a
# randomly drawn position.
Fragment = function(string) {
# `:` is evaluated before `-`, so this samples from (1:nchar(string)) - 5,
# i.e. from the values -4 up to nchar(string) - 5.
nStart = sample(1:nchar(string) -5, 1)
# Take the characters from position nStart to nStart + 4.
substr(string, nStart, nStart + 4)
}
Fragment(string = my_df[1,1])#works for the first row, first col.
but this does not work:
# The anonymous function ignores its argument x and passes the whole data frame
# to Fragment() for every cell.
apply(my_df, c(1,2), function(x) Fragment(string = my_df[1:nrow(my_df),1:ncol(my_df)]))
There was an error in your function:
# Draw a random window of 5 consecutive characters from `string`.
#
# Args:
#   string: a single character string.
# Returns:
#   A 5-character substring whose start position is drawn uniformly from all
#   positions where a full window fits; when `string` has 5 characters or
#   fewer, the whole string is returned.
Fragment <- function(string) {
  window <- 5L
  # A window of 5 fits at start positions 1 .. nchar(string) - 4. The value is
  # computed first and then sampled, because `1:n - 5` would be evaluated as
  # `(1:n) - 5` and produce zero or negative start positions.
  n_starts <- nchar(string) - window + 1L
  if (n_starts <= 1L) {
    # Only one (or no) full window exists, so there is nothing to sample.
    return(substr(string, 1L, window))
  }
  # sample.int() avoids the 1:0 trap of sample(1:k, 1) for small k.
  nStart <- sample.int(n_starts, 1L)
  substr(string, nStart, nStart + window - 1L)
}
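The original version was missing parentheses around nchar(string) - 5: because `:` is evaluated before `-`, 1:nchar(string) - 5 means (1:nchar(string)) - 5, which yields start positions from -4 upward and made the subsetting go wrong.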
You can then simply use apply(my_df, c(1,2), Fragment) as suggested in the comments.
To show that this works now:
# Repeat the cell-wise sampling 10000 times and stop on the first run in which
# any fragment is not exactly 5 characters long.
for (i in seq_len(10000)) {
  stopifnot(all(nchar(apply(my_df, c(1, 2), Fragment)) == 5))
}
This shows that in 10000 tries, it always produced 5 characters as output.

Resources