Variable names based on the number in each character position in R

I'm currently trying to change variable names based on the number in each position of the string.
variables <- c("X0.0.1", "X0.1.0", "X1.0.0", "X0.0.2", "X0.1.1", "X0.2.0", "X1.0.1",
               "X1.1.0", "X2.0.0", "X0.0.3", "X0.1.2", "X0.2.1", "X0.3.0", "X1.0.2",
               "X1.1.1", "X1.2.0", "X2.0.1", "X2.1.0", "X3.0.0")
Ideally, I'd have something similar to "X0.0.1" = "x", "X0.1.0" = "y", "X1.0.0" = "z", "X0.0.2" = "xx". Is there a way to quickly duplicate the letter when there is a 2 in that position of the name? Or even map "X3.0.0" to "zzz"?

I believe the following code does what the question asks for. It uses rep to get the repetitions (reversing the digits, since the last position corresponds to "x") and then paste to collapse them together.
s <- strsplit(substring(variables, 2), "\\.")
sapply(s, function(x){
  vec <- c("x", "y", "z")[seq_along(x)]
  x <- as.integer(x)
  y <- rep(vec, rev(x))
  paste(y, collapse = "")
})
# [1] "x" "y" "z" "xx" "xy" "yy" "xz" "yz" "zz" "xxx"
#[11] "xxy" "xyy" "yyy" "xxz" "xyz" "yyz" "xzz" "yzz" "zzz"
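A quick illustration of the rep() trick (a minimal sketch, separate from the answer): the second argument gives per-element repetition counts, and the digits are reversed so the last position maps to "x".
rep(c("x", "y", "z"), times = c(2, 1, 0))
# [1] "x" "x" "y"
# e.g. "X0.1.2" has digits c(0, 1, 2), which reverse to c(2, 1, 0) and collapse to "xxy".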
Edit.
The following function tries to address the discussion in the comments. It returns a data.frame with the strings and their degrees. Then it's a matter of sorting by degree/chr.
changeVariable <- function(x, chr = c("x", "y", "z")){
  s <- strsplit(substring(x, 2), "\\.")
  y <- lapply(s, function(.x){
    vec <- chr[seq_along(.x)]
    .x <- as.integer(.x)
    .y <- rep(vec, rev(.x))
    list(chr = paste(.y, collapse = ""),
         degree = sum(.x))
  })
  res <- do.call(rbind.data.frame, y)
  row.names(res) <- NULL
  res
}
res <- changeVariable(variables)
res[order(res$degree, res$chr), ]
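The sorted result should come out roughly like this (a sketch: degree-1 terms first, then degree-2, then degree-3, alphabetically within each degree):
# x, y, z,
# xx, xy, xz, yy, yz, zz,
# xxx, xxy, xxz, xyy, xyz, xzz, yyy, yyz, yzz, zzz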
Edit 2.
With results pasted with superscripts:
changeVariable2 <- function(x){
  s <- strsplit(substring(x, 2), "\\.")
  y <- lapply(s, function(.x){
    vec <- c("x", "y", "z")[seq_along(.x)]
    .x <- rev(as.integer(.x))
    .y <- vec[.x != 0]
    .x <- .x[.x != 0]
    list(chr = paste0(.y, "^", .x, collapse = " "),
         degree = sum(.x))
  })
  res <- do.call(rbind.data.frame, y)
  row.names(res) <- NULL
  res
}
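A hedged usage sketch (assuming R >= 4.0, so the strings are not converted to factors):
changeVariable2(variables)$chr[c(1, 15, 19)]
# [1] "x^1"         "x^1 y^1 z^1" "z^3"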

Related

How to find a match within a string based on character from other column?

I have these data:
df1 <- data.frame(matrix(, nrow=2, ncol=2))
colnames(df1) <- c("ca", "ea")
df1$ca <- c("A=C,T=G", "T=C,G=G")
df1$ea <- c("G", "T")
And I want to make a new column called "match" which gives me the letter in column "ca" that is equal to the letter in column "ea". So my output would look like this:
df1 <- data.frame(matrix(, nrow=2, ncol=2))
colnames(df1) <- c("ca", "ea")
df1$ca <- c("A=C,T=G", "T=C,G=G")
df1$ea <- c("G", "T")
df1$match <- c("T", "C")
It is tricky because in the first instance the letter I want to match on is after the "=", but in the second instance it precedes it.
Here's another tidyverse solution that might be a little simpler, using regular expressions. You need R 4.1.0 or later to use the |> pipe operator; if that's not the case, just substitute %>% for it.
library(tidyverse)
df1 |>
  # add a match column extracted by one of the following
  # two possible patterns
  mutate(match = str_extract(ca,
                             # search for the letter preceded by "ea="
                             paste0(paste0("(?<=", ea, "\\=)", "[A-Z]"),
                                    # or
                                    "|",
                                    # search for the letter followed by "=ea"
                                    paste0("[A-Z]", "(?=\\=", ea, ")"))))
# ca ea match
# 1 A=C,T=G G T
# 2 T=C,G=G T C
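To make the lookarounds concrete (a sketch): for the first row, where ea is "G", the assembled pattern is "(?<=G\\=)[A-Z]|[A-Z](?=\\=G)".
str_extract("A=C,T=G", "(?<=G\\=)[A-Z]|[A-Z](?=\\=G)")
# [1] "T"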
I believe this should work for you:
library(stringr)  # for str_extract_all()
df1 <- data.frame(matrix(, nrow = 2, ncol = 2))
colnames(df1) <- c("ca", "ea")
df1$ca <- c("A=C,T=G", "T=C,G=G")
df1$ea <- c("G", "T")
my_f <- function(x) {
  my_pattern <- paste("[ACGT]=", df1[x, "ea"], "|", df1[x, "ea"], "=[ACGT]", sep = "")
  my_a <- str_extract_all(string = df1[x, "ca"], pattern = my_pattern, simplify = TRUE)
  my_pattern <- paste(df1[x, "ea"], "|=", sep = "")
  my_a <- gsub(pattern = my_pattern, replacement = "", x = my_a)
  return(my_a)
}
df1$match <- lapply(1:nrow(df1), my_f)
Good opportunity for a simple branch reset group:
df1 <- data.frame(matrix(, nrow=2, ncol=2))
colnames(df1) <- c("ca", "ea")
df1$ca <- c("A=C,T=G", "T=C,G=G")
df1$ea <- c("G", "T")
df1$match <- c("T", "C")
mapply(
  function(p, x)
    gsub(sprintf('(?|%s=(.)|(.)=%s)|.', p, p), '\\1', x, perl = TRUE),
  df1$ea, df1$ca, USE.NAMES = FALSE
)
# [1] "T" "C"
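For the first row (p == "G") the sprintf() expands as below; the branch reset group (?|...) lets both alternatives write to capture group 1, and the trailing |. consumes and deletes every other character (a sketch of the expanded call):
gsub('(?|G=(.)|(.)=G)|.', '\\1', 'A=C,T=G', perl = TRUE)
# [1] "T"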
library(tidyverse)
df1 <- data.frame(matrix(, nrow = 2, ncol = 2))
colnames(df1) <- c("ca", "ea")
df1$ca <- c("A=C,T=G", "T=C,G=G")
df1$ea <- c("G", "T")
df1
#> ca ea
#> 1 A=C,T=G G
#> 2 T=C,G=G T
df1 %>%
  mutate(
    match = ea %>% map2_chr(ca, function(ea, ca) {
      ca %>%
        str_split(",") %>%
        simplify() %>%
        keep(~ str_detect(.x, ea)) %>%
        str_remove_all(str_glue("[=|{ea}]"))
    })
  )
#> ca ea match
#> 1 A=C,T=G G T
#> 2 T=C,G=G T C
Created on 2021-12-08 by the reprex package (v2.0.1)
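Step by step for the first row (a sketch, assuming stringr and purrr are attached via library(tidyverse)):
str_split("A=C,T=G", ",")[[1]]                               # "A=C" "T=G"
keep(str_split("A=C,T=G", ",")[[1]], ~ str_detect(.x, "G"))  # "T=G"
str_remove_all("T=G", "[=|G]")                               # "T"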

Dynamic formula not working with startsWith and colnames

I'm working on a function to create tables, and I need some conditional rules for formatting. One will be based on a column name; however, when I send it down using as.formula, the expression seems to be evaluated too early. I've made an example here:
library(tidyverse)
library(rlang)
a <- as_tibble(x = cbind(Year = c(2018, 2019, 2020), a = 1:3,
                         b.1 = c("a", "b", "c"),
                         b.2 = c("d", "e", "f"),
                         fac = c("This", "This", "That")))
foo <- function(x, y, z, ...){
  y_var <- enquo(y)
  x %>%
    filter(Year %in% c(2018, 2019),
           ...) %>%
    mutate(!!quo_name(y_var) := factor(!!y_var,
                                       levels = z,
                                       ordered = TRUE)) %>%
    arrange(!!y_var)
}
to.table <- function(x, y, z, ...){
  y_var <- enquo(y)
  df.in <- foo(x = x,
               y = !!y_var,
               z = z)
  cond <- paste("~!is.na(", quo_name(y_var), ")")
  cond.2 <- paste("~startsWith(colnames(", df.in, "),\"b\")")
  flextable(df.in) %>%
    bold(i = as.formula(cond),
         part = "body") %>%
    bg(i = as.formula(cond.2),
       bg = "Red3",
       j = as.formula(cond.2))
}
to.table(x = a,
         y = Year,
         z = c(2020, 2018, 2019),
         fac == "This")
Error in startsWith(colnames(2:3), "b") : non-character object(s)
From the error I've been receiving, it looks like the expression is evaluated before it gets put through as.formula, since those two columns are the correct answer.
Proof:
df.in <- foo(x = a,
             y = Year,
             z = c(2020, 2018, 2019),
             fac == "This")
startsWith(colnames(df.in), prefix = "b")
[1] FALSE FALSE TRUE TRUE FALSE
What am I missing here? If anyone has a solution, or a suggestion on how to do things differently, potentially using quosures or other tidyverse-friendly methods, I would much appreciate it.
Extension:
To make this a bit more clear, I should elaborate on my intended use of this example. I'm trying to take the names generated dynamically in a function (represented by foo) that start with a specified value (generally 3 columns), and then check those columns for a specified value that I can highlight in a specific color.
Additionally, in the answer, cond is used in both i = designations; the two separate conditions will likely never overlap.
We could specify j with the column names of the created data: startsWith returns a logical vector over the column names, flagging the names that start with 'b', and we use that logical vector to extract those column names with [ (nm1).
to.table <- function(x, y, z, ...){
  y_var <- enquo(y)
  df.in <- foo(x = x,
               y = !!y_var,
               z = z)
  cond <- as.formula(glue::glue('~ !is.na({quo_name(y_var)})'))
  nm1 <- names(df.in)[startsWith(names(df.in), prefix = "b")]
  flextable(df.in) %>%
    bold(i = cond,
         part = "body") %>%
    bg(i = cond,
       bg = "Red3",
       j = nm1)
}
-testing
to.table(x = a,
         y = Year,
         z = c(2020, 2018, 2019),
         fac == "This")
-output
In the OP's post, the formula created for 'cond' is fine (though it is a bit more flexible when built with glue), whereas the second one, 'cond.2', returns:
paste("~startsWith(colnames(", df.in, "),\"b\")")
[1] "~startsWith(colnames( 2:3 ),\"b\")" "~startsWith(colnames( c(\"1\", \"2\") ),\"b\")"
[3] "~startsWith(colnames( c(\"a\", \"b\") ),\"b\")" "~startsWith(colnames( c(\"d\", \"e\") ),\"b\")"
[5] "~startsWith(colnames( c(\"This\", \"This\") ),\"b\")"
This is because df.in is a data.frame onto which we are pasting the startsWith(colnames( string; paste deparses each column, so each of the lines returned holds one column's values.
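The same effect is easy to reproduce with a bare paste() over any data.frame (a minimal illustration, separate from the answer):
paste("colnames(", data.frame(a = 1:2, b = c("x", "y")), ")")
# [1] "colnames( 1:2 )"              "colnames( c(\"x\", \"y\") )"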
If we want the column names prefixed with either 'a' or 'b' in red, change startsWith to grep, which can take a regex as pattern:
to.table <- function(x, y, z, ...){
  y_var <- enquo(y)
  df.in <- foo(x = x,
               y = !!y_var,
               z = z)
  cond <- as.formula(glue::glue('~ !is.na({quo_name(y_var)})'))
  nm1 <- grep("^(a|b)", names(df.in), value = TRUE)
  flextable(df.in) %>%
    bold(i = cond,
         part = "body") %>%
    bg(i = cond,
       bg = "Red3",
       j = nm1)
}
to.table(x = a,
         y = Year,
         z = c(2020, 2018, 2019),
         fac == "This")
-output
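As a standalone check of the pattern (a sketch over the example tibble's column names):
grep("^(a|b)", c("Year", "a", "b.1", "b.2", "fac"), value = TRUE)
# [1] "a"   "b.1" "b.2"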
If we want to color based on the value of 'a':
to.table <- function(x, y, z, ...){
  y_var <- enquo(y)
  df.in <- foo(x = x,
               y = !!y_var,
               z = z)
  cond <- as.formula(glue::glue('~ !is.na({quo_name(y_var)})'))
  nm1 <- names(df.in)[startsWith(names(df.in), prefix = "b")]
  flextable(df.in) %>%
    bold(i = cond,
         part = "body") %>%
    bg(i = ~ a == 2,
       bg = "Red3",
       j = nm1)
}
to.table(x = a,
         y = Year,
         z = c(2020, 2018, 2019),
         fac == "This")
-output

Is there a way I can simplify the code below using vectors?

I am using R. I need to create a new column in a data frame that is the sum of the three variables. The sum should only take place if there are numeric values for each of the three variables. In other words, if there are any NAs or blanks the sum should not take place.
I have written the code below which works, but would like to simplify it. I am interested in using vectors to avoid repetition in my code.
data.x <- data.frame('time' = c(1:11),
                     'x' = c(5,3,"",'ND',2,'ND',7,8,'ND',1," "))
data.x[data.x == ''] <- 'NA'
data.x[data.x == ' '] <- 'NA'
data.x[data.x == 'ND'] <- 'NA'
data.x.na.omit <- na.omit(data.x)
data.y <- data.frame('time' = c(1:8),
                     'y' = c(5,2,3,1,2,NA,NA,8))
data.y[data.y == ''] <- 'NA'
data.y[data.y == ' '] <- 'NA'
data.y[data.y == 'ND'] <- 'NA'
data.y.na.omit <- na.omit(data.y)
data.z <- data.frame('time' = c(1:5),
                     'z' = c(1:5))
data.z[data.z == ''] <- 'NA'
data.z[data.z == ' '] <- 'NA'
data.z[data.z == 'ND'] <- 'NA'
data.z.na.omit <- na.omit(data.z)
data.x.y <- merge.data.frame(data.x.na.omit, data.y.na.omit, by.x = "time", by.y = "time")
data.x.y.z <- merge.data.frame(data.x.y, data.z.na.omit, by.x = "time", by.y = "time")
data.x.y.z$x <- as.numeric(data.x.y.z$x)
data.x.y.z$y <- as.numeric(data.x.y.z$y)
data.x.y.z$z <- as.numeric(data.x.y.z$z)
data.x.y.z$result <- data.x.y.z$x + data.x.y.z$y + data.x.y.z$z
I don't see particularly good ways to use vectors to avoid repetition. I would suggest the following, though:
- Remove NA rows by evaluating the result column once, so you don't have to do this for each of x, y and z.
- Set stringsAsFactors to FALSE so that a single line like data.x$x <- as.numeric(data.x$x) will automatically coerce non-numeric strings ('', ' ', 'ND') to NA, and you don't have to replace them separately.
- Bring in the data as a single data frame (by padding columns y and z with NA at the bottom), rather than creating data.x, data.y and data.z and then merging.
For example, code with these suggestions might look like this:
# Create merged data
data <- data.frame('time' = c(1:11),
                   'x' = c(5,3,"",'ND',2,'ND',7,8,'ND',1," "),
                   'y' = c(5,2,3,1,2,NA,NA,8, rep(NA, 3)),
                   'z' = c(1:5, rep(NA, 6)),
                   stringsAsFactors = F)
# Convert x, y and z to numeric
for(col in c("x", "y", "z"))
  class(data[, col]) <- "numeric"
# Add x, y and z together
data$result <- data$x + data$y + data$z
# Remove NAs at the end
data <- na.omit(data)
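The coercion step works because as.numeric() maps anything non-numeric to NA, with a warning (a quick illustration):
as.numeric(c("5", "3", "", "ND", " "))
# [1]  5  3 NA NA NA   (plus a "NAs introduced by coercion" warning)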
If your data sources are such that you can't bring them in as a single dataframe, but you have to merge them, then you could replace the "Create merged data" section with something like this:
# Create separate data
data.x <- data.frame('time' = c(1:11),
                     'x' = c(5,3,"",'ND',2,'ND',7,8,'ND',1," "),
                     stringsAsFactors = F)
data.y <- data.frame('time' = c(1:8),
                     'y' = c(5,2,3,1,2,NA,NA,8),
                     stringsAsFactors = F)
data.z <- data.frame('time' = c(1:5),
                     'z' = c(1:5),
                     stringsAsFactors = F)
# Merge data
data.xy <- merge(data.x, data.y)
data <- merge(data.xy, data.z)
# Now continue main code suggestion from the 'Convert x, y and z to numeric' section

Diff-in-diff estimation with resampling from large dataset

I have a large dataset on which to perform a diff-in-diff estimation. Given the nature of the dataset, my t-statistics are inflated and the coefficients come out (spuriously) statistically significant.
I want to reduce the number of elements in the dataset step by step, and at each step resample a large number of times, re-estimating the interaction coefficient and its standard error each time.
Then I want to take the average estimates and standard errors and plot them against the sample size, to show at what point (if any) they are no longer statistically different from zero.
My code follows, with a toy example. My concerns are:
- I am not sure this is the most efficient way to tackle the problem.
- I cannot retrieve, and thus cannot plot, the confidence interval.
- I am not sure the sampling is representative, given the existence of different groups.
Toy example (credit: Torres-Reyna, 2015):
library(foreign)
library(dplyr)
library(ggplot2)
# load the example panel data and construct the diff-in-diff variables
mydata <- read.dta("http://dss.princeton.edu/training/Panel101.dta")
mydata$time <- ifelse(mydata$year >= 1994, 1, 0)
mydata$treated <- ifelse(mydata$country %in% c("E", "F", "G"), 1, 0)
mydata$did <- mydata$time * mydata$treated
df_0 <- NULL
for (i in 1:length(seq(5, nrow(mydata) - 1, 5))){
  index <- seq(5, nrow(mydata), 5)[i]
  df_1 <- NULL
  for (j in 1:10){
    mydata_temp <- mydata[sample(nrow(mydata), index), ]
    didreg = lm(y ~ treated + time + did, data = mydata_temp)
    out <- summary(didreg)
    new_line <- c(out$coefficients[, 1][4], out$coefficients[, 2][4], index)
    new_line <- data.frame(t(new_line))
    names(new_line) <- c("c", "s", "i")
    df_1 <- rbind(df_1, new_line)
  }
  df_0 <- rbind(df_0, df_1)
}
df_0 <- df_0 %>% group_by(i) %>% summarise(c = mean(c, na.rm = TRUE),
                                           s = mean(s, na.rm = TRUE))
View(df_0)
Consider the following refactored code using base R functions: within, %in%, nested lapply, setNames, aggregate, and do.call. This approach avoids calling rbind in a loop and compactly re-writes code without constantly using $ column referencing.
library(foreign)
mydata = read.dta("http://dss.princeton.edu/training/Panel101.dta")
mydata <- within(mydata, {
  time <- ifelse(year >= 1994, 1, 0)
  treated <- ifelse(country %in% c("E", "F", "G"), 1, 0)
  did <- time * treated
})
# OUTER LIST OF DATA FRAMES
df_0_list <- lapply(1:length(seq(5, nrow(mydata) - 1, 5)), function(i) {
  index <- seq(5, nrow(mydata), 5)[i]
  # INNER LIST OF DATA FRAMES
  df_1_list <- lapply(1:100, function(j) {
    mydata_temp <- mydata[sample(nrow(mydata), index), ]
    didreg <- lm(y ~ treated + time + did, data = mydata_temp)
    out <- summary(didreg)
    new_line <- c(out$coefficients[, 1][4], out$coefficients[, 2][4], index)
    new_line <- setNames(data.frame(t(new_line)), c("c", "s", "i"))
  })
  # APPEND ALL INNER DFS
  df <- do.call(rbind, df_1_list)
  return(df)
})
# APPEND ALL OUTER DFS
df_0 <- do.call(rbind, df_0_list)
# AGGREGATE WITH NEW COLUMNS
df_0 <- within(aggregate(cbind(c, s) ~ i, df_0, function(x) mean(x, na.rm = TRUE)), {
  upper = c + s
  lower = c - s
})
# RUN PLOT
within(df_0, {
  plot(i, c, ylim = c(min(c) - 5000000000, max(c) + 5000000000), type = "l",
       cex.lab = 0.75, cex.axis = 0.75, cex.main = 0.75, cex.sub = 0.75)
  polygon(c(i, rev(i)), c(lower, rev(upper)),
          col = "grey75", border = FALSE)
  lines(i, c, lwd = 2)
})
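The key idiom here is building a list of pieces and binding once, rather than growing a data.frame with rbind() inside a loop, which copies the accumulated frame on every iteration (a minimal sketch):
# quadratic: df <- rbind(df, new_line) inside a loop
# linear: collect the pieces, then bind once
pieces <- lapply(1:3, function(i) data.frame(i = i, sq = i^2))
do.call(rbind, pieces)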
In the end I solved it like this:
Is this the most efficient way?
library(foreign)
library(dplyr)
mydata = read.dta("http://dss.princeton.edu/training/Panel101.dta")
mydata$time = ifelse(mydata$year >= 1994, 1, 0)
mydata$treated = ifelse(mydata$country == "E" |
                        mydata$country == "F" |
                        mydata$country == "G", 1, 0)
mydata$did = mydata$time * mydata$treated
df_0 <- NULL
for (i in 1:length(seq(5, nrow(mydata) - 1, 5))){
  index <- seq(5, nrow(mydata), 5)[i]
  df_1 <- NULL
  for (j in 1:100){
    mydata_temp <- mydata[sample(nrow(mydata), index), ]
    didreg = lm(y ~ treated + time + did, data = mydata_temp)
    out <- summary(didreg)
    new_line <- c(out$coefficients[, 1][4], out$coefficients[, 2][4], index)
    new_line <- data.frame(t(new_line))
    names(new_line) <- c("c", "s", "i")
    df_1 <- rbind(df_1, new_line)
  }
  df_0 <- rbind(df_0, df_1)
}
df_0 <- df_0 %>% group_by(i) %>% summarise(c = mean(c, na.rm = TRUE),
                                           s = mean(s, na.rm = TRUE))
df_0 <- df_0 %>% group_by(i) %>% mutate(upper = c + s, lower = c - s)
df <- df_0
plot(df$i, df$c, ylim = c(min(df_0$c) - 5000000000, max(df_0$c) + 5000000000), type = "l")
polygon(c(df$i, rev(df$i)), c(df$lower, rev(df$upper)), col = "grey75", border = FALSE)
lines(df$i, df$c, lwd = 2)

Subset data.table columns independently

I'm starting with the table dt below and trying to subset it, column by column, by the list keys:
library(data.table)
set.seed(123)
randomchar <- function(n, w){
  chararray <- replicate(w, sample(c(letters, LETTERS), n, replace = TRUE))
  apply(chararray, 1, paste0, collapse = "")
}
dt <- data.table(x = randomchar(1000, 3),
                 y = randomchar(1000, 3),
                 z = randomchar(1000, 3),
                 key = c("x", "y", "z"))
keys <- with(dt, list(x = sample(x, 501),
                      y = sample(y, 500),
                      z = sample(z, 721)))
I can get the result I want by using a loop:
desired <- copy(dt)
for(i in seq_along(keys)){
  keyname <- names(keys)[i]
  desired <- desired[get(keyname) %in% keys[[i]]]
}
desired
The question is: is there a more data.table-idiomatic way to do this subset?
I tried using CJ: dt[CJ(keys)], but it takes a very long time.
What about building a mask and filtering dt on it:
dt[Reduce(`&`, Map(function(key, col) col %in% key, keys, dt)),]
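A quick sanity check against the loop-based result (a sketch; Map(`%in%`, dt, keys) is an equivalent spelling, since iterating over a data.table iterates over its columns):
identical(desired, dt[Reduce(`&`, Map(`%in%`, dt, keys))])
# should be TRUE: both approaches keep the same rows in the original key order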
