Creating a vector of random strings with conditions in R - r

The problem is :
Create a vector of 50 random names, each made of 5 letters,out of which only the first is capital. The first, third and fifthletters are consonants and the second and fourth are vowels.
How can i do this?

Use letters and LETTERS which are built in constants in R, then define vowels as c(1, 5, 9, 15, 21), which is the subset to select or remove. Use sample to get 50 of each with replacement, and paste them together:
set.seed(69)
vowels <- c(1, 5, 9, 15, 21)
paste0( sample(LETTERS[-vowels], 50, TRUE),
sample(letters[vowels], 50, TRUE),
sample(letters[-vowels], 50, TRUE),
sample(letters[vowels], 50, TRUE),
sample(letters[-vowels], 50, TRUE))
#> [1] "Valif" "Cirer" "Tuniw" "Kimil" "Qehoc" "Jemif" "Senoy" "Jazic" "Hihuy"
#> [10] "Cezor" "Fuzic" "Menas" "Covay" "Rupov" "Xanij" "Pujur" "Qimin" "Dunop"
#> [19] "Xokez" "Zacox" "Muhac" "Yitab" "Gojob" "Dedah" "Nepan" "Dinel" "Ceyaw"
#> [28] "Foxiv" "Fiven" "Zotob" "Bezug" "Pusod" "Jawad" "Suluq" "Zubic" "Minax"
#> [37] "Gowex" "Debec" "Xaqut" "Duvov" "Lalal" "Zavuv" "Xobuk" "Zugil" "Gibac"
#> [46] "Yocan" "Voyuh" "Nigeh" "Yuqew" "Humup"
Created on 2020-04-05 by the reprex package (v0.3.0)

Just using a for loop to sample the built in vectors LETTERS and letters with conditional subsetting to subset to vowels and consonants
namesList <- list()
for(i in 1:50){
namesList[[i]] <- paste(c(sample(LETTERS[!(LETTERS %in% c("A", "E", "I", "O", "U"))], 1),
sample(letters[(LETTERS %in% c("A", "E", "I", "O", "U"))], 1),
sample(letters[!(LETTERS %in% c("A", "E", "I", "O", "U"))], 1),
sample(letters[(LETTERS %in% c("A", "E", "I", "O", "U"))], 1),
sample(letters[!(LETTERS %in% c("A", "E", "I", "O", "U"))], 1)), collapse = "")
}
namesVec <- unlist(namesList)
Or in a vectorised fashion (which is better)
paste(
sample(LETTERS[!(LETTERS %in% c("A", "E", "I", "O", "U"))], 50, replace = TRUE),
sample(letters[(LETTERS %in% c("A", "E", "I", "O", "U"))], 50, replace = TRUE),
sample(letters[!(LETTERS %in% c("A", "E", "I", "O", "U"))], 50, replace = TRUE),
sample(letters[(LETTERS %in% c("A", "E", "I", "O", "U"))], 50, replace = TRUE),
sample(letters[!(LETTERS %in% c("A", "E", "I", "O", "U"))], 50, replace = TRUE),
sep = "")

Related

Separate entries in dataframe in new rows in R [duplicate]

This question already has answers here:
Split comma-separated strings in a column into separate rows
(6 answers)
Closed 28 days ago.
I have data.frame df below.
df <- data.frame(id = c(1:12),
A = c("alpha", "alpha", "beta", "beta", "gamma", "gamma", "gamma", "delta",
"epsilon", "epsilon", "zeta", "eta"),
B = c("a", "a; b", "a", "c; d; e", "e", "e", "c; f", "g", "a", "g; h", "f", "d"),
C = c(NA, 4, 2, 7, 4, NA, 9, 1, 1, NA, 3, NA),
D = c("ii", "ii", "i", "iii", "iv", "v", "viii", "v", "viii", "i", "iii", "i"))
Column 'B' contains four entries with semicolons. How can I copy each of these rows and enter in column 'B' each of the separate values?
The expected result df2 is:
df2 <- data.frame(id = c(1, 2, 2, 3, 4, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12),
A = c(rep("alpha", 3), rep("beta", 4), rep("gamma", 4), "delta", rep("epsilon", 3),
"zeta", "eta"),
B = c("a", "a", "b", "a", "c", "d", "e", "e", "e", "c", "f", "g", "a", "g", "h", "f", "d"),
C = c(NA, 4, 4, 2, 7, 7, 7, 4, NA, 9, 9, 1, 1, NA, NA, 3, NA),
D = c("ii", "ii", "ii", "i", "iii", "iii", "iii", "iv", "v", "viii", "viii", "v", "viii", "i", "i", "iii", "i"))
I tried this, but no luck:
df2 <- df
# split the values in column B
df2$B <- unlist(strsplit(as.character(df2$B), "; "))
# repeat the rows for each value in column B
df2 <- df2[rep(seq_len(nrow(df2)), sapply(strsplit(as.character(df1$B), "; "), length)),]
# match the number of rows in column B with the number of rows in df2
df2$id <- rep(df2$id, sapply(strsplit(as.character(df1$B), "; "), length))
# sort the dataframe by id
df2 <- df2[order(df2$id),]
We may use separate_rows here - specify the sep as ; followed by zero or more spaces (\\s*) to expand the rows
library(tidyr)
df_new <- separate_rows(df, B, sep = ";\\s*")
-checking with OP's expected
> all.equal(df_new, df2, check.attributes = FALSE)
[1] TRUE
In the base R, we may replicate the sequence of rows by the lengths of the list output
lst1 <- strsplit(df$B, ";\\s+")
df_new2 <- transform(df[rep(seq_len(nrow(df)), lengths(lst1)),], B = unlist(lst1))
row.names(df_new2) <- NULL

How to replace the wild card characters with sampled characters in R

I have the following sequence:
s0 <- "KDRH?THLA???RT?HLAK"
The wild card character there is indicated by ?.
What I want to do is to replace that character by sampled character from this vector:
AADict <- c("A", "R", "N", "D", "C", "E", "Q", "G", "H",
"I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V")
Since s0 has 5 wild cards ?, I would sample from AADict:
set.seed(1)
nof_wildcard <- 5
tolower(sample(AADict, nof_wildcard, TRUE))
Which gives [1] "d" "q" "a" "r" "l"
Hence the expected result is:
KDRH?THLA???RT?HLAK
KDRHdTHLAqarRTlHLAK
So the placement of the sampled character must be exactly in the same position as ?, but the order of the character is not important.
e.g. this answer is also acceptable: KDRHqTHLAdlaRTrHLAK.
How can I achieve that with R?
The other example are:
s1 <- "FKDHKHIDVKDRHRTHLAK????RTRHLAK"
s2 <- "FKHIDVKDRHRTRHLAK??????????"
One approach is to replace the "?" characters 'one at a time' using a loop, e.g.
s0 <- "KDRH?THLA???RT?HLAK"
AADict <- c("A", "R", "N", "D", "C", "E", "Q", "G", "H",
"I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V")
s0
#> [1] "KDRH?THLA???RT?HLAK"
repeat{s0 <- sub("\\?", sample(tolower(AADict), 1), s0); if(grepl("\\?", s0) == FALSE) break}
s0
#> [1] "KDRHtTHLAidwRTyHLAK"
s1 <- "FKDHKHIDVKDRHRTHLAK????RTRHLAK"
repeat{s1 <- sub("\\?", sample(tolower(AADict), 1), s1); if(grepl("\\?", s1) == FALSE) break}
s1
#> [1] "FKDHKHIDVKDRHRTHLAKrstaRTRHLAK"
s2 <- "FKHIDVKDRHRTRHLAK??????????"
repeat{s2 <- sub("\\?", sample(tolower(AADict), 1), s2); if(grepl("\\?", s2) == FALSE) break}
s2
#> [1] "FKHIDVKDRHRTRHLAKdvcfmheiqn"
Another approach which can also allow for sampling without replacement:
s0 <- "KDRH?THLA???RT?HLAK"
AADict <- c("A", "R", "N", "D", "C", "E", "Q", "G", "H",
"I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V")
matches <- gregexpr("\\?", s0)
regmatches(s0, matches) <- lapply(lengths(matches), sample, x = tolower(AADict), replace = FALSE)
s0
#> [1] "KDRHdTHLAlanRTiHLAK"
Created on 2022-10-22 by the reprex package (v2.0.1)
You could split your string in single characters which makes it easy to replace the wildcard without the need of a loop (was my first approach):
replace_wc <- function(x, dict) {
x <- strsplit(x, split = "")[[1]]
ix <- grepl("\\?", x)
x[ix] <- sample(dict, sum(ix), replace = TRUE)
return(paste0(x, collapse = ""))
}
s0 <- "KDRH?THLA???RT?HLAK"
AADict <- c(
"A", "R", "N", "D", "C", "E", "Q", "G", "H",
"I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V"
)
set.seed(1)
replace_wc(s0, tolower(AADict))
#> [1] "KDRHdTHLAqarRTlHLAK"
Here is a vectorized function to replace the "?" characters in a vector of strings.
fun <- function(x, dict = AADict) {
dict <- tolower(dict)
inx <- gregexpr("\\?", x)
sapply(seq_along(x), \(j) {
for(i in inx[[j]]) {
substr(x[j], i, i) <- sample(dict, 1L)
}
x[j]
})
}
AADict <- c("A", "R", "N", "D", "C", "E", "Q", "G", "H",
"I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V")
s0 <- "KDRH?THLA???RT?HLAK"
s1 <- "FKDHKHIDVKDRHRTHLAK????RTRHLAK"
s2 <- "FKHIDVKDRHRTRHLAK??????????"
fun(s0)
#> [1] "KDRHsTHLAwppRTwHLAK"
fun(s1)
#> [1] "FKDHKHIDVKDRHRTHLAKyfqfRTRHLAK"
fun(s2)
#> [1] "FKHIDVKDRHRTRHLAKnsfehqwmkv"
fun(c(s0, s1, s2))
#> [1] "KDRHiTHLAdssRTgHLAK" "FKDHKHIDVKDRHRTHLAKcdivRTRHLAK"
#> [3] "FKHIDVKDRHRTRHLAKfrpafwpnif"
Created on 2022-10-22 with reprex v2.0.2

Counting the occurrence of a word but only once per row (R)

I want to count the number of times a word appears but only once per row. How do I complete the code?
library(stringr)
var1 <- c("x", "x", "x", "x", "x", "x", "y", "y", "y", "y")
var2 <- c("x", "x", "b", "b", "c", "d", "e", "y", "g", "h")
var3 <- c("x", "x", "b", "b", "c", "d", "e", "y", "g", "h")
data <- data.frame(cbind(var1, var2, var3))
sum(str_count(data, "x"))
The result should be 6.
The following should do the trick:
sum(rowSums(data == "x") >= 1) # Thanks Maƫl
# [1] 6
which will check if there is at least one value per row (rowSums()) and add all the rows with 1+ up using sum()
Or alternatively (per Antreas's comment so it is not missed):
length(which(rowSums(data == "x") != 0)) # Thanks to Antreas Stefopoulos
Which counts the number of non-zero rows with length()

Adding extra track to outside of circos plot (circlize, chordDiagram)

I'm trying to recreate this figure below, where the "to" variable (i.e. target genes) is further grouped into outer (labelled) categories (i.e. receptors).
I have generated some example data, unfortunately I'm not sure what format is needed for the additional outer categories, but it's possibly not far off the link format.
library(circlize)
links <- data.frame(from = c("A", "B", "C", "B", "C"),
to = c("D", "E", "F", "D", "E"),
value = c(1, 1, 1, 1, 1))
categories <- data.frame(from = c("D", "E", "F", "D", "E"),
to = c("X", "X", "Y", "Y", "Y"),
value = c(1, 1, 1, 1, 1))
chordDiagram(links)
Any assistance greatly appreciated!

Error for graph saving loop - Must be length 1 (a summary value)

I'm trying to create and save graphs for individual organizations. I keep getting an error that says "Error in summarise_impl(.data, dots) :
Column Improved must be length 1 (a summary value), not 0"
The graphs work when I combine all the organizations together, so I'm not sure what is going on here!
Starting with this data:
library(ggpubr)
structure(list(Organization = c("A", "B", "C", "D", "E", "F",
"G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S"
), imp_imp20_Improved = c(55.6, 100, 50, 0, 57.1, 0, 0, 45, 50,
60, 100, 50, 66.7, 66.7, 33.3, 0, 50, 0, 50)), row.names = c(NA,
-19L), class = c("tbl_df", "tbl", "data.frame"))
org<- c("A", "B", "C", "D", "E", "F",
"G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S"
)
This is my code for the graph loop:
for(i in org) {
tiff(paste0("//graphs/",i,"_graph11.tiff"), units="in", width=3.5, height=3, res=300)
indicator_graph1<- indicators_ong %>%
filter(Organization==i) %>%
summarise(Improved = imp_imp20_Improved,
"Not Improved" = 100-imp_imp20_Improved)%>%
gather(key="group") %>%
arrange(desc(group))
labs <- paste0(indicator_graph1$group, "\n (", indicator_graph1$value,"%)")
z <- ggpie(indicator_graph1,"value",label=labs, fill= "group", color = "black", palette = c("darkgoldenrod1","azure3"), lab.pos = "in", lab.font = c(3,"black"),title="Improve 20")+
theme(legend.position ="none")+
font("title", size=10, hjust=0.5)
print(z)
dev.off()
}

Resources