Display a table of strings and their variations per row (R) - r

For a large database, I would like to find a solution where I could predefine the strings to be searched and then get a table that would contain the frequency of these strings and their possible variations per row.
strings <- c("dog", "cat", "mouse")
var1 <- c("black dog", "white dog", "angry dog", "dogs and cats are nice", "dog")
var2 <- c("white cat", "black cat", "tiny cat", NA, "cow")
var3 <- c("little mouse", "big mouse", NA, NA, "mouse")
data <- data.frame(var1, var2, var3)
The result should look like this while I am looking for dog, cat and mouse:
dog&cat 4
mouse 3

We may try
v1 <- do.call(paste, data)
stack(setNames(lapply(c( "\\bdog.*\\bcat|\\bcat.*\\bdog", "mouse"),
\(pat) sum(grepl(pat, v1))), c("dog&cat", "mouse")))[2:1]
ind values
1 dog&cat 4
2 mouse 3
Or if we need all the combinations
lst1 <- lapply(c(strings, combn(strings, 2, FUN = \(x)
sprintf("\\b%1$s.*\\b%2$s|\\b%2$s.*\\b%1$s", x[1], x[2]))),
\(pat) sum(grepl(pat, v1)))
names(lst1) <- c(strings, combn(strings, 2, FUN = paste, collapse = "&"))
stack(lst1)[2:1]
ind values
1 dog 5
2 cat 4
3 mouse 3
4 dog&cat 4
5 dog&mouse 3
6 cat&mouse 2
For more combinations, it may be better to use Reduce with individually applying grepl
lst1 <- lapply(1:3, \(n) {
vals <- colSums(combn(strings, n,
FUN = \(pats) Reduce(`&`, lapply(pats, \(pat) grepl(pat, v1)))))
nms <- combn(strings, n, FUN = paste, collapse = "&")
setNames(vals, nms)
})
stack(unlist(lst1))[2:1]
ind values
1 dog 5
2 cat 4
3 mouse 3
4 dog&cat 4
5 dog&mouse 3
6 cat&mouse 2
7 dog&cat&mouse 2
Or with tidyverse
library(dplyr)
library(stringr)
library(tidyr)
data %>%
unite(var, everything(), na.rm = TRUE, sep = " ") %>%
summarise(`dog&cat` = sum(str_detect(var,
"\\bdog.*\\bcat|\\bcat.*\\bdog")),
mouse = sum(str_detect(var, 'mouse'))) %>%
pivot_longer(everything())
-output
# A tibble: 2 × 2
name value
<chr> <int>
1 dog&cat 4
2 mouse 3

Related

More efficient way to purrr::map2 for a large dataframe

Is there a faster way to do the following, where in the real application, df has many rows (and therefore list_of_colnames has the same number of elements):
list_of_colnames <- list(c("A", "B"), c("A"))
some_vector <- c("fish", "cat")
map2(split(df, seq(nrow(df))), list_of_colnames, function(row, colnames) {
row$indicator <- ifelse(any(row[, colnames] %in% some_vector), 1, 0)
return(row)
})
While this current implementation works, it takes centuries for the big df. In fact I think split() is a major bottleneck.
Thank you!
One option may be to make use of row/column indexing
rowind <- rep(seq_len(nrow(df)), lengths(list_of_colnames) * nrow(df))
df$indicator <- +(tapply(c(t(df[unlist(list_of_colnames)])) %in% some_vector,
rowind, FUN = any))
-output
> df
A B indicator
1 fish A 1
2 hello cat 1
data
df <- data.frame(A = c('fish', 'hello'), B = c('A', 'cat'))
You can avoid splitting your data frame into a list all together and instead apply your condition across the rows using rowwise and c_across from dplyr:
library(dplyr)
library(purrr)
list_of_colnames <- list(c("A", "B"), c("A"))
some_vector <- c("fish", "cat")
map(list_of_colnames, ~
df %>%
rowwise() %>%
mutate(indicator = as.numeric(any(c_across(all_of(.x)) %in% some_vector))) %>%
ungroup()
)
Output
Still mapping over list_of_columns returns a list output:
[[1]]
# A tibble: 3 x 4
A B C indicator
<chr> <chr> <chr> <lgl>
1 fish dog bird TRUE
2 dog cat bird TRUE
3 bird lion cat FALSE
[[2]]
# A tibble: 3 x 4
A B C indicator
<chr> <chr> <chr> <lgl>
1 fish dog bird TRUE
2 dog cat bird FALSE
3 bird lion cat FALSE
Data
structure(list(A = c("fish", "dog", "bird"), B = c("dog", "cat",
"lion"), C = c("bird", "bird", "cat")), class = "data.frame", row.names = c(NA,
-3L))

How to count list inside dataframe in R

I've been trying to figure this out for a few hours and I'm hoping someone can point me in the right direction. I'm trying to get from the below data set named "current data"
current_data <-
tribble(
~ID, ~grade_Q1, ~points_Q1,
"1", c("D-", "C-", "C-", "C-"), c(1, 2, 2, 2),
"2", c("A", "B", "B+", "B+"), c(4, 3, 3, 3),
)
to the below dataset named "updated_data"
updated_data <-
tribble(
~ID, ~grade_Q1, ~points_Q1, ~n_grades,
"1", "D- C C- C-", "1 2 2 2", 4,
"2", "A B B+ B+ A", "4 3 3 3 4", 5
)
The "n_grades" column is literally just a count of the number of letter grades in the "grade_q1" column. Anyone have any ideas how to proceed?
We can get the lengths of 'grade_Q1' to create the n_grades, then loop over the list columns with map, concatenate into a single string with str_c
library(dplyr)
library(stringr)
library(purrr)
current_data %>%
mutate(n_grades = lengths(grade_Q1),
grade_Q1 = map_chr(grade_Q1, str_c, collapse= ' '),
points_Q1 = map_chr(points_Q1, str_c, collapse = ' '))
-output
# A tibble: 2 x 4
# ID grade_Q1 points_Q1 n_grades
# <chr> <chr> <chr> <int>
#1 1 D- C- C- C- 1 2 2 2 4
#2 2 A B B+ B+ A 4 3 3 3 4 5
If there are many columns, it can be simplified with across
current_data %>%
mutate(n_grades = lengths(grade_Q1),
across(c(grade_Q1, points_Q1), ~ map_chr(., str_c, collapse= ' ')))
Or using base R
current_data$n_grades <- lengths(current_data$grade_Q1)
current_data[c("grade_Q1", "points_Q1")] <-
lapply(current_data[c("grade_Q1", "points_Q1")],
sapply, paste, collapse= ' ')
A data.table option
setDT(current_data)[
,
c(
lapply(.SD, function(x) paste0(unlist(x), collapse = " ")),
n_grades = lengths(grade_Q1)
),
ID
][]
gives
ID grade_Q1 points_Q1 n_grades
1: 1 D- C- C- C- 1 2 2 2 4
2: 2 A B B+ B+ A 4 3 3 3 4 5

Turning a text column into a vector in r

I want to see whether the text column has elements outside the specified values of "a" and "b"
specified_value=c("a","b")
df=data.frame(key=c(1,2,3,4),text=c("a,b,c","a,d","1,2","a,b")
df_out=data.frame(key=c(1,2,3),text=c("c","d","1,2",NA))
This is what I have tried:
df=df%>%mutate(text_vector=strsplit(text, split=","),
extra=text_vector[which(!text_vector %in% specified_value)])
But this doesn't work, any suggestions?
We can split the 'text' by the delimiter , with separate_rows, grouped by 'key', get the elements that are not in 'specified_value' with setdiff and paste them together (toString), then do a join to get the other columns in the original dataset
library(dplyr) # >= 1.0.0
library(tidyr)
df %>%
separate_rows(text) %>%
group_by(key) %>%
summarise(extra = toString(setdiff(text, specified_value))) %>%
left_join(df) %>%
mutate(extra = na_if(extra, ""))
# A tibble: 4 x 3
# key extra text
# <dbl> <chr> <chr>
#1 1 c a,b,c
#2 2 d a,d
#3 3 1, 2 1,2
#4 4 <NA> a,b
Using setdiff.
df$outside <- sapply({
x <- lapply(strsplit(df$text, ","), setdiff, specified_value)
replace(x, lengths(x) == 0, NA)},
paste, collapse=",")
df
# key text outside
# 1 1 a,b,c c
# 2 2 a,d d
# 3 3 1,2 1,2
# 4 4 a,b NA
Data:
df <- structure(list(key = c(1, 2, 3, 4), text = c("a,b,c", "a,d",
"1,2", "a,b")), class = "data.frame", row.names = c(NA, -4L))
specified_value <- c("a", "b")
use stringi::stri_split_fixed
library(stringi)
!all(stri_split_fixed("a,b", ",", simplify=T) %in% specified_value) #FALSE
!all(stri_split_fixed("a,b,c", ",", simplify=T) %in% specified_value) #TRUE
An option using regex without splitting the data on comma :
#Collapse the specified_value in one string and remove from text
df$text1 <- gsub(paste0(specified_value, collapse = "|"), '', df$text)
#Remove extra commas
df$text1 <- gsub('(?<![a-z0-9]),', '', df$text1, perl = TRUE)
df
# key text text1
#1 1 a,b,c c
#2 2 a,d d
#3 3 1,2 1,2
#4 4 a,b

How to merge rows in a dataframe and combine factor-values in cells

I have a dataframe in R that in which I want to merge certain rows and combine the values of certain cells in these rows. Imagine the following data frame:
Col.1<-c("a","b","b","a","c","c","c","d")
Col.2<-c("mouse", "cat", "dog", "bird", "giraffe", "elephant", "zebra", "worm")
df<-data.frame(Col.1, Col.2)
df
Col.1 Col.2
a mouse
b cat
b dog
a bird
c giraffe
c elephant
c zebra
d worm
I would like to merge all adjacent rows in which the values in Col.1 are the same and combine the values in Col.2 accordingly.
The final result should look like this:
Col.1 Col.2
a mouse
b cat dog
a bird
c giraffe elephant zebra
d worm
I have tried to use a dplyr-solution (like:ddply(df, .(Col.1), summarize, Col.2 = sum(Col.2))), but the sum-command doesn't work for factor-values.
We can do a group by paste. To do the grouping for adjacent similar elements, rleid from data.table can be used, and then summarise the values of 'Col.2' by pasteing
library(dplyr)
library(data.table)
library(stringr)
df %>%
group_by(Col.1, grp = rleid(Col.1)) %>%
summarise(Col.2 = str_c(Col.2, collapse=' ')) %>%
ungroup %>%
select(-grp)
# A tibble: 5 x 2
# Col.1 Col.2
# <fct> <chr>
#1 a mouse
#2 a bird
#3 b cat dog
#4 c giraffe elephant zebra
#5 d worm
NOTE: This matches the output showed in the OP's post
EDIT: missed the "adjacent" bit. See the solution using base function rle below from this question.
Col.1 <- c("a","b","b","a","c","c","c","d")
Col.2 <- c("mouse", "cat", "dog", "bird", "giraffe", "elephant", "zebra", "worm")
df <- tibble(Col.1, Col.2)
rlel <- rle(df$Col.1)$length
df %>%
mutate(adj = unlist(lapply(1:length(rlel), function(i) rep(i, rlel[i])))) %>%
group_by(Col.1, adj) %>%
summarize(New.Col.2 = paste(Col.2, collapse = " ")) %>%
ungroup %>% arrange(adj) %>% select(-adj)
# A tibble: 5 x 2
Col.1 New.Col.2
<chr> <chr>
1 a mouse
2 b cat dog
3 a bird
4 c giraffe elephant zebra
5 d worm

Reordering columns in a large dataframe

Using the following example dataframe:
a <- c(1:5)
b <- c("Cat", "Dog", "Rabbit", "Cat", "Dog")
c <- c("Dog", "Rabbit", "Cat", "Dog", "Dog")
d <- c("Rabbit", "Cat", "Dog", "Dog", "Rabbit")
e <- c("Cat", "Dog", "Dog", "Rabbit", "Cat")
f <- c("Cat", "Dog", "Dog", "Rabbit", "Cat")
df <- data.frame(a,b,c,d,e,f)
I want to investigate how to reorder the columns WITHOUT having to type in all the column names, i.e., df[,c("a","d","e","f","b","c")]
How would I just say I want columns b and c AFTER column f? (only referencing the columns or range of columns that I want to move?).
Many thanks in advance for your help.
To move specific columns to the beginning or end of a data.frame, use select from the dplyr package and its everything() function. In this example we are sending to the end:
library(dplyr)
df %>%
select(-b, -c, everything())
a d e f b c
1 1 Rabbit Cat Cat Cat Dog
2 2 Cat Dog Dog Dog Rabbit
3 3 Dog Dog Dog Rabbit Cat
4 4 Dog Rabbit Rabbit Cat Dog
5 5 Rabbit Cat Cat Dog Dog
Without the negation, the columns would be sent to the front.
If you're just moving certain columns to the end, you can create a little helper-function like the following:
movetolast <- function(data, move) {
data[c(setdiff(names(data), move), move)]
}
movetolast(df, c("b", "c"))
# a d e f b c
# 1 1 Rabbit Cat Cat Cat Dog
# 2 2 Cat Dog Dog Dog Rabbit
# 3 3 Dog Dog Dog Rabbit Cat
# 4 4 Dog Rabbit Rabbit Cat Dog
# 5 5 Rabbit Cat Cat Dog Dog
I would not recommend getting too into the habit of using column positions, especially not from a programmatic standpoint, since those positions might change.
"For fun" update
Here's an extended interpretation of the above function. It allows you to move columns to either the first or last position, or to be before or after another column.
moveMe <- function(data, tomove, where = "last", ba = NULL) {
temp <- setdiff(names(data), tomove)
x <- switch(
where,
first = data[c(tomove, temp)],
last = data[c(temp, tomove)],
before = {
if (is.null(ba)) stop("must specify ba column")
if (length(ba) > 1) stop("ba must be a single character string")
data[append(temp, values = tomove, after = (match(ba, temp)-1))]
},
after = {
if (is.null(ba)) stop("must specify ba column")
if (length(ba) > 1) stop("ba must be a single character string")
data[append(temp, values = tomove, after = (match(ba, temp)))]
})
x
}
Try it with the following.
moveMe(df, c("b", "c"))
moveMe(df, c("b", "c"), "first")
moveMe(df, c("b", "c"), "before", "e")
moveMe(df, c("b", "c"), "after", "e")
You'll need to adapt it to have some error checking--for instance, if you try to move columns "b" and "c" to "before c", you'll (obviously) get an error.
You can refer to columns by position. e.g.
df <- df[ ,c(1,4:6,2:3)]
> df
a d e f b c
1 1 Rabbit Cat Cat Cat Dog
2 2 Cat Dog Dog Dog Rabbit
3 3 Dog Dog Dog Rabbit Cat
4 4 Dog Rabbit Rabbit Cat Dog
5 5 Rabbit Cat Cat Dog Dog
The package dplyr and the function dplyr::relocate, a new verb introduced in dplyr 1.0.0, does exactly what you are looking for with highly readable syntax.
df %>% dplyr::relocate(b, c, .after = f)
To generalize the reshuffling of columns in any order using dplyr, for example, to reshuffle:
df <- data.frame(a,b,c,d,e,f)
to
df[,c("a","d","e","f","b","c")]
df %>% select(a, d:f, b:c)
Use the subset function:
> df <- data.frame(a,b,c,d,e,f)
> df <- subset(df, select = c(a, d:f, b:c))
> df
a d e f b c
1 1 Rabbit Cat Cat Cat Dog
2 2 Cat Dog Dog Dog Rabbit
3 3 Dog Dog Dog Rabbit Cat
4 4 Dog Rabbit Rabbit Cat Dog
5 5 Rabbit Cat Cat Dog Dog
I changed the previous function to use it for data.table usinf the function setcolorder of the package data.table.
moveMeDataTable <-function(data, tomove, where = "last", ba = NULL) {
temp <- setdiff(names(data), tomove)
x <- switch(
where,
first = setcolorder(data,c(tomove, temp)),
last = setcolorder(data,c(temp, tomove)),
before = {
if (is.null(ba)) stop("must specify ba column")
if (length(ba) > 1) stop("ba must be a single character string")
order = append(temp, values = tomove, after = (match(ba, temp)-1))
setcolorder(data,order)
},
after = {
if (is.null(ba)) stop("must specify ba column")
if (length(ba) > 1) stop("ba must be a single character string")
order = append(temp, values = tomove, after = (match(ba, temp)))
setcolorder(data,order)
})
x
}
DT <- data.table(A=sample(3, 10, TRUE),
B=sample(letters[1:3], 10, TRUE), C=sample(10))
DT <- moveMeDataTable(DT, "C", "after", "A")
Here is another option:
df <- cbind( df[, -(2:3)], df[, 2:3] )

Resources