R group by substring

R group by substring - r

Sample data
data = data.frame(id = c(1, 2, 3, 4, 5),
name = c("blue", "green", "red", "read", "HUE"),
WANT = c("ue", "re", "re", "re", "ue"))
To explain. If 'name' contains "ue", then WANT = "ue" and if 'name' contains 're' then WANT = "re". Capitalization does not matter.
This is my attempt:
df$attempt <- NA
df$attempt[substr(df$name) == "ue"] <- "ue"
df$attempt[substr(df$name) == "re"] <- "re"

A solution using stringr (part of the tidyverse).
library(tidyverse)
data2 <- data %>%
mutate(attempt = str_extract(name, pattern = regex("ue|re", ignore_case = TRUE)),
attempt = str_to_lower(attempt))
data2
# id name WANT attempt
# 1 1 blue ue ue
# 2 2 green re re
# 3 3 red re re
# 4 4 read re re
# 5 5 HUE ue ue
DATA
data = data.frame(id = c(1, 2, 3, 4, 5),
name = c("blue", "green", "red", "read", "HUE"),
WANT = c("ue", "re", "re", "re", "ue"))

Here is a couple of versions
data = data.frame(id = c(1, 2, 3, 4, 5),
name = c("blue", "green", "red", "read", "HUE"))
#base r version
data$want <- ifelse(grepl("ue", data$name, ignore.case = T), "ue",
ifelse(grepl("re", data$name, ignore.case = T), "re",
NA))
#tidyverse version
library(dplyr)
data <- data %>%
mutate(want = ifelse(grepl("ue", name, ignore.case = T), "ue",
ifelse(grepl("re", name, ignore.case = T), "re",
NA)))

Try using ifelse and mutate. grepl("ue",name,ignore.case = T) checks if ue or UE exists. Same logic applies to [re]
library(dplyr)
data = data%>%
mutate(Attempt = ifelse(grepl("ue",name,ignore.case = T),"ue",
ifelse(grepl("re",name,ignore.case = T),"re",NA)))

With purrr and dplyr:
library(dplyr)
library(purrr)
data %>%
mutate(group = map2_chr(WANT, name, ~ .x[grepl(.x, .y, ignore.case = TRUE)]))
Output:
id name WANT group
1 1 blue ue ue
2 2 green re re
3 3 red re re
4 4 read re re
5 5 HUE hu hu
Data:
data = data.frame(id = c(1, 2, 3, 4, 5),
name = c("blue", "green", "red", "read", "HUE"),
WANT = c("ue", "re", "re", "re", "hu"),
stringsAsFactors = FALSE)

Related

Convert multiple columns that are onehot encoded into a factor column

Some columns in the dataset are one hot encoded. I wish to convert them into one factor column.
I wish to write a code where I specify which columns to be combined and converted to factor column.
Below is an example with desired result.
library(tidyverse)
tbl <- tibble(
# one hot encoded
a1_blue = c(1, 0, 0),
a1_red = c(0, 1, 0),
a1_green = c(0, 0, 1),
# one hot encoded
a2_square = c(1, 0, 0),
a2_circle = c(0, 1, 0),
a2_dot = c(0, 0, 1),
a3_letters = factor(c("A", "B", "C"))
)
tbl_desired <- tibble(
a1_colors = factor(c("blue", "red", "green"),
levels = c("blue", "red", "green")),
a2_shapes = factor(c("square", "circle", "dot"),
levels = c("square", "circle", "dot")),
a3_letters = factor(c("A", "B", "C"))
)

This will give you the structure that you need. You can convert the columns to factors using mutate(across(everything(), as.factor)).
tbl %>%
pivot_longer(-a3_letters) %>%
filter(value != 0) %>%
separate(name, into = c("var", "val")) %>%
pivot_wider(-value, values_from = val, names_from = var)
#> # A tibble: 3 x 3
#> a3_letters a1 a2
#> <fct> <chr> <chr>
#> 1 A blue square
#> 2 B red circle
#> 3 C green dot

Selecting elements from a list with non compatible length

Given the following structure of the list:
x <- list(list(Main = list(one = list(tlv = 1, beta = 2), two = "three", three = 4,list_a = list(list(value_1 = "a1", value_2 = "b", c = "c")))),
list(Main = list(one = list(tlv = 2, beta = 6), two = "seven", three = 8,list_a = list(list(value_1 = "aa2", value_2 = "bb", c = "cc")))),
list(Main = list(one = list(tlv = 3),list_a = list(list(value_1 = c("aaa3", "aaaa4"), value_2 = c("bbb", "bbbb"), c = c("ccc", "ccc"))))))
I'm trying to create a dataframe with a structure like this:
tlv | value_1
1 | a1
2 | aa2
3 | aaa3
3 | aaaa4
so far I have to the following:
library(tidyverse)
tibble::tibble(
tlv = map(x, list(1,1,"tlv"), .default = NA) %>% unlist(),
value = map(x, list(1,"list_a", 1, "value"), .default = NA) %>% unlist())
Which leads to the following error:
Error: Tibble columns must have compatible sizes.
* Size 3: Existing data.
* Size 4: Column `value`.
i Only values of size one are recycled.
This makes sense given the structure of the list (3 values for one of the variables en 4 values for the other). But I don't see a solution to link the values to the parent element of the list. So that every 'value' also gets the corresponding 'tlv' value. Any guidance how to solve this problem?

Found a solution, this does the trick:
x %>%
map_df(~tibble(
tlv = .$Main$one$tlv,
value = .$Main$list_a[[1]]$value_1))

An alternative :
library(tidyverse)
value_1 <-
map_depth(x, 4, pluck, "value_1", .ragged = TRUE) %>%
map(unlist, use.names = FALSE)
tlv <-
map_depth(x, 3, pluck, "tlv") %>%
map_dbl(unlist, use.names = FALSE)
df <-
tibble(tlv = tlv, value_1 = value_1) %>%
unnest_auto(col = value_1)

Recoding values in a selection of columns of a dataframe using dplyr

I have a large dataset like the one in the next example. Columns with As in their headers have codes from 1 to 4, and columns with Bs from 1 to 3.
library(dplyr)
d <- data.frame(
ID = 1:10,
A = sample(x = 1:4, size = 10, replace = T),
AA = sample(x = 1:4, size = 10, replace = T),
B = sample(x = 1:3, size = 10, replace = T),
BB = sample(x = 1:3, size = 10, replace = T)
)
Is there a neat way to use pipes in dplyr to recode the values from columns with As in the headers and columns with Bs to the following strings?
As - from 1, 2, 3, 4 to Green, Yellow, Orange, Red respectively
Bs - from 1, 2, 3 to Green, Yellow, Red respectively
This is a simplified and friendlier version of the real dataset.

By using mutate_at from dplyr, it is possible to accomplish the recodification from numeric codes to strings. It is necessary to first coerce the columns we want to recode from numeric to character or, otherwise, there will be an error message.
library(dplyr)
d <- data.frame(
ID = 1:10,
A = sample(x = 1:4, size = 10, replace = T),
AA = sample(x = 1:4, size = 10, replace = T),
B = sample(x = 1:3, size = 10, replace = T),
BB = sample(x = 1:3, size = 10, replace = T))
d_recoded <- d %>% mutate_at(vars(-contains("ID")), funs(as.character)) %>%
mutate_at(vars(contains("A"), -contains("ID")), funs(case_when(. == 1 ~ "Green", . == 2 ~ "Yellow", . == 3 ~ "Orange", . == 4 ~ "Red"))) %>%
mutate_at(vars(contains("B"), -contains("ID")), funs(case_when(. == 1 ~ "Green", . == 2 ~ "Yellow", . == 3 ~ "Red")))

How to Convert a Data Frame to a List for Clickstream Analysis

I am a novice R user and new to the forum.
I have a data frame that I need to convert so that each row is a character vector.
I want to remove the 0's from the data frame so that each row can have varying lengths. So in essence each row is a separate character vector in a list.
Where I am at is the following:
mydf<-matrix(sample(0:1,12*5, replace = T),ncol =4)
colnames(mydf)<-letters[1:ncol(mydf)]
swapcol <-which(mydf == 1, arr.ind = T)
mydf[swapcol]<-colnames(mydf)[swapcol[,2]]
mydf
The code produces a data frame in which the column labels are values. I need the following output:
Desired List Result
the format appears to be what I need in order to read in data to the package clickstream.
Thanks

Try this solution:
library(tidyverse)
s <- sample(x = 0:1, size = 15 * 4, replace = TRUE)
mx <- matrix(data = s, nrow = 15, ncol = 4, byrow = TRUE,
dimnames = list(c(paste("User", 1:15, sep = " ")), c("V1", "V2", "V3", "V4")))
df2 <- mx %>% as.data.frame() %>% rownames_to_column() %>% as_tibble()
%>% mutate(
V1 = ifelse(test = V1 == 1, yes = "a", no = NA),
V2 = ifelse(test = V2 == 1, yes = "b", no = NA),
V3 = ifelse(test = V3 == 1, yes = "c", no = NA),
V4 = ifelse(test = V4 == 1, yes = "d", no = NA))
mx2 <- t(apply(X = df2, MARGIN = 1, FUN = function(x{return(c(x[!is.na(x)],
x[is.na(x)]))}))

This returns a list with the formart you are asking for:
list(
apply(mydf, 1, function(a_row) {
my_paste <- function(...){
paste(..., sep = ", ")
}
a_row <- Reduce(my_paste, a_row)
a_row <- gsub("0(, )*", "", a_row)
a_row <- gsub(", $", "", a_row)
})
)
This returns a list of length 1. Replacing list with as.list, returns a list of length 15.

Return nested list with nested level and value

I would like to visualize some deeply nested data using networkD3. I can't figure out how to get the data into the right format before sending to radialNetwork.
Here is some sample data:
level <- c(1, 2, 3, 4, 4, 3, 4, 4, 1, 2, 3)
value <- letters[1:11]
where level indicates the level of the nest, and value is the name of the node. By using these two vectors, I need to get the data into the following format:
my_list <- list(
name = "root",
children = list(
list(
name = value[1], ## a
children = list(list(
name = value[2], ## b
children = list(list(
name = value[3], ## c
children = list(
list(name = value[4]), ## d
list(name = value[5]) ## e
)
),
list(
name = value[6], ## f
children = list(
list(name = value[7]), ## g
list(name = value[8]) ## h
)
))
))
),
list(
name = value[9], ## i
children = list(list(
name = value[10], ## j
children = list(list(
name = value[11] ## k
))
))
)
)
)
Here is the deparsed object:
> dput(my_list)
# structure(list(name = "root",
# children = list(
# structure(list(
# name = "a",
# children = list(structure(
# list(name = "b",
# children = list(
# structure(list(
# name = "c", children = list(
# structure(list(name = "d"), .Names = "name"),
# structure(list(name = "e"), .Names = "name")
# )
# ), .Names = c("name",
# "children")), structure(list(
# name = "f", children = list(
# structure(list(name = "g"), .Names = "name"),
# structure(list(name = "h"), .Names = "name")
# )
# ), .Names = c("name",
# "children"))
# )), .Names = c("name", "children")
# ))
# ), .Names = c("name",
# "children")), structure(list(
# name = "i", children = list(structure(
# list(name = "j", children = list(structure(
# list(name = "k"), .Names = "name"
# ))), .Names = c("name",
# "children")
# ))
# ), .Names = c("name", "children"))
# )),
# .Names = c("name",
# "children"))
Then I can pass it to the final plotting function:
library(networkD3)
radialNetwork(List = my_list)
The output will look similar to this:
Question: How can I create the nested list?
Note: As pointed out by #zx8754, there is already a solution in this SO post, but that requires data.frame as input. Due to the inconsistency in my level, I don't see a simple way to transform it into a data.frame.

Using a data.table-style merge:
library(data.table)
dt = data.table(idx=1:length(value), level, parent=value)
dt = dt[dt[, .(i=idx, level=level-1, child=parent)], on=.(level, idx < i), mult='last']
dt[is.na(parent), parent:= 'root'][, c('idx','level'):= NULL]
> dt
# parent child
# 1: root a
# 2: a b
# 3: b c
# 4: c d
# 5: c e
# 6: b f
# 7: f g
# 8: f h
# 9: root i
# 10: i j
# 11: j k
Now we can use the solution from the other post:
x = maketreelist(as.data.frame(dt))
> identical(x, my_list)
# [1] TRUE

As a preface, your data is difficult to work with because critical information is encoded in the order of the values in level. I don't know how you get those values in that order, but consider that there may be a better way to structure that information in the first place, which would make the next task easier.
Here's a base-y way of converting your data into a data frame with 2 columns, parent and child, then passing that into data.tree functions that can easily convert to the JSON format you need... and then pass it on to radialNetwork...
level <- c(1, 2, 3, 4, 4, 3, 4, 4, 1, 2, 3)
value <- letters[1:11]
library(data.tree)
library(networkD3)
parent_idx <- sapply(1:length(level), function(n) rev(which(level[1:n] < level[n]))[1])
df <- data.frame(parent = value[parent_idx], child = value, stringsAsFactors = F)
df$parent[is.na(df$parent)] <- ""
list <- ToListExplicit(FromDataFrameNetwork(df), unname = T)
radialNetwork(list)
Here's a tidyverse way of achieving the same...
level <- c(1, 2, 3, 4, 4, 3, 4, 4, 1, 2, 3)
value <- letters[1:11]
library(tidyverse)
library(data.tree)
library(networkD3)
data.frame(level, value, stringsAsFactors = F) %>%
mutate(row = row_number()) %>%
mutate(level2 = level, value2 = value) %>%
spread(level2, value2) %>%
mutate(`0` = "") %>%
arrange(row) %>%
fill(-level, -value, -row) %>%
gather(parent_level, parent, -level, -value, -row) %>%
filter(parent_level == level - 1) %>%
arrange(row) %>%
select(parent, child = value) %>%
data.tree::FromDataFrameNetwork() %>%
data.tree::ToListExplicit(unname = TRUE) %>%
radialNetwork()
and for a bonus, the current dev version of networkD3 (v0.4.9000) has a new treeNetwork function that takes a data frame with nodeId and parentId columns/variables, which eliminates the need for the data.tree fucntions to convert to JSON, so something like this works...
level <- c(1, 2, 3, 4, 4, 3, 4, 4, 1, 2, 3)
value <- letters[1:11]
library(tidyverse)
library(networkD3)
data.frame(level, value, stringsAsFactors = F) %>%
mutate(row = row_number()) %>%
mutate(level2 = level, value2 = value) %>%
spread(level2, value2) %>%
mutate(`0` = "root") %>%
arrange(row) %>%
fill(-level, -value, -row) %>%
gather(parent_level, parent, -level, -value, -row) %>%
filter(parent_level == level - 1) %>%
arrange(row) %>%
select(nodeId = value, parentId = parent) %>%
rbind(data.frame(nodeId = "root", parentId = NA)) %>%
mutate(name = nodeId) %>%
treeNetwork(direction = "radial")

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

R group by substring - r

Try using ifelse and mutate. grepl("ue",name,ignore.case = T) checks if ue or UE exists. Same logic applies to [re] library(dplyr) data = data%>% mutate(Attempt = ifelse(grepl("ue",name,ignore.case = T),"ue", ifelse(grepl("re",name,ignore.case = T),"re",NA)))

Related

Convert multiple columns that are onehot encoded into a factor column

Selecting elements from a list with non compatible length

Recoding values in a selection of columns of a dataframe using dplyr

How to Convert a Data Frame to a List for Clickstream Analysis

Return nested list with nested level and value

Categories

Resources