check a data.frame column (character) against 4 lists

check a data.frame column (character) against 4 lists - r

I want to check a word (in a column in a data-frame) against 4 lists (a, b, c, d):
if df$word is in a then df$code <- 1
if df$word is in b then df$code <- 2
if df$word is in c then df$code <- 3
if df$word is in d then df$code <- 4
if df$word is in a & b then df$code <- 1 2
if df$word is in a & c then df$code <- 1 3
if df$word is in a & d then df$code <- 1 4
if df$word is in b & c then df$code <- 2 3
if df$word is in b & d then df$code <- 2 4
if df$word is in c & d then df$code <- 3 4
etc.
What is the most efficient way to do so?
Example
df <- data.frame(word = c("book", "worm", "digital", "context"))
a <- c("book", "context")
b <- c("book", "worm", "context")
c <- c("digital", "worm", "context")
d <- c("context")
Expected output:
book 1 2
worm 2 3
digital 3
context 1 2 3 4

We can use a double sapply loop where for every element in the data frame we check in which list element it is present and get the corresponding list number.
# Positions in `lst` (1-4) are the codes reported for each word.
lst <- list(a, b, c, d)
# Outer sapply: one pass per word. Inner sapply: TRUE for every list that
# contains the word. `%in%` compares whole strings, whereas grepl() would
# treat the word as a regular expression and also hit substrings
# (e.g. "book" would match a list entry "bookworm").
df$output <- sapply(df$V1, function(x) {
  paste0(which(sapply(lst, function(y) x %in% y)), collapse = ",")
})
df
# V1 output
#1 book 1,2
#2 worm 2,3
#3 digital 3
#4 context 1,2,3,4
data
df <- read.table(text = "book
worm
digital
context")

Try this:
df <- data.frame(x =c("book", "worm","digital", "context"))
a <- c("book", "context")
b<- c("book", "worm", "context")
c <- c("digital", "worm", "context")
d <- c("context")
# Return the codes (1-4) of the reference vectors that contain `x`.
#
# `x` is a single word. `a`, `b`, `c` and `d` are the character vectors
# defined above; they are read as global variables, not passed in.
# The result is one string with the matching codes separated by spaces,
# e.g. "1 2", or "" when the word is in none of the vectors.
anno <- function(x) {
  lists <- list(a, b, c, d)
  hits <- vapply(lists, function(words) x %in% words, logical(1))
  # which() turns the logical hits into list positions, which are the codes.
  paste(which(hits), collapse = " ")
}
df$code <- sapply(df$x, anno)
df
#> x code
#> 1 book 1 2
#> 2 worm 2 3
#> 3 digital 3
#> 4 context 1 2 3 4
Created on 2018-08-17 by the reprex package (v0.2.0.9000).

This can also be accomplished in two steps:
Combine the four lists and reshape into long format
Aggregate while joining with df
using data.table:
library(data.table)
long <-setDT(melt(list(a, b, c, d), value.name = "word"))
long[setDT(df), on = "word", by = .EACHI, .(code = toString(L1))][]
word code
1: book 1, 2
2: worm 2, 3
3: digital 3
4: context 1, 2, 3, 4

Related

R - Reshaping repeated row value into column

I have data like this:
Name Rating
Tom 3
Tom 4
Tom 2
Johnson 5
Johnson 7
But I'd like it so each unique name is instead a column, with the ratings below, in each row. How can I approach this?

Here is a good way of doing it
# Example data: one row per (Name, Rating) pair.
x <- data.frame(
  Name = c("Tom", "Tom", "Tom", "Johnson", "Johnson"),
  Rating = c(3, 4, 2, 5, 7),
  stringsAsFactors = FALSE
)
n <- unique(x[, 1])       # one output column per distinct name
m <- max(table(x[, 1]))   # the most frequent name sets the number of rows
# The result keeps the name `c` used in the answer; calls such as c(l, l2)
# still reach base::c because R skips non-function objects when resolving
# a function call.
c <- data.frame(matrix(NA, ncol = length(n), nrow = m))
for (i in seq_along(n)) {
  l <- x[which(x[, 1] == n[i]), 2]
  # Pad shorter columns with empty strings; this makes every column character.
  l2 <- rep("", m - length(l))
  c[, i] <- c(l, l2)
}
colnames(c) <- n
Results:
Tom Johnson
1 3 5
2 4 7
3 2

Here is a way using the CRAN package reshape2.
library(reshape2)
d <- dcast(mydata, Rating ~ Name, value.var = "Rating")[-1]
d
# Johnson Tom
#1 NA 2
#2 NA 3
#3 NA 4
#4 5 NA
#5 7 NA
As you can see, there are too many NA values in this result. One way of getting rid of them could be:
# Drop the NA entries inside every column, keeping only the real ratings.
d <- lapply(d, function(col) col[!is.na(col)])
# Length of the longest column after the NAs are removed.
n <- max(lengths(d))
# Extending a vector with `length<-` pads it with NA at the end, so all
# columns line up again when bound into a data frame.
d <- do.call(cbind.data.frame, lapply(d, `length<-`, n))
d
# Johnson Tom
#1 5 2
#2 7 3
#3 NA 4

Well, this does the job but introduces some NAs.
Edit: Replace the NAs with some other Rating.
mydata<-data.frame(Name=c("Tom","Tom","Tom","Johnson","Johnson"),Rating=c(3,4,2,5,7))
library(reshape2)
library(tidyverse)
mydata1<-mydata %>%
mutate(Name=as.factor(Name)) %>%
melt(id.var="Name") %>%
dcast(variable+value~Name) %>%
select(-value) %>%
rename(Name=variable) %>%
select_if(is.numeric)
mydata1 %>%
mutate(Johnson=as.factor(Johnson),Tom=as.factor(Tom)) %>%
mutate(Johnson=fct_explicit_na(Johnson,na_level = "No Rating"),
Tom=fct_explicit_na(Tom,na_level = "No Rating"))
Johnson Tom
1 No Rating 2
2 No Rating 3
3 No Rating 4
4 5 No Rating
5 7 No Rating

How to split my columns using a unique and tidyR

I'm working on a data.table with a column like this:
A <- c("a;b;c","a;a;b","d;a;b","f;f;f")
df <- data.frame(A)
I would like to separate this column into 3 columns like this:
seg1 seg2 seg3
1 a b c
2 a b <NA>
3 d a b
4 f <NA> <NA>
The catch is that when I split each row by ";", I need to keep only the unique values within that row.

Here's a tidyverse approach. We split the character in A, keep only the unique values, paste the result back together and separate into three columns:
library(tidyverse)
df %>%
  # Split each string on ";", drop repeated pieces and glue the rest back
  # together; map_chr() keeps `A` an ordinary character column instead of
  # the list column that map() would produce.
  mutate(A = map_chr(strsplit(as.character(A), ";"),
                     .f = ~ paste(unique(.x), collapse = ";"))) %>%
  # Split on ";" explicitly; rows with fewer than three unique pieces are
  # padded with NA on the right.
  separate(A, into = c("seg1", "seg2", "seg3"), sep = ";", fill = "right")
Which gives:
# seg1 seg2 seg3
#1 a b c
#2 a b <NA>
#3 d a b
#4 f <NA> <NA>

library(stringr)
A <- c("a;b;c", "a;a;b", "d;a;b", "f;f;f")
df <- data.frame(A)
# Character matrix: one row per input string, one column per ";" piece.
df <- str_split_fixed(df$A, ";", 3)
# Per row, keep the first occurrence of each piece and index up to the
# column count so shorter results are padded with NA. apply() returns the
# per-row results as columns, hence the transpose.
df <- t(apply(df, 1, function(x) unique(x)[seq_len(ncol(df))]))
df <- setNames(as.data.frame(df), c("seg1", "seg2", "seg3"))
df
# seg1 seg2 seg3
# 1 a b c
# 2 a b <NA>
# 3 d a b
# 4 f <NA> <NA>

How to keep track of duplicated rows in a data frame when do a reduction by unique(df)?

This is a followup question of this question.
Imagine the following data frame:
a <- c(rep("A", 3), rep("B", 3), rep("A",2))
b <- c(1,1,2,4,1,1,2,2)
df <-data.frame(a,b)
which gives
a b
1 A 1
2 A 1
3 A 2
4 B 4
5 B 1
6 B 1
7 A 2
8 A 2
I reduce it to its unique rows by:
df_unique <- unique(df)
Now, I am wondering how can I keep track of the merged rows. I would like to create a new column in which each component has a list of row names that have been merged. Something like the following:
df_unique_informative =
a b track
1 A 1 [1,2]
3 A 2 [3,7,8]
4 B 4 [4]
5 B 1 [5,6]

res = aggregate(x = list(track = 1:NROW(df)), by = list(a = df$a, b = df$b), function(x) x)
# OR perhaps you want
#res = aggregate(x = list(track = 1:NROW(df)), by = list(a = df$a, b = df$b), function(x)
# paste(x, collapse = ", "))
res
# a b track
#1 A 1 1, 2
#2 B 1 5, 6
#3 A 2 3, 7, 8
#4 B 4 4
#Shorter code
res = aggregate(list(track = 1:NROW(df)), df[,1:2], '[')
Update
a <- c(rep("A", 3), rep("B", 3), rep("A",2))
b <- c(1,1,2,4,1,1,2,2)
c = letters[1:8]
df <-data.frame(a,b,c, stringsAsFactors = FALSE)
res = aggregate(x = list(track = 1:NROW(df)), by = list(a = df$a, b = df$b), function(x) df$c[x])
res
# a b track
#1 A 1 a, b
#2 B 1 e, f
#3 A 2 c, g, h
#4 B 4 d

Here is one option with tidyverse
library(tidyverse)
# Keep the original row names as a column so they survive the grouping.
rownames_to_column(df, "rn") %>%
  group_by(a, b) %>%
  # One row per unique (a, b) pair; `track` lists the rows merged into it.
  summarise(track = list(rn)) %>%
  # summarise() only peels off the last grouping level; drop the rest too.
  ungroup()

parent child structure in R dataframe

I have a csv that contains an org structure as follows, plus some additional columns. I use R to create charts and it works great!
The challenge is when trying to create the charts for a subset manager and its children/grandchildren.
Is there any filtering that is possible in dplyr or any alternative package?
Sample format:
emp_id mgr_id nest_id
A A 0
B A 1
C B 2
D C 3
D1 D 4
D2 D 4
E C 3
E1 E 4
F C 3
G B 2
H G 3
The subset I need is for manager "C"
Scenario 1:emp_id==C should contain all nodes of 'D','D1','D2','E','E1','F'
expected structure:
manager,all_children
C D
C D1
C D2
C E
C E1
C F
Scenario 2:emp_id==C should contain all above nodes but retain mgr_id structure for 'D','E'
expected structure:
manager,all_children
C D
C E
C F
D D1
D D2
E E1

Consider the base package with by which creates a df list for every level of mgr_id (not just C):
SCENARIO 1
# For every manager (each level of `mgr_id`), build a two-column table of the
# manager and everyone up to two levels below: the direct reports and their
# direct reports. Deeper levels are not followed.
dfList <- by(df, df$mgr_id, function(i) {
  # Suffix the subset's columns so they do not clash with `df` in the merges.
  names(i) <- paste0(names(i), "_")
  # Manager -> direct reports.
  child <- merge(i, df, by.x = "mgr_id_", by.y = "emp_id")[, 1:2]
  # Manager -> reports of the direct reports.
  grandchild <- merge(child, df, by.x = "emp_id_", by.y = "mgr_id")[c("mgr_id_", "emp_id")]
  # Strip the trailing underscore again so both pieces share column names.
  names(child) <- sub("_$", "", names(child))
  names(grandchild) <- sub("_$", "", names(grandchild))
  rbind(child, grandchild)
})
dfList$C
# mgr_id emp_id
# 1 C D
# 2 C E
# 3 C F
# 4 C D1
# 5 C D2
# 6 C E1
SCENARIO 2 (where the selected columns change in grandchild and then first column rename)
# For every manager (each level of `mgr_id`), build a two-column table that
# keeps the reporting structure: the manager with its direct reports, then
# each direct report with its own reports. Deeper levels are not followed.
dfList <- by(df, df$mgr_id, function(i) {
  # Suffix the subset's columns so they do not clash with `df` in the merges.
  names(i) <- paste0(names(i), "_")
  # Manager -> direct reports.
  child <- merge(i, df, by.x = "mgr_id_", by.y = "emp_id")[, 1:2]
  # Direct report (acting as manager) -> its own reports.
  grandchild <- merge(child, df, by.x = "emp_id_", by.y = "mgr_id")[c("emp_id_", "emp_id")]
  # Strip the trailing underscore so `child` has the final column names.
  names(child) <- sub("_$", "", names(child))
  # The first column holds the intermediate manager, so label it `mgr_id`.
  names(grandchild) <- c("mgr_id", "emp_id")
  rbind(child, grandchild)
})
dfList$C
# mgr_id emp_id
# 1 C D
# 2 C E
# 3 C F
# 4 D D1
# 5 D D2
# 6 E E1

Here is one solution using functions from dplyr and data.table. dt3 is the output for scenario 1, while dt4 is the output for scenario 2.
# Load packages
library(dplyr)
library(data.table)
# Create example data frame
dt <- read.table(text = "emp_id mgr_id nest_id
A A 0
B A 1
C B 2
D C 3
D1 D 4
D2 D 4
E C 3
E1 E 4
F C 3
G B 2
H G 3",
header = TRUE, stringsAsFactors = FALSE)
# Process the data
dt2 <- dt %>%
# Filter levels lower than 1
filter(nest_id > 1) %>%
mutate(group_id = ifelse(nest_id > 2, 0, 1)) %>%
# Create "run_id", which will be used to fill manager label
mutate(run_id = rleid(group_id)) %>%
mutate(run_id = ifelse(run_id %% 2 == 0, run_id - 1, run_id)) %>%
group_by(run_id) %>%
mutate(manager = first(emp_id)) %>%
# Select for manager C
filter(manager %in% "C") %>%
ungroup() %>%
# Remove rows if manager == emp_id
filter(manager != emp_id) %>%
rename(all_children = emp_id)
# Scenario 1
dt3 <- dt2 %>% select(manager, all_children)
# Scenario 2
dt4 <- dt2 %>%
select(manager = mgr_id, all_children) %>%
arrange(manager, all_children)

Concatenating all rows within a group using dplyr

Suppose I have a dataframe like this:
hand_id card_id card_name card_class
A 1 p alpha
A 2 q beta
A 3 r theta
B 2 q beta
B 3 r theta
B 4 s gamma
C 1 p alpha
C 2 q beta
I would like to concatenate the card_id, card_name, and card_class into one single row per hand level A, B, C. So the result would look something like this:
hand_id combo_1 combo_2 combo_3
A 1-2-3 p-q-r alpha-beta-theta
B 2-3-4 q-r-s beta-theta-gamma
....
I attempted to do this using group_by and mutate, but I can't seem to get it to work
data <- read_csv('data.csv')
byHand <- group_by(data, hand_id) %>%
mutate(combo_1 = paste(card_id),
combo_2 = paste(card_name),
combo_3 = paste(card_class))
Thank you for your help.

You were kind of close!
library(readr)  # provides read_csv(), which is not attached by tidyr or dplyr
library(tidyr)
library(dplyr)
data <- read_csv('data.csv')
# Collapse each hand to a single row: the values of every card column are
# joined with "-" in their original row order.
byHand <- group_by(data, hand_id) %>%
  summarise(combo_1 = paste(card_id, collapse = "-"),
            combo_2 = paste(card_name, collapse = "-"),
            combo_3 = paste(card_class, collapse = "-"))
or using summarise_each:
# summarise_each() and funs() are deprecated in current dplyr; across() is
# the equivalent that applies one function to every non-grouping column.
# The output columns keep their original names.
byHand <- group_by(data, hand_id) %>%
  summarise(across(everything(), ~ paste(.x, collapse = "-")))

Here is another option using data.table
library(data.table)
setDT(data)[, lapply(.SD, paste, collapse="-") , by = hand_id]
# hand_id card_id card_name card_class
#1: A 1-2-3 p-q-r alpha-beta-theta
#2: B 2-3-4 q-r-s beta-theta-gamma
#3: C 1-2 p-q alpha-beta

Not very familiar with dplyr... so here's my attempt without dplyr
df <- read_csv('data.csv')
res <- lapply(split(df, df$hand_id),function(x){
sL <- apply(x[,-1], 2, function(y) paste(y, collapse = "-"))
d <- data.frame(x$hand_id[1], rbind(sL))
names(d) <- c("hand_id", "combo_1", "combo_2", "combo_3")
return(d)
})
res <- do.call("rbind",res)
rownames(res) <- NULL
Here's the output:
## hand_id combo_1 combo_2 combo_3
## 1 A 1-2-3 p-q-r alpha-beta-theta
## 2 B 2-3-4 q-r-s beta-theta-gamma
## 3 C 1-2 p-q alpha-beta

If you have NAs in your data, you can use na.omit() inline with str_c(). unique() will also work if you only want the distinct values.
data:
hand_id card_id card_name card_class
<chr> <dbl> <chr> <chr>
1 A 1 p alpha
2 A 2 q beta
3 A 3 r theta
4 A NA NA NA
5 B 2 q beta
6 B 3 r theta
7 B 4 s gamma
8 C 1 p alpha
9 C 2 q beta
code:
data %>%
group_by(hand_id) %>%
summarize(card_id = str_c(na.omit(card_id), collapse = "-"),
card_name = str_c(na.omit(card_name), collapse = "-"),
card_class = str_c(na.omit(card_class), collapse = "-"))
output:
hand_id card_id card_name card_class
* <chr> <chr> <chr> <chr>
1 A 1-2-3 p-q-r alpha-beta-the…
2 B 2-3-4 q-r-s beta-theta-gam…
3 C 1-2 p-q alpha-beta

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

check a data.frame column (character) against 4 lists - r

Related

R - Reshaping repeated row value into column

How to split my columns using a unique and tidyR

How to keep track of duplicated rows in a data frame when do a reduction by unique(df)?

parent child structure in R dataframe

Concatenating all rows within a group using dplyr

Categories

Resources