I have an example dataframe as below.
pr_id
product
name
id_234
onion,bean
chris
id_34d
apple
tom
id_87t
plantain, potato, apple
tex
I want to access the product column and create a new column and assign 1 if apple is in the list and 0 if not.
So i expect a result like this:
pr_id
product
name
result
id_234
onion,bean
chris
0
id_34d
apple
tom
1
id_87t
plantain, potato, apple
tex
1
I thought of something like this:
# `==` compares each whole product string with 'apple', so only an exact match yields 1.
my_df$result <- ifelse(my_df$product == 'apple', 1,0)
but this only works for rows 1 and 2; it does not work for the last row, which has multiple elements.
How do I go about this?
With dplyr, dataframe kindly taken from p. Paccioretti
Thanks to AnilGoyal for stringr::str_detect
# dplyr provides the pipe (%>%), mutate() and case_when() used below
library(dplyr)

# construct the dataframe
pr_id <- c("id_234", "id_34d", "id_87t")
product <- c("onion,bean", "apple", "plantain, potato, apple")
name <- c("chris", "tom", "tex")
my_df <- data.frame(pr_id, product, name)

# check with case_when and str_detect if apple is in product:
# 1 when the product string contains "apple", 0 otherwise
my_df <- my_df %>%
  mutate(result = case_when(stringr::str_detect(product, "apple") ~ 1,
                            TRUE ~ 0))
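You can use agrepl, which searches for approximate (fuzzy) matches within a string, so it finds 'apple' anywhere in the product list. Because the match is fuzzy it also accepts near-misses such as 'aple'; grepl('apple', x, fixed = TRUE) is the exact-substring equivalent. If you use ==, you are testing whether the whole string is exactly equal.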
# Example data written out as a plain list carrying data.frame attributes.
my_df <- structure(
  list(
    pr_id = c("id_234", "id_34d", "id_87t"),
    product = c("onion,bean", "apple", "plantain, potato, apple"),
    name = c("chris", "tom", "tex")
  ),
  row.names = c(NA, -3L),
  class = "data.frame"
)
# agrepl() yields TRUE/FALSE per row; as.numeric() turns that into 1/0.
my_df$result <- as.numeric(agrepl("apple", my_df$product))
Or a tidyverse approach
library(dplyr)
# Approximate match for "apple" in each product string, stored as numeric 1/0.
my_df <- mutate(my_df, result = ifelse(agrepl("apple", product), 1, 0))
my_df
#> pr_id product name result
#> 1 id_234 onion,bean chris 0
#> 2 id_34d apple tom 1
#> 3 id_87t plantain, potato, apple tex 1
Using str_count
library(dplyr)
library(stringr)
# str_count() returns the number of occurrences per string; comparing with 0
# gives the requested 1/0 flag even when "apple" appears more than once.
my_df %>%
  mutate(result = as.integer(str_count(product, "apple") > 0))
I would use the str_detect option in stringr (tidyverse option).
# Flag rows whose product mentions "apple": TRUE/FALSE converted to 1/0.
my_df <- my_df %>%
  mutate(result = as.numeric(str_detect(product, "apple")))
Related
My dataframe looks like this:
V1
c("cheese","bread","sugar","cream","milk","butter")
c("milk","butter","apples","cream","bread")
c("butter","milk","toffee")
c("cream","milk","butter","sugar")
I am trying to count the number of times each element appears and sum in a new column. I would like to end up with something like this:
V2 V3
cheese 1
bread 2
sugar 2
cream 3
milk 4
butter 4
apples 1
toffee 1
I have tried using the following code
# Flatten V1 into a single vector (names dropped), then tabulate each distinct value.
counts <- unlist(V1, use.names = FALSE)
counts <- table(counts)
But for some reason the counts are wrong and values are being skipped.
If I understand you correctly and your data is organized as provided below, then we could do it this way:
Using separate_rows will allow to bring all your strings in one row.
remove c and empty rows
Use fct_inorder from forcats package (it is in tidyverse) to keep the order as provided
then apply count with the name argument:
library(tidyverse)
df %>%
  # one row per token of every V1 string
  separate_rows(V1) %>%
  # discard the leftover "c" of c(...) and empty tokens
  filter(V1 != "c", V1 != "") %>%
  # factor levels in order of first appearance, so the counts keep that order
  mutate(V1 = fct_inorder(V1)) %>%
  count(V1, name = "V3")
V1 V3
<fct> <int>
1 cheese 1
2 bread 2
3 sugar 2
4 cream 3
5 milk 4
6 butter 4
7 apples 1
8 toffee 1
# Fixture: four rows, each holding the text of a c(...) call as a single string.
df <- structure(
  list(
    V1 = c(
      'c("cheese","bread","sugar","cream","milk","butter")',
      'c("milk","butter","apples","cream","bread")',
      'c("butter","milk","toffee")',
      'c("cream","milk","butter","sugar")'
    )
  ),
  row.names = c(NA, -4L),
  class = c("tbl_df", "tbl", "data.frame")
)
A couple of little issues with the question. Found it hard to reproduce exactly so took some liberties with the DF and present a couple of options that might help:
Option 1 - data in one column
library(tidyverse)
# All items stacked in one column, one item per row.
df <- data.frame(
  V1 = c(
    "cheese", "bread", "sugar", "cream", "milk", "butter",
    "milk", "butter", "apples", "cream", "bread",
    "butter", "milk", "toffee",
    "cream", "milk", "butter", "sugar"
  )
)
# Number of rows per distinct item.
df <- summarise(dplyr::group_by(df, V1), V3 = n())
Option 2 - data in columns - added NAs so it made a DF
library(tidyverse)
# Four item lists padded with NA to equal length so they fit in a data frame.
df <- data.frame(
  c("cheese", "bread", "sugar", "cream", "milk", "butter"),
  c("milk", "butter", "apples", "cream", "bread", NA),
  c("butter", "milk", "toffee", NA, NA, NA),
  c("cream", "milk", "butter", "sugar", NA, NA)
)
# Stack every cell into one column, drop the NA padding, count each item.
df <- data.frame(V1 = unlist(df)) %>%
  filter(!is.na(V1)) %>%
  group_by(V1) %>%
  summarise(V3 = n())
hope this helps!
I try to find the most frequent category within every row of a dataframe. A category can consist of multiple words split by a /.
library(tidyverse)
library(DescTools)
# example data: one comma-separated category string per id
id <- as.numeric(1:4)
categories <- c(
  "apple,shoes/socks,trousers/jeans,chocolate",
  "apple,NA,apple,chocolate",
  "shoes/socks,NA,NA,NA",
  "apple,apple,chocolate,chocolate"
)
df <- data.frame(id, categories)
# the solution I would like to achieve
solution <- mutate(
  df,
  winner = c("apple", "apple", "shoes/socks", "apple"),
  winner_count = c(1, 2, 1, 2)
)
Based on these answers I have tried the following:
Write a function that finds the most common word in a string of text using R
# Row by row: split categories on "," and tabulate the pieces.
# which.max() returns the position of the largest count in that table, so
# winner_count as written holds a position, not the count itself.
trial <- df %>%
rowwise() %>%
mutate(winner = names(which.max(table(categories %>% str_split(",")))),
winner_count = which.max(table(categories %>% str_split(",")))[[1]])
Also tried to follow this approach, however it also does not give me the required results
How to find the most repeated word in a vector with R
# str_split() returns a list with one element per row; that whole list is handed to Mode().
trial2 <- df %>%
mutate(winner = DescTools::Mode(str_split(categories, ","), na.rm = T))
I am mainly struggling because my most frequent category is not just one word but something like "shoes/socks" and the fact that I also have NAs. I don't want the NAs to be the "winner".
I don't care too much about the ties right now. I already have a follow up process in place where I handle the cases that have winner_count = 2.
split the categories on comma in separate rows, count their occurrence for each id, drop the NA values and select the top occurring row for each id
library(dplyr)
library(tidyr)
result <- df %>%
  # one row per category entry
  separate_rows(categories, sep = ",") %>%
  # the literal string "NA" must never win
  filter(categories != "NA") %>%
  count(id, categories, name = "winner_count") %>%
  group_by(id) %>%
  # single top row per id; ties resolved by taking the first
  slice_max(winner_count, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  rename(winner = categories) %>%
  # bring the original categories string back
  left_join(df, by = "id")
result
# id winner winner_count categories
# <dbl> <chr> <int> <chr>
#1 1 apple 1 apple,shoes/socks,trousers/jeans,chocolate
#2 2 apple 2 apple,NA,apple,chocolate
#3 3 shoes/socks 1 shoes/socks,NA,NA,NA
#4 4 apple 2 apple,apple,chocolate,chocolate
I have a df that looks like this, and I would like to build a new variable Main that holds the Math and/or ELA entries found in Subject. The sample data and my code are:
# Sample tibble: a single character column with comma-separated subjects per row.
df<- structure(list(Subject = c("Math", "Math,ELA", "Math,ELA, PE",
"PE, Math", "ART,ELA", "PE,ART")), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
# Attempt: for rows matching Math|ELA, paste0() is applied to the whole list
# returned by str_extract_all() on df$Subject.
df<-df %>%
+ mutate(Main=case_when (grepl("Math|ELA", Subject)~ paste0(str_extract_all(df$Subject, "Math|ELA"))))
However, my outcome is not the one I want. What did I do wrong? I feel that my code overcomplicates a simple step. Is there a better solution?
str_extract_all returns a list. We need to loop over the list and paste/str_c
library(dplyr)
library(stringr)
library(purrr)
df %>%
  mutate(
    Main = case_when(
      # rows that mention Math or ELA get their matches joined with ", ";
      # every other row falls through to NA
      str_detect(Subject, "Math|ELA") ~
        map_chr(str_extract_all(Subject, "Math|ELA"), ~ paste(.x, collapse = ", "))
    )
  )
-output
# A tibble: 6 x 2
# Subject Main
# <chr> <chr>
#1 Math Math
#2 Math,ELA Math, ELA
#3 Math,ELA, PE Math, ELA
#4 PE, Math Math
#5 ART,ELA ELA
#6 PE,ART <NA>
Or another option is separate_rows from tidyr
library(tidyr)
df %>%
  # remember the original row so the pieces can be regrouped
  mutate(rn = row_number()) %>%
  separate_rows(Subject) %>%
  group_by(rn) %>%
  # keep only Math/ELA among each row's pieces and join them with ", "
  summarise(Main = paste(intersect(Subject, c("Math", "ELA")), collapse = ", "),
            .groups = "drop") %>%
  select(-rn) %>%
  # attach Main next to the original Subject column
  bind_cols(df, .)
NOTE: calling paste0 directly on the list returned by str_extract_all does not collapse the matches of each element; we need to loop over the list and collapse each element separately.
Or another option is to use
# gsub() deletes every word except Math/ELA: the (*SKIP)(*FAIL) branch exempts those two
# from the \\w+ alternative (PCRE verbs, hence perl = TRUE).
# trimws() then strips the leftover leading/trailing commas together with any spaces after them.
trimws(gsub("(Math|ELA)(*SKIP)(*FAIL)|\\w+", "", df$Subject, perl = TRUE), whitespace = ",\\s*")
#[1] "Math" "Math,ELA" "Math,ELA" "Math" "ELA" ""
Here is a base R option using regmatches
# Extract every "Math"/"ELA" occurrence per row and join them with ", ";
# rows without any match become NA.
transform(
  df,
  Main = vapply(
    regmatches(Subject, gregexpr("Math|ELA", Subject)),
    # toString() of an empty match set is "", which replace() turns into NA
    function(x) replace(toString(x), !length(x), NA),
    # vapply guarantees a character vector even for zero rows
    character(1)
  )
)
which gives
Subject Main
1 Math Math
2 Math,ELA Math, ELA
3 Math,ELA, PE Math, ELA
4 PE, Math Math
5 ART,ELA ELA
6 PE,ART <NA>
I wish to search a dataframe (really, a categorized word list), and if the word is found, it returns the column name; if it is not found, it simply reproduces the word. The basic idea is below but I can't get it to work as expected:
#data frame to be searched
words <- data.frame(people=c("Mike", "Tom", "Molly", "Susan"),
dogs=c("Rex", "Fido", "King", "Roy"))
#data frame to work with
d <- data.frame(name=c("Roy","Tom", "Pat"))
# Attempt: the test compares name against d itself (words is never referenced),
# and colnames() is called without an argument.
d %>% mutate(
returned = ifelse(name %in% d, colnames(), name)
)
This returns:
name returned
1 Roy 2
2 Tom 3
3 Pat 1
However, it should return
name returned
1 Roy dog
2 Tom people
3 Pat Pat
I feel like my script is close, but not sure what to do to fix it.
Any help is appreciated!
The numbers in the 'returned' are due to the factor coercion to integer storage mode values. It can be avoided if we create a character class column with stringsAsFactors = FALSE while creating the data.frame or use as.character(name).
# Lookup table: one column per category, kept as character (no factors).
words <- data.frame(
  people = c("Mike", "Tom", "Molly", "Susan"),
  dogs = c("Rex", "Fido", "King", "Roy"),
  stringsAsFactors = FALSE
)
# Names to classify, also kept as character.
d <- data.frame(name = c("Roy", "Tom", "Pat"), stringsAsFactors = FALSE)
In addition to the factor issue, the OP's ifelse never uses the key/value dataset 'words': name %in% d compares each value of the 'name' column with the elements (columns) of the data.frame d itself, which is FALSE for every row. The second argument, colnames() with no input, would raise an error if it were evaluated, but because the test is FALSE everywhere, ifelse only evaluates the 'no' value, i.e. 'name'.
# Show what the test inside the OP's ifelse() evaluates to for each row.
mutate(d, i1 = name %in% d)
# name i1
#1 Roy FALSE
#2 Tom FALSE
#3 Pat FALSE
Because the 'name' is factor, its values are coerced to integer mode and that is what is showed in the output
We can use pivot_longer to convert to 'long' format and then do a right_join
library(dplyr)
library(tidyr)
words %>%
  # long format: `name` holds the category, `value` the word
  pivot_longer(everything()) %>%
  # keep every row of d; unmatched words get NA as category
  right_join(d, by = c("value" = "name")) %>%
  # fall back to the word itself when no category was found
  mutate(name = coalesce(name, value)) %>%
  select(returned = name, name = value)
# returned name
#1 dogs Roy
#2 people Tom
#3 Pat Pat
Or we can use case_when without any reshaping
# people is checked first, then dogs; anything else keeps its own name.
d %>%
  mutate(
    returned = if_else(
      name %in% words$people,
      "people",
      if_else(name %in% words$dogs, "dogs", as.character(name))
    )
  )
# name returned
#1 Roy dogs
#2 Tom people
#3 Pat Pat
Or using only base R
# stack() gives a long table (values, ind); look up each name's category label.
d$returned <- with(
  stack(words),
  as.character(ind)[match(d$name, values)]
)
# Names without a category keep their own value.
d$returned[is.na(d$returned)] <- d$name[is.na(d$returned)]
d
# name returned
#1 Roy dogs
#2 Tom people
#3 Pat Pat
We can get words data in long format and then do a left_join. For the returned values that do not match we can replace with name value.
library(dplyr)
d %>%
  left_join(
    # words in long format: category in `returned`, word in `value`
    tidyr::pivot_longer(words, cols = tidyr::everything(), names_to = "returned"),
    by = c("name" = "value")
  ) %>%
  # unmatched names fall back to the name itself
  mutate(returned = if_else(is.na(returned), name, returned))
# name returned
#1 Roy dogs
#2 Tom people
#3 Pat Pat
I want to iterate over column names of the data frame, then using dplyr, separate fields using a delimiter(->) found among the row fields. This is how the dataset looks like :
# Input data as printed by dput(): v1 and v3 contain the "->" delimiter, v2 does not.
dput(df)
structure(list(v1 = c("Silva->Mark", "Brandon->Livo", "Mango->Apple"),
v2 = c("Austin", "NA ", "Orange"),
v3 = c("James -> Jacy","NA->Jane", "apple -> Orange")),
class = "data.frame", row.names = c(NA, -3L))
I wrote code that finds the columns whose rows contain the delimiter (->); here these are columns v1 and v3. Here is the code:
# Flag each column that contains the "->" delimiter in at least one row.
# vapply() walks the columns directly (no matrix coercion as with apply())
# and always returns a named logical vector; grepl() is already vectorized.
rows_true <- vapply(df, function(x) any(grepl("->", x, fixed = TRUE)), logical(1))
# drop = FALSE keeps a data frame even when only one column qualifies.
ss <- df[, rows_true, drop = FALSE]
Then I tried to loop through those column names so that I can separate using the delimiter using this code but it ain't working
# Attempted loop (does not parse as written): the condition refers to `names`
# although the vector assigned above is `cols`, and the quotes around the
# second paste0() call are unbalanced.
cols<- names(df)
if (names %in% df){
splitcols <- ss %>%
tidyr::separate(cols, into = c(paste0(names,+ "old"), "paste0(names,+ "New")"), sep = "->")
}
The reason I am using paste0 is because I do want the columns split into two using the delimiter then the newly formed columns should be named using the original name plus suffix Old for the first one and New for second split column
End result after looping through column names and recursively separating them should look like this
# Desired result: each split column becomes <name>_Old (left of "->") and <name>_New (right of "->").
dput(df)
structure(list(v1_Old = c("Silva", "Brandon", "Mango"),
v1_New = c("Mark", "Livo", "Apple"),
v3_Old = c("James","NA", "apple"),
v3_New = c("Jacy","Jane", "Orange")),
class = "data.frame", row.names = c(NA, -3L))
For the sake of completeness, here is also a solution which uses data.table().
There are some differences to the other answers posted so far:
It is not required to identify the columns to be split beforehand. Instead, columns without "->" are dropped from the result on the fly.
The regular expression which is used for splitting includes surrounding white space (if any)
" *-> *". This avoids to call trimws() on the resulting pieces afterwards or to remove white space beforehand.
.
library(data.table)
library(magrittr) # piping used to improve readability
# Convert df to a data.table in place.
setDT(df)
# For every column: split on "->" (optional spaces on either side swallowed by the regex).
lapply(names(df), function(x) {
# pieces of column x, one result column per piece
mDT <- df[, tstrsplit(get(x), " *-> *")]
# only two-piece results are renamed and returned; for any other column the `if` yields NULL
if (ncol(mDT) == 2L) setnames(mDT, paste0(x, c("_Old", "_New")))
# combine the per-column results into one data.table
}) %>% as.data.table()
v1_Old v1_New v3_Old v3_New
1: Silva Mark James Jacy
2: Brandon Livo NA Jane
3: Mango Apple apple Orange
One possibility involving dplyr and tidyr could be:
df %>%
  select(v1, v3) %>%
  # row id so the pieces can be put back on their original row
  rowid_to_column() %>%
  gather(key = var, value = val, -rowid) %>%
  # one row per piece; convert = TRUE re-parses the pieces' types
  separate_rows(val, sep = "->", convert = TRUE) %>%
  group_by(rowid) %>%
  mutate(val = trimws(val)) %>%
  # second occurrence of a column name within a row gets a ".1" suffix
  mutate(var = make.unique(var)) %>%
  ungroup() %>%
  spread(key = var, value = val) %>%
  select(-rowid)
v1 v1.1 v3 v3.1
<chr> <chr> <chr> <chr>
1 Silva Mark James Jacy
2 Brandon Livo <NA> Jane
3 Mango Apple apple Orange
Or to further match the expected output:
df %>%
  select(v1, v3) %>%
  # row id so the pieces can be put back on their original row
  rowid_to_column() %>%
  gather(var, val, -rowid) %>%
  # one row per piece; convert = TRUE re-parses the pieces' types
  separate_rows(val, sep = "->", convert = TRUE) %>%
  group_by(rowid, var) %>%
  # the first piece (left of "->") is the old value, the second the new one
  mutate(val = trimws(val),
         var2 = if_else(row_number() == 1, paste0(var, "_Old"), paste0(var, "_New"))) %>%
  ungroup() %>%
  select(-var) %>%
  spread(var2, val) %>%
  # spread() orders the new columns alphabetically; restore Old-before-New
  # (this also drops rowid)
  select(v1_Old, v1_New, v3_Old, v3_New)
#   v1_Old  v1_New v3_Old v3_New
#   <chr>   <chr>  <chr>  <chr>
# 1 Silva   Mark   James  Jacy
# 2 Brandon Livo   <NA>   Jane
# 3 Mango   Apple  apple  Orange
A different approach with dplyr, purr, and stringr is the following.
library(dplyr)
library(purrr)
library(stringr)
# Detect the columns with at least one "->"
my_df_cols <- map_lgl(df, ~ any(str_detect(., "->")))
df %>%
  # Select only the columns with at least one "->"
  select(which(my_df_cols)) %>%
  # Mutate these columns and only keep the mutated columns with new names.
  # The split pattern swallows spaces around "->" so the pieces come out trimmed.
  transmute_all(list(old = ~ str_split(., "\\s*->\\s*", simplify = TRUE)[, 1],
                     new = ~ str_split(., "\\s*->\\s*", simplify = TRUE)[, 2]))
# v1_old v3_old v1_new v3_new
# 1 Silva James Mark Jacy
# 2 Brandon NA Livo Jane
# 3 Mango apple Apple Orange
We can also use cSplit from splitstackshape
#Detect columns with "->" in at least one row
cols <- names(df)[vapply(df, function(x) any(grepl("->", x, fixed = TRUE)), logical(1))]
#Remove unwanted whitespaces before and after "->" (and at both ends of the value),
#leaving any whitespace inside the individual pieces untouched
df[cols] <- lapply(df[cols], function(x) trimws(gsub("\\s*->\\s*", "->", x)))
#Split into new columns specifying sep as "->"
splitstackshape::cSplit(df[cols], cols, sep = "->")
# v1_1 v1_2 v3_1 v3_2
#1: Silva Mark James Jacy
#2: Brandon Livo <NA> Jane
#3: Mango Apple apple Orange