Multiple Values in One Cell using R - r

Suppose, there are 2 data.frames, for instance:
dat1 <- read.table("[path_dat1]", header=TRUE, sep=",")
id name age
1 Jack 21
2 James 40
dat2 <- read.table("[path_dat2]", header=TRUE, sep=",")
id interests
1 football
1 basketball
1 soccer
2 pingpang ball
How do I join table 1 and table 2 into a data.frame like the one below?
id name age interests
1 1 Jack 21 (football, basketball, soccer)
2 2 James 40 (pingpang ball)
How can I join these using plyr in the simplest way?

I can't tell you how to solve this in plyr but can in base:
# Collapse all interests that share an id into one comma-separated string,
# then attach that string to the person table by id.
dat3 <- aggregate(interests ~ id, data = dat2, FUN = paste, collapse = ",")
merge(dat1, dat3, by = "id")
EDIT: If you really want the parentheses you could use:
# Join the elements of `x` with commas, strip whitespace from both ends of the
# joined string, and wrap the result in parentheses.
ppaste <- function(x) {
  joined <- paste(x, collapse = ",")
  trimmed <- gsub("^\\s+|\\s+$", "", joined)
  paste0("(", trimmed, ")")
}
dat3 <- aggregate(interests~id, dat2, ppaste)
merge(dat1, dat3, "id")

Using Tyler's example:
# For every id in dat1, look up the rows of dat2 with that id and collapse
# their interests into one comma-separated string. ave() groups dat1$id by
# itself, passes each group's ids to FUN as `x`, and writes FUN's result back
# to every row of that group.
dat1$interests <- ave(dat1$id, dat1$id,
FUN=function(x) paste(dat2[ dat2$id %in% x, "interests"], collapse=",") )
> dat1
id name age interests
1 1 Jack 21 football, basketball, soccer
2 2 James 40 pingpang ball

Related

How to find intersect elements of concatenated string?

# Sample data: one row per (visit, customer) with the customer's basket and the
# ideal basket, each stored as a comma-separated string.
basket_customer <- c(
  "apple,orange,banana",
  "apple,banana,orange",
  "strawberry,blueberry"
)
basket_ideal <- c(
  "orange,banana",
  "orange,apple,banana",
  "strawberry,watermelon"
)
customer_name <- c("john", "adam", "john")
visit_id <- c("1001", "1001", "1003")

df2 <- cbind.data.frame(basket_customer, basket_ideal, customer_name, visit_id)

# Make sure both basket columns are plain character vectors (older R versions
# turn strings into factors when building a data frame).
df2$basket_customer <- as.character(df2$basket_customer)
df2$basket_ideal <- as.character(df2$basket_ideal)
The goal is to compare the basket elements (fruits) of each customer to the ideal basket and return the missing fruit.
Note that the same visit_id can exist for one or more users, so the unique ID is (visit_id + customer_name), and the basket elements are not alphabetically sorted.
expected output:
visit_id
customer_name
NOT_in_basket_ideal
NOT_in_basket_customer
1001
john
apple
NA
1001
adam
NA
NA
1003
john
blueberry
watermelon
I tried using row_wise(),intersect(),except(),and unnesting however did not succeed. Thank you
We could use Map to loop over the corresponding elements of the list columns, and use setdiff to get the elements of the first vector not in the second
# Split each comma-separated basket string into a character vector of items
# (the pattern also consumes any whitespace that directly follows a comma).
cst_list <- strsplit(df2$basket_customer, ",\\s*")
idl_list <- strsplit(df2$basket_ideal, ",\\s*")
# Row by row: the items of the customer basket that are absent from the ideal
# basket. identical() is order-sensitive, so 'equal' is produced only when both
# baskets list the same items in the same order.
lst1 <- Map(function(x, y) if(identical(x, y)) 'equal'
else setdiff(x, y), cst_list, idl_list)
# An empty difference means nothing is missing; mark those rows with NA.
lst1[lengths(lst1) == 0] <- NA_character_
# Collapse every element to one string (an NA element becomes the string "NA").
v1 <- sapply(lst1, toString)
and the second case, just reverse the order
lst2 <- Map(function(x, y) if(identical(x, y)) 'equal'
else setdiff(y, x), cst_list, idl_list)
lst2[lengths(lst2) == 0] <- NA_character_
v2 <- sapply(lst2, toString)
Combining the output from both to 'df2'
df2[c("NOT_in_basket_ideal", "NOT_in_basket_customer")] <- list(v1, v2)
-output
df2[-(1:2)]
# customer_name visit_id NOT_in_basket_ideal NOT_in_basket_customer
#1 john 1001 apple NA
#2 adam 1001 NA NA
#3 john 1003 blueberry watermelon
Or in tidyverse
library(dplyr)
library(purrr)
library(stringr)
df2 %>%
mutate(across(starts_with('basket'), ~ str_extract_all(., "\\w+"))) %>%
transmute(customer_name, visit_id,
NOT_in_basket_ideal = map2_chr(basket_customer,
basket_ideal, ~ toString(setdiff(.x, .y))),
NOT_in_basket_customer = map2_chr(basket_ideal, basket_customer,
~ toString(setdiff(.x, .y))))
# customer_name visit_id NOT_in_basket_ideal NOT_in_basket_customer
#1 john 1001 apple
#2 adam 1001
#3 john 1003 blueberry watermelon

R - Reshaping repeated row value into column

I have data like this:
Name Rating
Tom 3
Tom 4
Tom 2
Johnson 5
Johnson 7
But I'd like it so each unique name is instead a column, with the ratings below, in each row. How can I approach this?
Here is a good way of doing it
# Example data in long format: one row per (Name, Rating) pair.
x <- data.frame(
  Name = c("Tom", "Tom", "Tom", "Johnson", "Johnson"),
  Rating = c(3, 4, 2, 5, 7)
)

n <- unique(x[, 1])      # distinct names, one output column each
m <- max(table(x[, 1]))  # the longest group fixes the number of rows

# Result holder. It is deliberately not called `c`: that name would shadow
# base::c(), which the loop below relies on.
wide <- data.frame(matrix(NA, ncol = length(n), nrow = m))

# seq_along() keeps the loop from running when there are no names at all
# (1:length(n) would iterate over c(1, 0)).
for (i in seq_along(n)) {
  l <- x[x[, 1] == n[i], 2]
  # Pad shorter groups with "" so they print as blanks; note that this turns
  # the whole column into character.
  wide[, i] <- c(l, rep("", m - length(l)))
}
colnames(wide) <- n
Results:
Tom Johnson
1 3 5
2 4 7
3 2
Here is a way using the CRAN package reshape2.
library(reshape2)
d <- dcast(mydata, Rating ~ Name, value.var = "Rating")[-1]
d
# Johnson Tom
#1 NA 2
#2 NA 3
#3 NA 4
#4 5 NA
#5 7 NA
As you can see, there are too many NA values in this result. One way of getting rid of them could be:
# Drop the NAs inside every column so the real ratings move to the top.
d <- lapply(d, function(col) col[!is.na(col)])
# Length of the longest remaining column.
n <- max(lengths(d))
# Extend every column to that length (the new slots are NA) and bind the
# columns back into a data frame.
d <- do.call(cbind.data.frame, lapply(d, `length<-`, n))
d
# Johnson Tom
#1 5 2
#2 7 3
#3 NA 4
Well, this does the job but introduces some NAs.
Edit: Replace the NAs with some other Rating.
mydata<-data.frame(Name=c("Tom","Tom","Tom","Johnson","Johnson"),Rating=c(3,4,2,5,7))
library(reshape2)
library(tidyverse)
mydata1<-mydata %>%
mutate(Name=as.factor(Name)) %>%
melt(id.var="Name") %>%
dcast(variable+value~Name) %>%
select(-value) %>%
rename(Name=variable) %>%
select_if(is.numeric)
mydata1 %>%
mutate(Johnson=as.factor(Johnson),Tom=as.factor(Tom)) %>%
mutate(Johnson=fct_explicit_na(Johnson,na_level = "No Rating"),
Tom=fct_explicit_na(Tom,na_level = "No Rating"))
Johnson Tom
1 No Rating 2
2 No Rating 3
3 No Rating 4
4 5 No Rating
5 7 No Rating

Put the combinations matrix of many rows in a column of a dataframe, then split it

I have a dataframe that looks like this (I simplify):
df <- data.frame(rbind(c(1, "dog", "cat", "rabbit"), c(2, "apple", "peach", "cucumber")))
colnames(df) <- c("ID", "V1", "V2", "V3")
## ID V1 V2 V3
## 1 1 dog cat rabbit
## 2 2 apple peach cucumber
I would like to create a column containing all possible combinations of variables V1:V3 two by two (order doesn't matter), but keeping a link with the original ID. So something like this.
## ID bigrams
## 1 1 dog cat
## 2 1 cat rabbit
## 3 1 dog rabbit
## 4 2 apple peach
## 5 2 apple cucumber
## 6 2 peach cucumber
My idea: use combn(), mutate() and separate_rows().
library(tidyr)
library(dplyr)
df %>%
mutate(bigrams=paste(unlist(t(combn(df[,2:4],2))), collapse="-")) %>%
separate_rows(bigrams, sep="-") %>%
select(ID,bigrams)
The result is not what I expected... I guess that concatenating a matrix (the result of combn()) is not as easy as that.
I have two questions about this: 1) how to debug this code? 2) Is this a good way to do this kind of thing? I'm new on R but I’ve an Open Refine background, so concatenate-split multivalued cells make a lot of sense for me. But is this also the right method with R?
Thanks in advance for any help.
We can do this with data.table. Convert the 'data.frame' to 'data.table' (setDT(df)), melt it to 'long' format, grouped by 'ID', get the combn of 'value' and paste it together
library(data.table)
dM <- melt(setDT(df), id.var = "ID")[, combn(value, 2, FUN = paste, collapse=' '), ID]
setnames(dM, 2, 'bigrams')[]
# ID bigrams
#1: 1 dog cat
#2: 1 dog rabbit
#3: 1 cat rabbit
#4: 2 apple peach
#5: 2 apple cucumber
#6: 2 peach cucumber
I recommend @akrun's "melt first" approach, but just for fun, here are more ways to do it:
library(tidyverse)
df %>%
mutate_all(as.character) %>%
transmute(ID = ID, bigrams = pmap(
list(V1, V2, V3),
function(a, b, c) combn(c(a, b, c), 2, paste, collapse = " ")
))
# ID bigrams
# 1 1 dog cat, dog rabbit, cat rabbit
# 2 2 apple peach, apple cucumber, peach cucumber
(mutate_all(as.character) just because you gave us factors, and factor to character conversion can be surprising).
df %>%
mutate_all(as.character) %>%
nest(-ID) %>%
mutate(bigrams = map(data, combn, 2, paste, collapse = " ")) %>%
unnest(data) %>%
as.data.frame()
# ID bigrams V1 V2 V3
# 1 1 dog cat, dog rabbit, cat rabbit dog cat rabbit
# 2 2 apple peach, apple cucumber, peach cucumber apple peach cucumber
(as.data.frame() just for a prettier printing)

R function that merge rows and introduce a new merge variable

I have a data set like this....
ID Brand
--- --------
1 Cokacola
2 Pepsi
3 merge with 1
4 merge with 2
5 merge with 1
6 Fanta
And I want to write a R function which merge the rows and introduce new variable according to ID just like following...
ID Brand merge
---- -------- --------
1 Cokacola 1,3,5
2 Pepsi 2,4
6 Fanta 6
Your data:
dat <- data.frame(
id = 1:6,
brand = c('Cokacola', 'Pepsi', 'merge with 1', 'merge with 2', 'merge with 1', 'Fanta'))
Inelegant-but-functional code:
# Rows whose brand reads "merge with <id>" get folded into the row they name.
repeats <- grepl("^merge with", dat$brand)
# Group key of every row: the referenced id for "merge with" rows, the row's
# own id otherwise.
groups <- ifelse(repeats, gsub("merge with ", "", dat$brand), dat$id)
# Keep all ids around before the "merge with" rows are dropped.
all_ids <- dat$id
dat <- dat[!repeats, ]
# Build the merged-id string per surviving row, keyed by that row's own id.
# Relying on the order of unique(groups) instead would misattribute the strings
# whenever a "merge with k" row appears before row k.
dat$merge <- vapply(
  as.character(dat$id),
  function(key) paste(all_ids[groups == key], collapse = ","),
  character(1),
  USE.NAMES = FALSE
)
dat
## id brand merge
## 1 1 Cokacola 1,3,5
## 2 2 Pepsi 2,4
## 6 6 Fanta 6
There are most certainly ways to make this more elegant, depending on the consistency and makeup of the data.
You could try
library(reshape2)
# TRUE for rows that carry a real brand, FALSE for "merge with <ID>" rows.
indx <- !grepl('merge', df$Brand)
df1 <- df[indx,]
# Target ID named by each "merge with <ID>" row.
val <- as.numeric(sub('[^0-9]+', '', df[!indx, 'Brand']))
# Collect the IDs of the merged rows per target. The ID column is used rather
# than which(!indx), because row positions only equal the IDs when the IDs
# happen to be 1..n in order.
ml <- melt(tapply(df$ID[!indx], val, FUN=toString))
df2 <- merge(df1, ml, by.x='ID', by.y='Var1', all=TRUE)
# Prefix each target's own ID; rows nothing was merged into keep just their ID.
df2$merge <- with(df2, ifelse(!is.na(value),
paste(ID, value, sep=', '), ID))
# Drop the helper `value` column for display.
df2[-3]
# ID Brand merge
#1 1 Cokacola 1, 3, 5
#2 2 Pepsi 2, 4
#3 6 Fanta 6

splitting a column delimiter R

I have a dataframe as below. I want to split the last column into two. The split needs to be done only at the first ":"; the rest of the columns don't matter.
In the new dataframe, there will be 4 columns. 3 rd column will be (a,b,d) while 4th column will be (1,2:3,3:4:4)
any suggestions? 4th line of my code doesnt work :(. I am okay with completely new solution or corrections to the line 4
employee <- c('John Doe','Peter Gynn','Jolie Hope')
salary <- c(3, 2, 1)
df <- data.frame(employee, salary, originalColumn = c("a :1", "b :2:3", "d: 3:4:4"))
as.data.frame(do.call(rbind, strsplit(df,":")))
--------------------update1
The solutions below work well. But I need a modified solution, as I just realized that some of the cells in column 3 won't have ":". In such a case I want the text in that cell to appear only in the first column after splitting.
employee <- c('John Doe','Peter Gynn','Jolie Hope')
salary <- c(3, 2, 1)
df <- data.frame(employee, salary, originalColumn = c("a :1", "b", "d: 3:4:4"))
You could use cSplit. On your updated data frame,
library(splitstackshape)
cSplit(df, "originalColumn", sep = ":{1}")
# employee salary originalColumn_1 originalColumn_2
# 1: John Doe 3 a 1
# 2: Peter Gynn 2 b NA
# 3: Jolie Hope 1 d 3:4:4
And on your original data frame,
df1 <- data.frame(employee, salary,
originalColumn = c("a :1", "b :2:3", "d: 3:4:4"))
cSplit(df1, "originalColumn", sep = ":{1}")
# employee salary originalColumn_1 originalColumn_2
# 1: John Doe 3 a 1
# 2: Peter Gynn 2 b 2:3
# 3: Jolie Hope 1 d 3:4:4
Note: I'm using splitstackshape version 1.4.2. I believe the sep argument has been changed from version 1.4.0
You could use extract from tidyr to split the originalColumn into two columns. In the code below, I am creating 3 columns and removing the unwanted one from the result.
library(tidyr)
pat <- "([^ :])( ?:|: ?|)(.*)"
extract(df, originalColumn, c("Col1", "ColN", "Col2"), pat)[,-4]
# employee salary Col1 Col2
#1 John Doe 3 a 1
#2 Peter Gynn 2 b 2:3
#3 Jolie Hope 1 d 3:4:4
Using the updated df, (for better identification - df1)
extract(df1, originalColumn, c("Col1", "ColN", "Col2"), pat)[,-4]
# employee salary Col1 Col2
#1 John Doe 3 a 1
#2 Peter Gynn 2 b
#3 Jolie Hope 1 d 3:4:4
Or without creating a new column in df
extract(df, originalColumn, c("Col1", "Col2"), "(.)[ :](.*)") %>%
mutate(Col2= gsub("^\\:", "", Col2))
# employee salary Col1 Col2
#1 John Doe 3 a 1
#2 Peter Gynn 2 b 2:3
#3 Jolie Hope 1 d 3:4:4
Based on the pattern in df, the code below also works. Here, the regex used to extract the first column is (.): the dot matches a single character at the beginning of the string, and because it is inside parentheses it is captured as Col1. Then .{2} matches the two characters that follow, which are discarded, and the rest of the string, captured by (.*), forms Col2.
extract(df, originalColumn, c("Col1", "Col2"), "(.).{2}(.*)")
# employee salary Col1 Col2
#1 John Doe 3 a 1
#2 Peter Gynn 2 b 2:3
#3 Jolie Hope 1 d 3:4:4
or using strsplit
as.data.frame(do.call(rbind, strsplit(as.character(df$originalColumn), " :|: ")))
# V1 V2
#1 a 1
#2 b 2:3
#3 d 3:4:4
For df1, here is a solution using strsplit
lst <- strsplit(as.character(df1$originalColumn), " :|: ")
as.data.frame(do.call(rbind,lapply(lst,
`length<-`, max(sapply(lst, length)))) )
# V1 V2
#1 a 1
#2 b <NA>
#3 d 3:4:4
You were close, here's a solution:
library(stringr)
# Split only at the first ":" (n = 2). str_split_fixed() already returns a
# character matrix with one row per input string, so it must not be passed
# through do.call(rbind, ...): do.call() requires a list and errors on a matrix.
parts <- str_split_fixed(df$originalColumn, ":", n = 2)
# Strip the blanks left around the delimiter ("a :1" -> "a", "1"); `[]<-`
# keeps the matrix shape.
parts[] <- trimws(parts)
df$Col1 <- parts[, 1]
df$Col2 <- parts[, 2]
df$originalColumn <- NULL
employee salary Col1 Col2
1 John Doe 3 a 1
2 Peter Gynn 2 b 2:3
3 Jolie Hope 1 d 3:4:4
Notes:
stringr::str_split_fixed() is more convenient here than base::strsplit(): you don't have to call as.character() first, and its n = 2 argument limits the split to the first ':' only.

Resources