Interactive join in r based on different variables - r

I have two data frames as follows:
df<-data.frame(
id=c("1-1","2-2","3-3","4-4","5-5","6-6"),
identifer=c(1,2,3,4,5,6),
key=c("A","B","C","D","E","F"),
product=c("productA","productB","productC","productD","productE","productF"),
ingredient=c("ingredientA","ingredientB","ingredientC","ingredientD","ingredientE","ingredientF"),
DF=c("Tablet","Powder","Suspension","System","Capsule","Capsule"))
df_2<-data.frame(
identifer=c(1,2,2,3,4,6),
key=c("A","B","B","C","D","F"),
product=c("productA","productB","productB","productCC","productDD","productFF"),
ingredient=c("ingredientA","ingredientBB","ingredientB","ingredientC","ingredientDD","ingredeintFF"),
DF=c("Tablet","Powder","Powder","Suspension","injection","tablet"),
Route=c("ORAL","INHALATION","INHALATION","topical","injecatable","oral")
)
I want to join these two datasets first on the following variables + create a new column called "match" that describes the join:
1) identifier,key, product, ingredient,DF
match="identifier,key, product, ingredient,DF"
Then, I want to join the REMAINING rows on these variables:
2)identifier, key, product, DF
match="identifier,key, product,DF"
Then the remaining rows from step 2 on these variables, so and so forth.
3) identifier, key, Ingredient, DF
4) identifier, key, DF
5) identifer, key, product, ingredient
7) identifer, key, product
8) identifer, key, ingredient
9) identifier, key
And I want to return the rows that do not have a match as well. I know how to do this stepwise but I'm wondering if there is an easier way to do this?
this is the expected output:
df_out<-data.frame(
identifer=c(1,2,3,4,5,6),
key=c("A","B","C","D","E","F"),
product_1=c("productA","productB","productC","productD","productE","productF"),
ingredient_1=c("ingredientA","ingredientB","ingredientC","ingredientD","ingredientE","ingredientF"),
DF_1=c("Tablet","Powder","Suspension","System","Capsule","Capsule"),
product_2=c("productA","productB","productCC","productDD",NA,"productFF"),
ingredient_2=c("ingredientA","ingredientB","ingredientC","ingredientDD",NA,"ingredeintFF"),
DF_2=c("Tablet","Powder","Suspension","injection",NA,"tablet"),
Route_2=c("ORAL","INHALATION",'topical',"injecatable",NA,"oral"),
Match=c("identifer+key+product+ingredient+DF","identifier+key+product+ingredient+DF","identifier+key+ingredient+DF","identifer+key","None","identifer+key+product+ingredient"))

Here is an option using data.table:
library(data.table)
setDT(df)
setDT(df_2)
keyord <- list(
c("product", "ingredient", "DF"),
c("product", "DF"),
c("ingredient", "DF"),
"DF",
c("product", "ingredient"),
"product",
"ingredient",
c()
)
# Columns copied from df_2 (with a "_2" suffix) whenever a row of df matches.
cols <- c("product", "ingredient", "DF", "Route")
df[, Match := NA_character_]
# Walk the key sets from the loosest to the strictest. Each update join
# overwrites the *_2 values, the Match label and the check flag of every df
# row it matches, so the strictest matching key set is applied last and wins.
# Running the joins strictest-first would let a later, looser join replace the
# *_2 values of a row whose Match label came from a stricter join.
for (v in rev(keyord)) {
  k <- c("identifier", "key", v)
  df[df_2, on=k, c(paste0(cols, "_2"), "Match", "check") :=
    c(mget(paste0("i.", cols)), .(toString(k), TRUE))]
}
# Rows that never matched keep Match = NA (and NA in check and the *_2 columns).
setnames(df, cols, paste0(cols, "_1"), skip_absent=TRUE)
output:
id identifier key product_1 ingredient_1 DF_1 Match product_2 ingredient_2 DF_2 Route_2 check
1: 1-1 1 A productA ingredientA Tablet identifier, key, product, ingredient, DF productA ingredientA Tablet ORAL TRUE
2: 2-2 2 B productB ingredientB Powder identifier, key, product, ingredient, DF productB ingredientB Powder INHALATION TRUE
3: 3-3 3 C productC ingredientC Suspension identifier, key, ingredient, DF productCC ingredientC Suspension topical TRUE
4: 4-4 4 D productD ingredientD System identifier, key productDD ingredientDD injection injecatable TRUE
5: 5-5 5 E productE ingredientE Capsule <NA> <NA> <NA> <NA> <NA> NA
6: 6-6 6 F productF ingredientF Capsule identifier, key, product, ingredient productF ingredientF tablet oral TRUE
data after fixing some typos in OP:
df <- data.frame(
id=c("1-1","2-2","3-3","4-4","5-5","6-6"),
identifier=c(1,2,3,4,5,6),
key=c("A","B","C","D","E","F"),
product=c("productA","productB","productC","productD","productE","productF"),
ingredient=c("ingredientA","ingredientB","ingredientC","ingredientD","ingredientE","ingredientF"),
DF=c("Tablet","Powder","Suspension","System","Capsule","Capsule"))
df_2 <- data.frame(
identifier=c(1,2,2,3,4,6),
key=c("A","B","B","C","D","F"),
product=c("productA","productB","productB","productCC","productDD","productF"),
ingredient=c("ingredientA","ingredientBB","ingredientB","ingredientC","ingredientDD","ingredientF"),
DF=c("Tablet","Powder","Powder","Suspension","injection","tablet"),
Route=c("ORAL","INHALATION","INHALATION","topical","injecatable","oral")
)
edit for multiple matches:
df_2 <- data.frame( identifier=c(1,2,2,3,4,4,6), key=c("A","B","B","C","D","D","F"), product=c("productA","productB","productB","productCC","productDD","productDd","productF"), ingredient=c("ingredientA","ingredientBB","ingredientB","ingredientC","ingredientDD",NA,"ingredientF"), DF=c("Tablet","Powder","Powder","Suspension","injection",NA,"tablet"), Route=c("ORAL","INHALATION","INHALATION","topical","injecatable",NA,"oral") )
setDT(df_2)
df[, c("Match", "check") := .(NA_character_, FALSE)]
ocols <- unique(unlist(keyord))
rbindlist(lapply(keyord, function(v) {
k <- c("identifier", "key", v)
a <- df_2[df[(!check)], on=k, nomatch=0L, c(.(id=id),
setNames(mget(paste0("i.", ocols)), paste0(ocols, "_1")),
setNames(mget(paste0("x.", c(ocols, "Route"))), paste0(c(ocols, "Route"), "_2")))
]
df[id %chin% a$id, check := TRUE]
a
}), use.names=TRUE)
output:
id product_1 ingredient_1 DF_1 product_2 ingredient_2 DF_2 Route_2
1: 1-1 productA ingredientA Tablet productA ingredientA Tablet ORAL
2: 2-2 productB ingredientB Powder productB ingredientB Powder INHALATION
3: 3-3 productC ingredientC Suspension productCC ingredientC Suspension topical
4: 6-6 productF ingredientF Capsule productF ingredientF tablet oral
5: 4-4 productD ingredientD System productDD ingredientDD injection injecatable
6: 4-4 productD ingredientD System productDd <NA> <NA> <NA>

Here is a solution that might feel slightly over-engineered but achieves the expected outcome:
library(dplyr)
library(purrr)
library(stringr)
# Build a "+"-separated label describing which columns agree between the two
# sides of a joined table.
#
# data: data frame holding, for every name in `cols`, a `<name>_1` and a
#       `<name>_2` column (the suffixes used by the join below).
# cols: names (without suffix) of the non-key columns to compare.
# keys: names of the join keys; they lead the label when the keys matched.
#
# Returns one string per row: the keys (or "None") followed by every column
# in `cols` whose `_1` and `_2` values are equal as character strings.
get_match <- function(data, cols, keys) {
  # The join drops the key columns of the right-hand side, so "keys matched"
  # is inferred from both sides having the same number of NAs in `cols`.
  # NOTE(review): a matched right-hand row that itself holds NA in `cols`
  # is therefore labelled "None" as well - confirm this is acceptable.
  missing_1 <- rowSums(is.na(data[paste0(cols, "_1")]))
  missing_2 <- rowSums(is.na(data[paste0(cols, "_2")]))
  key_label <- ifelse(missing_1 == missing_2, paste(keys, collapse = "+"), "None")

  # For every column: its name where both sides agree, NA otherwise.
  hits <- lapply(cols, function(col) {
    same <- as.character(data[[paste0(col, "_1")]]) ==
      as.character(data[[paste0(col, "_2")]])
    ifelse(!is.na(same) & same, col, NA_character_)
  })

  # Append only the matching names. Pasting everything and deleting "NA"
  # afterwards also removed the separators around a non-matching middle
  # column ("product+NA+DF" became "productDF").
  col_label <- Reduce(function(acc, piece) {
    ifelse(is.na(piece), acc,
           ifelse(acc == "", piece, paste(acc, piece, sep = "+")))
  }, hits, character(nrow(data)))

  # Drop the trailing "+" left behind when no column matched.
  sub("\\+$", "", paste(key_label, col_label, sep = "+"))
}
df_out = left_join(df, df_2, by=c("identifer", "key"), suffix=c("_1", "_2")) %>%
mutate(Match = get_match(., cols=c("product", "ingredient", "DF"), keys=c("identifer", "key")),
match_strength = str_count(Match, "\\+")) %>%
group_by(id) %>%
filter(match_strength==max(match_strength, na.rm=TRUE))
dplyr::left_join removes the by keys so the only way I found to add them is to check that all the _1 or the _2 were missing. I could have used the keep=TRUE option and remove/rename them hereafter though...

Related

How to pivot_wider the n unique values of variable A grouped_by variable B?

I am trying to pivot_wider() the column X of a data frame containing various persons names. Within group_by() another variable Y of the df there are always 2 of these names. I would like R to take the 2 unique X names values within each unique identifier of Y and put them in 2 new columns ex_X_Name_1 and ex_X_Name_2.
My data frame is looking like this:
df <- data.frame(Student = rep(c(17383, 16487, 17646, 2648, 3785), each = 2),
Referee = c("Paul Severe", "Cathy Nice", "Jean Exigeant", "Hilda Ehrlich", "John Rates",
"Eva Luates", "Fred Notebien", "Aldous Grading", "Hans Streng", "Anna Filaktic"),
Rating = format(round(x = sqrt(sample(15:95, 10, replace = TRUE)), digits = 3), nsmall = 3)
)
df
I would like to make the transformation of the Referee column to 2 new columns Referee_1 and Referee_2 with the 2 unique Referees assigned to each student and end with this result:
even_row_df <- as.logical(seq_len(length(df$Referee)) %% 2)
df_wanted <- data_frame(
Student = unique(df$Student),
Referee_1 = df$Referee[even_row_df],
Rating_Ref_1 = df$Rating[even_row_df],
Referee_2 = df$Referee[!even_row_df],
Rating_Ref_2 = df$Rating[!even_row_df]
)
df_wanted
I guess I could achieve this with by subsetting unique rows of student/referee combinations and make joins , but is there a way to handle this in one call to pivot_wider?
You should create a row id per group first:
library(dplyr)
library(tidyr)
df %>%
group_by(Student) %>%
mutate(row_n = row_number()) %>%
ungroup() %>%
pivot_wider(names_from = "row_n", values_from = c("Referee", "Rating"))
# A tibble: 5 × 5
Student Referee_1 Referee_2 Rating_1 Rating_2
<dbl> <chr> <chr> <chr> <chr>
1 17383 Paul Severe Cathy Nice 9.165 7.810
2 16487 Jean Exigeant Hilda Ehrlich 5.196 6.557
3 17646 John Rates Eva Luates 7.211 5.568
4 2648 Fred Notebien Aldous Grading 4.000 8.124
5 3785 Hans Streng Anna Filaktic 7.937 6.325
using data.table
library(data.table)
setDT(df)
merge(df[, .SD[1], Student], df[, .SD[2], Student], by = "Student", suffixes = c("_1", "_2"))
# Student Referee_1 Rating_1 Referee_2 Rating_2
# 1: 2648 Fred Notebien 6.708 Aldous Grading 9.747
# 2: 3785 Hans Streng 6.245 Anna Filaktic 8.775
# 3: 16487 Jean Exigeant 7.681 Hilda Ehrlich 4.359
# 4: 17383 Paul Severe 4.583 Cathy Nice 7.616
# 5: 17646 John Rates 6.708 Eva Luates 8.246

R - Merging two dataframe by text

I have two datasets which I want to merge :
df1 <- data.frame( title =
c("residence mozart",
"les hesperides auteuil mirabeau",
"chaillot",
"jouvenet",
"retraite dosne"))
df2 <- data.frame(title = c("terrasses mozart", "chaillot",
"villa jules janin", "retraites dosne"))
And I would like to have something like this :
1 residence mozart NA (or terrasses mozart)
2 les hesperides auteuil mirabeau NA
3 chaillot chaillot
4 jouvenet NA
5 retraite dosne retraites dosne
Here is what I did :
x = data.frame(title_df2 = matrix(ncol = 1, nrow = nrow(df1)))
for (i in nbr){
x[i, ] <- grep(df1$title[i], df2$title, value = T)
}
It does not work at all ! Even though grep(df1$title[5], df2$title, value = T) works and return "chaillot"!
If I understand correctly
df1 <- data.frame( title =
c("residence mozart",
"les hesperides auteuil mirabeau",
"chaillot",
"jouvenet",
"retraite dosne"))
df2 <- data.frame(title = c("terrasses mozart", "chaillot",
"villa jules janin", "retraites dosne"))
library(dplyr)
library(fuzzyjoin)
stringdist_left_join(x = df1, y = df2, method = "jw", distance_col = "d") %>%
filter(d < 0.25) %>%
right_join(df1, by = c("title.x" = "title"))
#> Joining by: "title"
#> title.x title.y d
#> 1 residence mozart terrasses mozart 0.23863636
#> 2 chaillot chaillot 0.00000000
#> 3 retraite dosne retraites dosne 0.09206349
#> 4 les hesperides auteuil mirabeau <NA> NA
#> 5 jouvenet <NA> NA
Created on 2021-04-19 by the reprex package (v2.0.0)
The issue is that grep returns a vector of length 0 when there is no match.
grep('a', 'hello', value = TRUE)
#character(0)
If we want to keep the same for loop, adjust the code so that it returns NA wherever there is no match:
nbr <- seq_len(nrow(df1))
for (i in nbr){
x[i, ] <- c(grep(df1$title[i], df2$title, value = TRUE), NA_character_)[1]
}
-output
x
# title_df2
#1 <NA>
#2 <NA>
#3 chaillot
#4 <NA>
#5 <NA>
You could do:
a <-Vectorize(agrep, "pattern")(df1$title, df2$title, value=TRUE)
is.na(a)<- lengths(a) == 0
cbind(df1,df2_title=unlist(a, use.names = FALSE))
title df2_title
1 residence mozart <NA>
2 les hesperides auteuil mirabeau <NA>
3 chaillot chaillot
4 jouvenet <NA>
5 retraite dosne retraites dosne
To achieve your goal, you need a matching on each word of your strings within df1 title.
As used in your example, Grep will return an output only if there is a match on the full string.
In order to do that, you'll need to grep on possible words on df1 that are also contained in df2. This can be achieved by implementing an or condition on the full word contained in each string.
# For every title in df1, look for a df2 title sharing at least one whole word.
nbr <- seq_len(nrow(x))
for (i in nbr) {
  # Build a regex that matches any single word of the df1 title; the \\b
  # anchors on both sides force a match on the complete word.
  words <- unlist(strsplit(as.character(df1$title[i]), " "))
  pattern <- paste0("\\b", words, "\\b", collapse = "|")
  # All df2 titles containing at least one of those words.
  fitInDataFrame <- grep(pattern, as.character(df2$title), value = TRUE)
  # Store one value per row: the first matching title, or NA when none matched.
  x[i, ] <- if (length(fitInDataFrame) == 0) NA_character_ else fitInDataFrame[1]
}
Here the output:
> x
title_df2
1 terrasses mozart
2 <NA>
3 chaillot
4 <NA>
5 retraites dosne
You can do a left_join(df1, df2, by = c('title' = 'title'), keep = TRUE), specifying keep = TRUE so it doesn't drop df2's join column.
Or, for this particular case, you could do this:
df1$newcol <- ifelse(df1$title %in% df2$title, df1$title, NA)
This adds a new column to df1 which is filled out by going through each title in df1, checking if that title is in df2, if so writing that title in the second column and if not writing NA in that row of the second column. You could choose to put something else there instead, like:
df1$newcol <- ifelse(df1$title %in% df2$title, 'Title in DF2', 'Not in DF2')

How to tidy the data set with column containing multiple information-Sample data put?

Please help me make my data tidy. Thanks.
The total observations is 394, with 26 columns. Data is exported from ms excel.
Data sample is given below. In this sample actually there should be only three observations/rows.
In the vectors d1..d2..no and Farmer.Name the observations corresponding to NA of v1 should be cleared and added to the preceding row value.
the d1..d2..no corresponds to three observations (two date observations one unique identification number )and so do the Farmer.Name vector.
The sample is
# Record marker: every record spans three rows and only its first row carries
# a v1 value; the other two rows are NA (see the description of the data).
v1 <- c(1, NA, NA, 2, NA, NA, 3, NA, NA)
# Two date values followed by one identification number, repeated per record.
d1..d2..no <- c("27/01/2020", "43832", "KE004421", "43832", "43832",
                "KE003443", "31/12/2019", "43832", "KE0001512")
# Name and gender, farmer type, farmer category, repeated per record.
Farmer.Name <- c("S Jacob Gender:male", "farmer type :marginal", "farmer category :general",
                 "J Isac Gender :Female", "farmer type: large", "farmer category :general",
                 "P Kumar Gender :Male", "farmer type:small", "farmer category :general")
adress <- c("k11", NA, NA, "k12", NA, NA, "k13", NA, NA)
amount <- c(25, NA, NA, 25, NA, NA, 32, NA, NA)
mydata <- data.frame(v1 = v1, d1..d2..no = d1..d2..no, Farmer.Name = Farmer.Name,
                     adress = adress, amount = amount)
In the vectors d1..d2..no and Farmer.Name the observations corresponding to NA of v1 should be cleared and added to the preceding row value.
the d1..d2..no corresponds to three observations (two date observations one unique identification number )
and so do the Farmer.Name vector. That is, my result expected is like from this code
v1<-c(1,2,3)
d1<-c("27/01/2020","43832","31/12/2019")
d2<-c("43832","43832","43832")
no<-c("KE004421","KE003443","KE0001512")
Farmer.Name1<-c("S Jacob","J Isac","P Kumar")
Gender<-c("male","female","male")
farmer_type <-c("marginal","large","small")
farmer_category <-c("general", "general", "general")
adress<-c("k11","k12","k13")
amount<-c(25,25,32)
myfinaldata<-data.frame(v1=v1,d1=d1,d2=d2,no=no,
Farmer.Name1=Farmer.Name1,
farmer_type=farmer_type,
farmer_category=farmer_category,
adress=adress,amount=amount)
The result should be
v1 d1 d2 no Farmer.Name1 farmer_type farmer_category adress amount
1 1 27/01/2020 43832 KE004421 S Jacob marginal general k11 25
2 2 43832 43832 KE003443 J Isac large general k12 25
3 3 31/12/2019 43832 KE0001512 P Kumar small general k13 32
I am a novice to programming and r, learning through online resources. Also my first post on this platform. Please forgive any mistakes.
I have made quite a mess with spread, separate, etc. from the tidyverse, but I am stuck on how to proceed.
Untidy data can be a challenge. Here is a tidyverse approach.
First, added proposed column names expected for d1, d2, and no. Assumes rows are in this order.
Column Farmer.Name is separated into two columns, by :.
The Name itself is separated before the word Gender.
fill allows for common values to be filled in for the same individual (such as v1, adress, amount, and Name).
pivot_wider is done to spread the data wide, first, by d1, d2, and no, and then by the other columns including Gender, farmer_type, and farmer_category.
library(tidyverse)
df1 <- mydata %>%
mutate(d_var = rep(c("d1", "d2", "no"), times = 3)) %>%
separate(Farmer.Name, into = c("Var", "Val"), sep = ":") %>%
separate(Var, into = c("Name", "Var"), sep = "(?=Gender)", fill = "left") %>%
mutate_at(c("Name", "Var"), trimws) %>%
fill(v1, adress, amount, Name, .direction = "down") %>%
mutate(Var = gsub(" ", "_", Var))
df1 %>%
pivot_wider(id_cols = c(v1, Name, adress, amount), names_from = d_var, values_from = d1..d2..no) %>%
left_join(pivot_wider(df1, id_cols = c(v1, Name, adress, amount), names_from = Var, values_from = Val))
Output
# A tibble: 3 x 10
v1 Name adress amount d1 d2 no Gender farmer_type farmer_category
<dbl> <chr> <chr> <dbl> <chr> <chr> <chr> <chr> <chr> <chr>
1 1 S Jacob k11 25 27/01/2020 43832 KE004421 male "marginal" general
2 2 J Isac k12 25 43832 43832 KE003443 Female " large" general
3 3 P Kumar k13 32 31/12/2019 43832 KE0001512 Male "small" general
The dates in your data set are not in date format. Consider formatting them after this.
library(reshape)
df.new <- cbind(mydata[seq(1, nrow(mydata), 3), ], mydata[seq(2, nrow(mydata), 3), ][2:3], mydata[seq(3, nrow(mydata), 3), ][2:3])
colnames(df.new) <- c("v1", "d1", "Farmer.Name1", "adress", "amount", "d2", "farmer_type", "no", "farmer_category")
df.new <- df.new[c(1,2,6, 8,3, 7,9, 4,5)]
library(stringr)
df.new$Farmer.Name1 <- word(df.new$Farmer.Name1,1,sep = "\\ Gender")
df.new$farmer_type <- word(df.new$farmer_type,2,sep = "\\:")
df.new$farmer_category <- word(df.new$farmer_category,2,sep = "\\:")
Final table:
> df.new
v1 d1 d2 no Farmer.Name1 farmer_type farmer_category adress amount
1 1 27/01/2020 43832 KE004421 S Jacob marginal general k11 25
4 2 43832 43832 KE003443 J Isac large general k12 25
7 3 31/12/2019 43832 KE0001512 P Kumar small general k13 32
P.S.: I have not renamed the row numbers.

Passing string through multiple filters for matching

Working with MSA data and splitting string then putting them back together. Needing to filter through multiple columns to get strings to match properly. I need to filter the string of cities through states first... I could create a column for each city matched to MSA, but am looking for something more efficient.
> testdf <- data.frame(col1 =c('Dallas,Fort Worth,Arlington','Houston,The Woodlands,Sugar Land','Atlanta,Sandy Springs,Roswell'),
+ col2 =c('TX','TX','GA'))
> df <- data.frame(col1 = c('Arlington','Houston','Arlington','Atlanta'),
+ col2 = c('TX','TX','VA','GA'),
+ stringsAsFactors = FALSE)
> testdf
col1 col2
1 Dallas,Fort Worth,Arlington TX
2 Houston,The Woodlands,Sugar Land TX
3 Atlanta,Sandy Springs,Roswell GA
> df
col1 col2
1 Arlington TX
2 Houston TX
3 Arlington VA
4 Atlanta GA
Looking for:
col1 col2 MSA
1 Arlington TX Dallas,Fort Worth,Arlington
2 Houston TX Houston,The Woodlands,Sugar Land
3 Arlington VA NA
4 Atlanta GA Atlanta,Sandy Springs,Roswell
I'm pretty lost on how to even ask this question, so please let me know if I have a duplicate here. If it is a duplicate, please provide guidance on how to ask better.
I believe this is what you are looking for:
# df to join to
testdf <- data.frame(col1 =c('Dallas,Fort Worth,Arlington','Houston,The Woodlands,Sugar Land','Atlanta,Sandy Springs,Roswell'),
col2 =c('TX','TX','GA'),
stringsAsFactors = FALSE)
# combine df's using join and filter by strings
# (needs dplyr and stringr attached, e.g. via library(tidyverse))
cities <- data.frame(col1 = c('Arlington','Houston','Arlington','Atlanta'),
                     col2 = c('TX','TX','VA','GA'),
                     stringsAsFactors = FALSE)
# Every (city, state) pair whose city name occurs in an MSA of the same state.
# fixed() compares the city name literally instead of treating it as a regex.
msa_matches <- cities %>%
  inner_join(rename(testdf, msa = col1), by = "col2") %>%
  filter(str_detect(msa, fixed(col1))) %>%
  distinct()
# Join the matches back so that cities without an MSA are kept with msa = NA
# (filtering the joined table directly would drop them).
df <- cities %>%
  left_join(msa_matches, by = c("col1", "col2")) %>%
  rename(city = col1, state = col2)
You can use base R to do this:
# Build one regex per row of df by pasting its columns with ".*" in between
# (city, then state), and one string per row of testdf by pasting its columns
# with a space. grep() then returns, for each df row, the indices of the
# testdf rows whose pasted string matches that regex.
vec = sapply(do.call(paste, c(sep = ".*", df)), grep, do.call(paste, testdf))
# Entries with zero matches are set to NA so that unlist() keeps one index per
# df row; the indices then pick the MSA string from the first column of testdf.
# NOTE(review): assumes each df row matches at most one testdf row - verify.
transform(df, MSA = testdf[unlist(`is.na<-`(vec, !lengths(vec))), 1])
col1 col2 MSA
1 Arlington TX Dallas,Fort Worth,Arlington
2 Houston TX Houston,The Woodlands,Sugar Land
3 Arlington VA <NA>
4 Atlanta GA Atlanta,Sandy Springs,Roswell
Can use dplyr with a tweak to what Yifu mentioned above
df %>%
left_join(testdf %>%
dplyr::rename(MSA = col1)) %>%
dplyr::select(col1, col2, MSA)

How to search part of string that contain in a list of string, and return the matched one in R

The following data frame contains a "Campaign" column whose values hold information about season, name, and position; however, the order of these pieces is quite different in each row. Luckily, each piece comes from a fixed list, so we can create vectors to match against the strings in the "Campaign" column.
Date Campaign
1 Jan-15 Summer|Peter|Up
2 Feb-15 David|Winter|Down
3 Mar-15 Up|Peter|Spring
Here is what I want to do, I want to create 3 columns as Name, Season, Position. So these column can search the string inside the campaign column and return the matched value from the list below.
# Each allowed value is its own element so that it can be matched individually
# (a single string "Peter, David" would never equal "Peter" or "David").
Name <- c("Peter", "David")
Season <- c("Summer","Spring","Autumn", "Winter")
Position <- c("Up","Down")
So my desired result would be following
Temp
Date Campaign Name Season Position
1 15-Jan Summer|Peter|Up Peter Summer Up
2 15-Feb David|Winter|Down David Winter Down
3 15-Mar Up|Peter|Spring Peter Spring Up
Another way:
# Split every campaign string on the literal "|" separator; as.character()
# keeps this working when the column is stored as a factor.
L <- strsplit(as.character(df$Campaign), split = "|", fixed = TRUE)
# First piece of `parts` that is one of `choices`, or NA when there is none.
# Always returning exactly one string keeps each new column a plain character
# vector even if a campaign has no match or more than one.
first_match <- function(parts, choices) {
  hit <- intersect(parts, choices)
  if (length(hit) > 0) hit[1] else NA_character_
}
df$Name <- vapply(L, first_match, character(1), choices = Name)
df$Season <- vapply(L, first_match, character(1), choices = Season)
df$Position <- vapply(L, first_match, character(1), choices = Position)
Do the following:
# Sample data: the campaign attributes are packed into one "|"-separated
# string per row, in no fixed order.
Date <- c("Jan-15","Feb-15","Mar-15")
Campaign <- c("Summer|Peter|Up","David|Winter|Down","Up|Peter|Spring")
df <- data.frame(Date = Date, Campaign = Campaign)

# Allowed values for each attribute.
Name <- c("Peter", "David")
Season <- c("Summer","Spring","Autumn", "Winter")
Position <- c("Up","Down")

# One pass per attribute: every allowed value is written into the attribute's
# column for all rows whose campaign string contains it.
vocabularies <- list(Name = Name, Season = Season, Position = Position)
for (field in names(vocabularies)) {
  for (value in vocabularies[[field]]) {
    hits <- grepl(pattern = value, x = df$Campaign)
    df[[field]][hits] <- value
  }
}
This gives:
> df
Date Campaign Name Season Position
1 Jan-15 Summer|Peter|Up Peter Summer Up
2 Feb-15 David|Winter|Down David Winter Down
3 Mar-15 Up|Peter|Spring Peter Spring Up
I had the same idea as Marat Talipov; here's a data.table option:
library(data.table)
Name <- c("Peter", "David")
Season <- c("Summer","Spring","Autumn", "Winter")
Position <- c("Up","Down")
dat <- data.table(Date=c("Jan-15", "Feb-15", "Mar-15"),
Campaign=c("Summer|Peter|Up", "David|Winter|Down", "Up|Peter|Spring"))
Gives
> dat
Date Campaign
1: Jan-15 Summer|Peter|Up
2: Feb-15 David|Winter|Down
3: Mar-15 Up|Peter|Spring
Processing is then
dat[ , `:=`(Name = sapply(strsplit(Campaign, "|", fixed=TRUE), intersect, Name),
Season = sapply(strsplit(Campaign, "|", fixed=TRUE), intersect, Season),
Position = sapply(strsplit(Campaign, "|", fixed=TRUE), intersect, Position))
]
Result:
> dat
Date Campaign Name Season Position
1: Jan-15 Summer|Peter|Up Peter Summer Up
2: Feb-15 David|Winter|Down David Winter Down
3: Mar-15 Up|Peter|Spring Peter Spring Up
Maybe there's some benefit if you're doing this to a lot of columns or need to modify in place (by reference).
I'm interested if anyone can show me how to update all three columns at once.
EDIT: Never mind, figured it out;
for (icol in c("Name", "Season", "Position"))
dat[, (icol):=sapply(strsplit(Campaign, "|", fixed=TRUE), intersect, get(icol))]

Resources