Related
I have a data frame with different variables (columns).
I want to transform this data frame into a table with a different structure to make it more readable.
For example, I have a data frame like this:
myData = structure(list(X = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "20", class = "factor"),
Y = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L), .Label = c("20", "100"), class = "factor"),
MethodType = structure(c(2L, 2L, 4L, 4L, 1L, 1L, 3L, 3L,
2L, 2L, 4L, 4L, 1L, 1L, 3L, 3L), .Label = c("E", "Q", "R",
"W"), class = "factor"), MethodType2 = structure(c(1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("A",
"B"), class = "factor"), Metric1 = c(0.970017512487058, 0.969647220975651,
0.965873991040769, 0.966242788535318, 0.986725852301671,
0.98696657967457, 0.98252107117733, 0.982655296614757, 0.278826941542694,
-0.990926101696033, 0.194574672498287, 0.281916524368647,
0.152983364411985, 1.44135982835554, 0.330270447575806, -0.369627160641594
), Metric2 = c(0.987541353383459, 0.987007518796992, 0.980984962406015,
0.981646616541353, 0.984082706766917, 0.984481203007519,
0.988165413533835, 0.988375939849624, -0.109331599015822,
-0.148471161609603, 1.31331396089969, -1.34238564643737,
2.14014350779371, -0.422879539464588, -1.25706359685425,
1.09603324772565)), row.names = c(NA, -16L), class = "data.frame")
and I want to have a table like this:
What kind of manipulation can I use, and which tool? I'm looking for something flexible that also works with more factors.
I have several dataframes with data from the same survey. I want to combine them for analysis. The dataframes contain both unique variables and two variables (ID and Contest_no) that are shared across all the dataframes; the two shared variables contain information about the respondent and the contest number (1,2,3, as respondents were asked the same questions three times).
The difficulty is that the dataframes have missing values:
# Example frame 1: every respondent (ID) x contest combination is present.
# All columns are converted to factors via stringsAsFactors.
DF1 <- data.frame(
  V1 = c("A", "B", "C", "D"),
  V2 = c("A", "B", "C", "D"),
  ID = c("x1", "x1", "y2", "y2"),
  Contest_no = c("1", "2", "1", "2"),
  stringsAsFactors = TRUE
)
# Example frame 2: the row for respondent x1, contest 2 is missing.
# All columns are converted to factors via stringsAsFactors.
DF2 <- data.frame(
  V3 = c("A", "C", "D"),
  V4 = c("A", "C", "D"),
  ID = c("x1", "y2", "y2"),
  Contest_no = c("1", "1", "2"),
  stringsAsFactors = TRUE
)
# Example frame 3: the row for respondent y2, contest 2 is missing.
# All columns are converted to factors via stringsAsFactors.
DF3 <- data.frame(
  V5 = c("A", "B", "C"),
  V6 = c("A", "B", "C"),
  ID = c("x1", "x1", "y2"),
  Contest_no = c("1", "2", "1"),
  stringsAsFactors = TRUE
)
As a result, respondent IDs and contest numbers aren't aligned. I want to match the data to respondent IDs and contest numbers so that the merged dataframe looks like this:
# Desired result: one row per respondent (ID) x contest, with NA wherever a
# source frame had no row for that combination. All columns become factors.
DF_merged <- data.frame(
  V1 = c("A", "B", "C", "D"),
  V2 = c("A", "B", "C", "D"),
  V3 = c("A", NA, "C", "D"),
  V4 = c("A", NA, "C", "D"),
  V5 = c("A", "B", "C", NA),
  V6 = c("A", "B", "C", NA),
  ID = c("x1", "x1", "y2", "y2"),
  Contest_no = c("1", "2", "1", "2"),
  stringsAsFactors = TRUE
)
I thought that full_join would do the trick, but DF_merged <- full_join(DF1, DF2, DF3, by="ID") gives me nonsensical results.
How can disparate data like this be combined?
New, updated example (to address the problem of multiplied rows). In this example there are no missing values at all, and both dataframes have the same number of rows, but the code results in multiplied rows. First, the two dataframes to be merged:
df1:
structure(list(ID = structure(c(1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L), .Label = c("EE1", "EE101", "EE102"), class = "factor"),
Contest_no = c(1L, 1L, 1L, 1L, 2L, 2L, 3L, 3L, 2L, 2L, 3L,
3L), Option = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L), .Label = c("Option1", "Option2"), class = "factor"),
Chosen_option = c(0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L,
0L, 1L), Combination = structure(c(5L, 5L, 6L, 6L, 4L, 4L,
2L, 2L, 1L, 1L, 3L, 3L), .Label = c("V133", "V181", "V234",
"V252", "V32", "V67"), class = "factor"), Attribute1 = structure(c(1L,
1L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L), .Label = c("has strong ties to the government",
"has weak ties to the government"), class = "factor"), Attribute2 = structure(c(1L,
2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L), .Label = c("has strong ties to the local pastoralist community",
"has weak ties to the local pastoralist community"), class = "factor"),
Attribute3 = structure(c(2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L,
2L, 1L, 1L, 2L), .Label = c("is poor", "is wealthy"), class = "factor"),
Attribute4 = structure(c(2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L,
1L, 2L, 2L, 2L), .Label = c("has attained a high level of formal education (for example university degree)",
"has not attained a high level of formal education (for example never went to school or only attended primary school)"
), class = "factor")), .Names = c("ID", "Contest_no", "Option",
"Chosen_option", "Combination", "Attribute1", "Attribute2", "Attribute3",
"Attribute4"), class = "data.frame", row.names = c(NA, -12L))
df2:
structure(list(ID = structure(c(1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L), .Label = c("EE1", "EE101", "EE102"), class = "factor"),
Contest_no = c(1L, 1L, 1L, 1L, 2L, 2L, 3L, 3L, 2L, 2L, 3L,
3L), Option = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L), .Label = c("Option1", "Option2"), class = "factor"),
Chosen_option = c(1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L,
0L, 1L), Combination = structure(c(6L, 6L, 4L, 4L, 1L, 1L,
3L, 3L, 5L, 5L, 2L, 2L), .Label = c("V150", "V249", "V252",
"V29", "V56", "V77"), class = "factor"), Attribute1 = structure(c(2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L), .Label = c("has strong ties to the government",
"has weak ties to the government"), class = "factor"), Attribute2 = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L), .Label = c("has strong ties to the local pastoralist community",
"has weak ties to the local pastoralist community"), class = "factor"),
Attribute3 = structure(c(2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L,
2L, 1L, 1L, 2L), .Label = c("is poor", "is wealthy"), class = "factor"),
Attribute4 = structure(c(2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L,
1L, 1L, 2L, 2L), .Label = c("has attained a high level of formal education (for example university degree)",
"has not attained a high level of formal education (for example never went to school or only attended primary school)"
), class = "factor")), .Names = c("ID", "Contest_no", "Option",
"Chosen_option", "Combination", "Attribute1", "Attribute2", "Attribute3",
"Attribute4"), class = "data.frame", row.names = c(NA, -12L))
and now the unsuccessful attempt to combine the two dataframes:
# Attempted merge on ID and Contest_no only. Each ID/Contest_no pair occurs
# twice in both frames (once per Option), so every pair is matched 2 x 2 times.
df_merge_attempt <- dplyr::full_join(
  df1,
  df2,
  by = c("ID", "Contest_no")
)
results in:
structure(list(ID = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L
), .Label = c("EE1", "EE101", "EE102"), class = "factor"), Contest_no = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 2L,
2L, 2L, 2L, 3L, 3L, 3L, 3L), Option.x = structure(c(1L, 1L, 2L,
2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L,
2L, 1L, 1L, 2L, 2L), .Label = c("Option1", "Option2"), class = "factor"),
Chosen_option.x = c(0L, 0L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L,
1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 1L),
Combination.x = structure(c(5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L,
4L, 4L, 4L, 4L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 3L, 3L, 3L,
3L), .Label = c("V133", "V181", "V234", "V252", "V32", "V67"
), class = "factor"), Attribute1.x = structure(c(1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L,
2L, 2L, 2L, 1L, 1L, 2L, 2L), .Label = c("has strong ties to the government",
"has weak ties to the government"), class = "factor"), Attribute2.x = structure(c(1L,
1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L), .Label = c("has strong ties to the local pastoralist community",
"has weak ties to the local pastoralist community"), class = "factor"),
Attribute3.x = structure(c(2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L,
1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L,
2L), .Label = c("is poor", "is wealthy"), class = "factor"),
Attribute4.x = structure(c(2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L), .Label = c("has attained a high level of formal education (for example university degree)",
"has not attained a high level of formal education (for example never went to school or only attended primary school)"
), class = "factor"), Option.y = structure(c(1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L), .Label = c("Option1", "Option2"), class = "factor"),
Chosen_option.y = c(1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L,
1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 1L),
Combination.y = structure(c(6L, 6L, 6L, 6L, 4L, 4L, 4L, 4L,
1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 5L, 5L, 5L, 5L, 2L, 2L, 2L,
2L), .Label = c("V150", "V249", "V252", "V29", "V56", "V77"
), class = "factor"), Attribute1.y = structure(c(2L, 2L,
2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 1L, 2L, 1L), .Label = c("has strong ties to the government",
"has weak ties to the government"), class = "factor"), Attribute2.y = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L), .Label = c("has strong ties to the local pastoralist community",
"has weak ties to the local pastoralist community"), class = "factor"),
Attribute3.y = structure(c(2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L,
2L), .Label = c("is poor", "is wealthy"), class = "factor"),
Attribute4.y = structure(c(2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L), .Label = c("has attained a high level of formal education (for example university degree)",
"has not attained a high level of formal education (for example never went to school or only attended primary school)"
), class = "factor")), class = "data.frame", row.names = c(NA,
-24L), .Names = c("ID", "Contest_no", "Option.x", "Chosen_option.x",
"Combination.x", "Attribute1.x", "Attribute2.x", "Attribute3.x",
"Attribute4.x", "Option.y", "Chosen_option.y", "Combination.y",
"Attribute1.y", "Attribute2.y", "Attribute3.y", "Attribute4.y"
))
You can try dplyr::full_join with by=c("ID","Contest_no") argument as:
library(dplyr)

# Full-join all frames on the shared keys, folding left to right:
# (DF1 joined with DF2) joined with DF3. Rows missing from a frame become NA.
join_keys <- c("ID", "Contest_no")
df1 <- Reduce(
  function(joined, next_df) full_join(joined, next_df, by = join_keys),
  list(DF1, DF2, DF3)
)
df1
# V1 V2 V3 V4 V5 V6 ID Contest_no
#1 A A A A A A x1 1
#2 B B <NA> <NA> B B x1 2
#3 C C C C C C y2 1
#4 D D D D <NA> <NA> y2 2
Updated: Answer has been modified to consider another column Option in full_join as:
# The updated example frames are df1 and df2 (DF1/DF2 from the first example
# have no Option column, so joining them by "Option" fails). Each
# ID/Contest_no pair occurs once per Option, so Option must be part of the key
# to get one matched row per original row instead of multiplied rows.
# Note: this overwrites df1 with the joined result.
df1 <- dplyr::full_join(df1, df2, by = c("ID", "Contest_no", "Option"))
Note: I had to tweak my dplyr code to match what was suggested by @Gregor in order to get the expected result.
I would like to make a "nested" sort of table in R that mirrors the formatting of a plot I can make with ggplot using facet_wrap.
Here are some data and the code:
# Example data: 16 observations laid out on a 2 x 2 grid (row, col), crossed
# with two grouping factors (grp1: a/b, grp2: g/h) and an integer value.
tabledata <- data.frame(
  row = rep(1:2, times = 8),
  col = rep(rep(1:2, each = 2), times = 4),
  grp1 = factor(rep(rep(c("a", "b"), each = 4), times = 2), levels = c("a", "b")),
  grp2 = factor(rep(c("g", "h"), each = 8), levels = c("g", "h")),
  value = c(9L, 9L, 14L, 8L, 10L, 9L, 8L, 15L, 2L, 1L, 3L, 4L, 1L, 5L, 2L, 4L)
)
# Jittered points of value by grp2 (shape = grp1), one panel per row/col cell.
ggplot(tabledata, aes(x = grp2, y = value, shape = grp1)) +
  geom_jitter() +
  facet_grid(row ~ col)
Which produces this plot:
Here is the table I would like to make (which can easily be done with a pivot table, but obviously that is not ideal):
A nested table can be made using the tabular() function in the tables package using the following code.
# Build the nested table with tabular(), which comes from the 'tables' package
# (the package must be loaded first).
# Left of ~  : the table rows, defined by `row` (as a factor) crossed with grp1.
# Right of ~ : the table columns, defined by `col` (as a factor) crossed with
#              grp2; each cell shows `value` passed through identity().
# NOTE(review): the empty Heading() calls look intended to suppress the
# automatic label of the term that follows each one; confirm in the tables docs.
tabular(
(Heading()*Factor(row)*Heading()*grp1)~
(Heading()*Factor(col)*Heading()*grp2)*Heading()*value*Heading()*identity,
data = tabledata)
The table can then be saved as a .csv file using write.csv.tabular().
Tidyverse just added a table package that has the nested format built in. It's called "gt" (great tables) https://blog.rstudio.com/2020/04/08/great-looking-tables-gt-0-2/
I am severely struggling with a data-rearrangement problem. The data below contains agreements (rows) which collapsed or were stable (column "collapse") and feature provisions which were reduced, kept, added or absent (columns "diff.pps_leadership", "diff.pps_cabinet", etc.).
I want to rearrange the data so that I get an overview of what percentage of the agreements that reduced, kept, or added a specific provision collapsed. The rows should be the provisions (diff.pps_leadership, ...), and the columns should be "reduced", "kept", and "added". The content of each cell should be the percentage that collapsed (only in relation to the agreements that reduced, kept, or added that provision, not the total).
In Excel I would do this with a pivot table, but I haven't been able to get there with R. I tried the cast, aggregate, melt, and transpose commands, but haven't succeeded.
Eventually, the result should look similar to this
https://docs.google.com/spreadsheets/d/1yhIbvTQTYkkwSFVxWEnPwvSvwTc0vuTYZxa15Eh1lT8/edit?usp=sharing
Hope my question is not too specific. Grateful for any hint/advice.
example <- structure(list(Agreement = structure(c(8L, 4L, 6L, 9L, 2L, 3L,
7L, 10L, 5L, 1L), .Label = c("Abuja Agreement", "Accra Peace Agreement",
"Arusha Agreement", "Arusha/Global Ceasefire Agreement", "Comprehensive Peace Agreement",
"InterabsentCongolese Dialogue", "Lome Agreement", "Lusaka Protocol",
"Ouagadougou Agreement", "Tansitional Constituion"), class = "factor"),
diff.pps_cabinet = structure(c(2L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L), .Label = c("kept", "reduced"), class = "factor"),
diff.pps_leadership = structure(c(1L, 2L, 3L, 3L, 3L, 3L,
3L, 3L, 2L, 3L), .Label = c("absent", "kept", "reduced"), class = "factor"),
diff.mps_milcmd = structure(c(3L, 2L, 3L, 3L, 3L, 3L, 1L,
3L, 2L, 3L), .Label = c("absent", "kept", "reduced"), class = "factor"),
diff.mps_armyint = structure(c(3L, 2L, 2L, 3L, 3L, 3L, 1L,
3L, 2L, 3L), .Label = c("absent", "kept", "reduced"), class = "factor"),
diff.eps_commission = structure(c(1L, 1L, 1L, 1L, 3L, 1L,
3L, 1L, 2L, 3L), .Label = c("absent", "kept", "reduced"), class = "factor"),
diff.eps_company = structure(c(1L, 2L, 1L, 1L, 3L, 1L, 1L,
1L, 2L, 3L), .Label = c("absent", "kept", "reduced"), class = "factor"),
diff.veto_leg = structure(c(1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = c("absent", "added"), class = "factor"),
diff.tps_devolution = structure(c(2L, 1L, 2L, 3L, 1L, 1L,
1L, 2L, 2L, 1L), .Label = c("absent", "kept", "reduced"), class = "factor"),
diff.ca.psh = structure(c(3L, 2L, 1L, 1L, 4L, 1L, 1L, 1L,
4L, 1L), .Label = c("absent", "added", "kept", "reduced"), class = "factor"),
collapse = structure(c(1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L,
1L), .Label = c("collapse", "stable"), class = "factor")), .Names = c("Agreement",
"diff.pps_cabinet", "diff.pps_leadership", "diff.mps_milcmd",
"diff.mps_armyint", "diff.eps_commission", "diff.eps_company",
"diff.veto_leg", "diff.tps_devolution", "diff.ca.psh", "collapse"
), class = "data.frame", row.names = c(NA, -10L))
The following gets the job done.
# Summarise, per provision column and per category, how many agreements
# collapsed and which agreements fall in that category.
library(data.table)
# Convert `example` to a data.table so the DT[i, j, by] syntax below works.
setDT(example)
# Provision columns to summarise. Only these four are included; add further
# diff.* column names here to cover more provisions.
mvs <- c("diff.pps_cabinet", "diff.pps_leadership",
"diff.mps_milcmd", "diff.mps_armyint")
# Categories a provision can take; each one yields two output columns.
vls <- c("reduced", "kept", "added", "absent")
# Reshape to long form keeping Agreement and collapse as identifiers; the
# melted columns are referred to below as `variable` (provision) and `value`.
melt(example, c("Agreement", "collapse"), mvs
# Then, within each provision (by = variable), build for every category vv:
#   1. a text "<s> out of <n> = <pct>% collapsed"
#   2. the names of the agreements in that category, separated by newlines.
)[ , setNames(vapply(
vls, function(vv) list(paste0(
# idx (rows of this provision equal to vv) and s (how many of those collapsed)
# are assigned inside this first argument and reused in the arguments after it.
s <- sum(collapse[idx <- value == vv] == "collapse"),
# When no row matches vv, sum(idx) is 0 and the percentage is 0/0, i.e. NaN.
" out of ", sum(idx), " = ", floor(100 * s / sum(idx)), "% collapsed"),
paste(Agreement[idx], collapse = "\n")),
vector("list", 2)),
# Output column names: <category>.percent and <category>.names, in vls order.
paste0(rep(vls, each = 2),
c(".percent", ".names"))), by = variable]
Currently this prints NaN when no agreement falls into a category; to fix this, replace sum(idx) in the denominator with (if (!any(idx)) 1 else sum(idx)).
Let's start with the example of the data:
structure(list(P1 = structure(c(1L, 1L, 3L, 3L, 5L, 5L, 5L, 5L,
4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 2L, 2L), .Label = c("Apple",
"Grape", "Orange", "Peach", "Tomato"), class = "factor"), P2 = structure(c(4L,
4L, 3L, 3L, 5L, 5L, 5L, 5L, 6L, 6L, 2L, 2L, 2L, 2L, 1L, 1L, 1L,
1L, 6L, 6L), .Label = c("Banana", "Cucumber", "Lemon", "Orange",
"Potato", "Tomato"), class = "factor"), P1_location_subacon = structure(c(2L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("Fridge", "Table"), class = "factor"),
P1_location_all_predictors = structure(c(2L, 2L, 3L, 3L,
3L, 3L, 3L, 3L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L), .Label = c("Table,Desk,Bag,Fridge,Bed,Shelf,Chair",
"Table,Shelf,Cupboard,Bed,Fridge", "Table,Shelf,Fridge"), class = "factor"),
P2_location_subacon = structure(c(1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Fridge",
"Shelf"), class = "factor"), P2_location_all_predictors = structure(c(3L,
3L, 2L, 2L, 1L, 1L, 1L, 1L, 3L, 3L, 2L, 2L, 2L, 2L, 3L, 3L,
3L, 3L, 3L, 3L), .Label = c("Shelf,Fridge", "Shelf,Fridge,Bed",
"Table,Shelf,Fridge"), class = "factor")), .Names = c("P1",
"P2", "P1_location_subacon", "P1_location_all_predictors", "P2_location_subacon",
"P2_location_all_predictors"), class = "data.frame", row.names = c(NA,
-20L))
I would like to compare two pairs of columns. The first pair I would like to compare is P1_location_subacon with P2_location_subacon. The second pair is P1_location_all_predictors with P2_location_all_predictors.
How do I want to compare them? Each column contains different "locations" of the fruit/vegetable. So:
if the location is the same in the first pair (P1/2_location_subacon) I would like to put number 2 in the additional column.
if the location is the same in the second pair (P1/2_location_all_predictors) I would like to put number 1 in the additional column. That one is a bit more complicated because not all of the locations have to be the same. At least one of them has to be the same for both fruits/vegetables.
if in both cases they are different put 0. You won't see such situation in the example data.
To summarize I show you the output which I would like to achieve:
structure(list(P1 = structure(c(1L, 1L, 3L, 3L, 5L, 5L, 5L, 5L,
4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 2L, 2L), .Label = c("Apple",
"Grape", "Orange", "Peach", "Tomato"), class = "factor"), P2 = structure(c(4L,
4L, 3L, 3L, 5L, 5L, 5L, 5L, 6L, 6L, 2L, 2L, 2L, 2L, 1L, 1L, 1L,
1L, 6L, 6L), .Label = c("Banana", "Cucumber", "Lemon", "Orange",
"Potato", "Tomato"), class = "factor"), P1_location_subacon = structure(c(2L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("Fridge", "Table"), class = "factor"),
P1_location_all_predictors = structure(c(2L, 2L, 3L, 3L,
3L, 3L, 3L, 3L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L), .Label = c("Table,Desk,Bag,Fridge,Bed,Shelf,Chair",
"Table,Shelf,Cupboard,Bed,Fridge", "Table,Shelf,Fridge"), class = "factor"),
P2_location_subacon = structure(c(1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Fridge",
"Shelf"), class = "factor"), P2_location_all_predictors = structure(c(3L,
3L, 2L, 2L, 1L, 1L, 1L, 1L, 3L, 3L, 2L, 2L, 2L, 2L, 3L, 3L,
3L, 3L, 3L, 3L), .Label = c("Shelf,Fridge", "Shelf,Fridge,Bed",
"Table,Shelf,Fridge"), class = "factor"), X = c(NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA), Correct = c(1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L)), .Names = c("P1",
"P2", "P1_location_subacon", "P1_location_all_predictors", "P2_location_subacon",
"P2_location_all_predictors", "X", "Correct"), class = "data.frame", row.names = c(NA,
-20L))
EDIT: using feedback from here Test two columns of strings for match row-wise in R I have improved my answer.
Where DT is your table:
library(data.table)

# Work on an all-character copy of the table. lapply() keeps one column per
# variable even for a single-row table, whereas sapply() would collapse that
# case into a plain vector and lose the column structure.
DT <- as.data.table(lapply(DT, as.character))

# For two character vectors of comma-separated location lists, return a logical
# vector that is TRUE where the lists in `a` and `b` share at least one whole
# location, FALSE where they share none, and NA where either side is missing.
# Whole entries are compared (not regex substrings), so "Bed" does not match
# "Bedroom", and the input columns never have to be rewritten as patterns.
shares_location <- function(a, b) {
  a_sets <- strsplit(a, ",", fixed = TRUE)
  b_sets <- strsplit(b, ",", fixed = TRUE)
  hit <- vapply(
    seq_along(a_sets),
    function(i) length(intersect(a_sets[[i]], b_sets[[i]])) > 0L,
    logical(1)
  )
  hit[is.na(a) | is.na(b)] <- NA
  hit
}

# match_all_pred is numeric 0/1 (hence the + 0); match_subacon stays logical.
DT[, match_all_pred := shares_location(P1_location_all_predictors, P2_location_all_predictors) + 0]
DT[, match_subacon := shares_location(P1_location_subacon, P2_location_subacon)]
I opted for two columns instead of your 0/1/2 notation, because a single coded column makes the code less straightforward: you have to rely on nested ifs. I also think that multiple columns are better, as you can clearly see the F/F, T/F, F/T, and T/T cases.
If you must create the 0/1/2, you can call
# Collapse both flags into one code: 2 when subacon matches, otherwise the
# all-predictors flag (0 or 1).
DT[, MyCol := 2 * match_subacon + (1 - match_subacon) * match_all_pred]
which assumes that subacon supersedes the all location.
Here is another way:
# Convert every column to character in place. lapply() with `[] <-` keeps the
# data frame's shape and column names even when it has a single row, where
# sapply() would return a plain vector and data.frame() would turn it into one
# column.
myData[] <- lapply(myData, as.character)
# TRUE when at least one element of setA also occurs in setB, FALSE otherwise
# (including when either vector is empty).
doesIntersect <- function(setA, setB) {
  any(setA %in% setB)
}
# Score each row: 0 = no shared location, 1 = the all-predictors lists share at
# least one location, 2 = the subacon locations are the same set. The value 2
# is assigned last, so it takes priority over 1.
split_locations <- function(column) strsplit(myData[[column]], ",")
overlaps_all <- mapply(
  doesIntersect,
  split_locations("P1_location_all_predictors"),
  split_locations("P2_location_all_predictors")
)
same_subacon <- mapply(
  setequal,
  split_locations("P1_location_subacon"),
  split_locations("P2_location_subacon")
)
myData$Correct <- 0
myData$Correct[overlaps_all] <- 1
myData$Correct[same_subacon] <- 2
> myData$Correct
[1] 1 1 2 2 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2