Merge / match two variables with one group of variables from another dataframe - r

I have two data.frames, df.1 and df.2, that I would like to merge or otherwise select data from to create a new data.frame. df.1 contains information about each individual (ID), sampling event (Event), Site and sample number (Sample). The tricky part for me is that the Site and the corresponding Sample are different for each ID-Event pairing. For example, F3-3 has Site "Plum" for Sample "1" and M6-3 has Site "Pear" for Sample "1".
df.2 has Sample1 and Sample2 which corresponds to the Sample information in df.1 by way of the ID-Event pairing.
I'd like to match/merge the information between these two data.frames. Essentially, get the "word" from Site in df.1 that matches the Sample number. An example (df.3) is below.
Within an ID-Event pairing, each Site has exactly one corresponding Sample (e.g. "Apple" will correspond to "1", not to both "1" and "4"). I know I could use merge if I was only matching, for example, Sample1 or Sample2, but I am not sure how to do this with both to populate Site1 and Site2 with the correctly matched word.
df.1 <- structure(list(ID = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("F1",
"F3", "M6"), class = "factor"), Sex = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L), .Label = c("F", "M"), class = "factor"), Event = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L,
4L, 4L, 4L, 4L), Site = structure(c(1L, 3L, 9L, 7L, 8L, 10L,
2L, 6L, 4L, 5L, 1L, 9L, 7L, 8L, 10L, 5L, 10L, 2L, 6L, 4L, 5L,
1L, 9L, 2L, 6L, 4L, 5L, 1L, 8L, 3L, 10L, 4L, 2L, 6L, 4L, 5L,
1L), .Label = c("Apple", "Banana", "Grape", "Guava", "Kiwi",
"Mango", "Orange", "Peach", "Pear", "Plum"), class = "factor"),
Sample = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L,
3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L)), .Names = c("ID",
"Sex", "Event", "Site", "Sample"), class = "data.frame", row.names = c(NA,
-37L))
#
df.2 <- structure(list(Sample1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L), Sample2 = c(2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
3L, 4L, 5L), V1 = c(0.12, 0.497, 0.715, 0, 0.001, 0, 0.829, 0,
0, 0.001, 0, 0.829), V2 = c(0.107, 0.273, 0.595, 0, 0.004, 0,
0.547, 0.001, 0.001, 0.107, 0.273, 0.595), ID = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L), .Label = c("F1",
"M6"), class = "factor"), Sex = structure(c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L), .Label = c("F", "M"), class = "factor"),
Event = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L)), .Names = c("Sample1",
"Sample2", "V1", "V2", "ID", "Sex", "Event"), class = "data.frame", row.names = c(NA,
-12L))
#
df.3 <- structure(list(Sample1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L), Sample2 = c(2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
3L, 4L, 5L), V1 = c(0.12, 0.497, 0.715, 0, 0.001, 0, 0.829, 0,
0, 0.001, 0, 0.829), V2 = c(0.107, 0.273, 0.595, 0, 0.004, 0,
0.547, 0.001, 0.001, 0.107, 0.273, 0.595), Site1 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L), .Label = c("Apple",
"Banana"), class = "factor"), Site2 = structure(c(2L, 8L, 6L,
7L, 9L, 1L, 5L, 3L, 4L, 5L, 3L, 4L), .Label = c("Banana", "Grape",
"Guava", "Kiwi", "Mango", "Orange", "Peach", "Pear", "Plum"), class = "factor"),
ID = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L), .Label = c("F1", "M6"), class = "factor"), Sex = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L), .Label = c("F",
"M"), class = "factor"), Event = c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 3L, 3L, 3L)), .Names = c("Sample1", "Sample2",
"V1", "V2", "Site1", "Site2", "ID", "Sex", "Event"), class = "data.frame", row.names = c(NA, -12L))

Two merges should do it:
# Site lookup keyed by individual, event and sample number.  ID has to be part
# of the key: the same Event/Sample pair can belong to several individuals
# with different Sites (e.g. Event 3, Sample 1 is "Plum" for F3 but "Pear"
# for M6), so joining on Sample and Event alone can duplicate rows or attach
# another individual's Site.
site_lookup <- unique(df.1[, c("ID", "Event", "Sample", "Site")])
# First merge: attach the Site that belongs to Sample1.
first <- merge(df.2, site_lookup,
               by.x = c("ID", "Event", "Sample1"),
               by.y = c("ID", "Event", "Sample"), all.x = TRUE)
# Second merge: attach the Site that belongs to Sample2.  Both Site columns
# now exist, so merge() names them Site.x (Sample1) and Site.y (Sample2).
second <- merge(first, site_lookup,
                by.x = c("ID", "Event", "Sample2"),
                by.y = c("ID", "Event", "Sample"), all.x = TRUE)
# merge() sorts multi-column keys as text ("10" before "2"); order the rows
# numerically instead and reset the row names.
second <- second[order(second$ID, second$Event, second$Sample1, second$Sample2), ]
rownames(second) <- NULL
print(second)
#    ID Event Sample2 Sample1    V1    V2 Sex Site.x Site.y
# 1  F1     1       2       1 0.120 0.107   F  Apple  Grape
# 2  F1     1       3       1 0.497 0.273   F  Apple   Pear
# 3  F1     1       4       1 0.715 0.595   F  Apple Orange
# 4  F1     1       5       1 0.000 0.000   F  Apple  Peach
# 5  F1     1       6       1 0.001 0.004   F  Apple   Plum
# 6  F1     1       7       1 0.000 0.000   F  Apple Banana
# 7  F1     1       8       1 0.829 0.547   F  Apple  Mango
# 8  F1     1       9       1 0.000 0.001   F  Apple  Guava
# 9  F1     1      10       1 0.000 0.001   F  Apple   Kiwi
# 10 M6     3       3       2 0.001 0.107   M Banana  Mango
# 11 M6     3       4       2 0.000 0.273   M Banana  Guava
# 12 M6     3       5       2 0.829 0.595   M Banana   Kiwi

Related

Frequency of one dataframe rows from another dataframe

Can someone help me count how often the rows of one dataframe occur in another dataframe?
df1(out)
structure(list(Item = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), class = "factor", .Label = "0S1576"), LC = structure(c(1L,
1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L), class = "factor", .Label = c("MW92",
"OY01", "RM11")), Fiscal.Month = c("2019-M06", "2019-M07", "2019-M06",
"2019-M07", "2019-M08", "2019-M09", "2019-M06", "2019-M07", "2019-M08"
)), row.names = c(NA, -9L), class = "data.frame")
df2(tempdf)
structure(list(Item = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "0S1576", class = "factor"),
LC = structure(c(1L, 1L, 1L, 1L, 2L, 3L, 4L, 6L, 5L, 1L,
2L, 2L, 3L, 3L), .Label = c("MW92", "OY01", "RM11", "RS11",
"WK14", "WK15"), class = "factor"), Fiscal.Month = structure(c(1L,
2L, 3L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("2019-M06",
"2019-M07", "2019-M08", "2019-M09"), class = "factor"), fcst = c(22L,
21L, 20L, 19L, 12L, 10L, 10L, 12L, 10L, 12L, 10L, 10L, 10L,
10L)), row.names = c(NA, -14L), class = "data.frame")
I want to count how often each Item, LC, Fiscal.Month combination of df1 occurs in df2.
You can count using table, matching df2 against df1 by means of factor; because more than one column is used for the match, the columns have to be combined with interaction.
# Tabulate the Item/LC/Fiscal.Month combinations of df2, restricted to (and
# ordered like) the combinations present in df1.  The key columns are selected
# explicitly on both sides so the levels stay correct even when df1 carries
# additional columns (for example a `freq` column added later).
key_cols <- c("Item", "LC", "Fiscal.Month")
table(factor(interaction(df2[key_cols]), levels = interaction(df1[key_cols])))
#0S1576.MW92.2019-M06 0S1576.MW92.2019-M07 0S1576.OY01.2019-M06
# 2 1 3
#0S1576.OY01.2019-M07 0S1576.OY01.2019-M08 0S1576.OY01.2019-M09
# 0 0 0
#0S1576.RM11.2019-M06 0S1576.RM11.2019-M07 0S1576.RM11.2019-M08
# 3 0 0
Or a speed improved version using match and tabulate:
# For every row of df1, count the rows of df2 with the same
# Item/LC/Fiscal.Month combination.  match() gives the df1 row each df2 row
# corresponds to (NA when there is none) and tabulate() counts the hits per
# df1 row.  The key columns are selected explicitly: with interaction(df1) the
# `freq` column created here would become part of the key on a second run and
# nothing would match any more.
key_cols <- c("Item", "LC", "Fiscal.Month")
(df1$freq <- tabulate(match(interaction(df2[key_cols]), interaction(df1[key_cols])), nrow(df1)))
#[1] 2 1 3 0 0 0 3 0 0
Or sometimes even faster using fastmatch:
library(fastmatch)
# Same counting as with match(), using fastmatch::fmatch() for the lookup.
# The key columns are selected explicitly so that a `freq` column already
# present in df1 does not become part of the key.
key_cols <- c("Item", "LC", "Fiscal.Month")
df1$freq <- tabulate(fmatch(interaction(df2[key_cols]), interaction(df1[key_cols])), nrow(df1))

how to count the number of rows of specific column that has specific character

I have data that I want to know the number of specific rows that are with specific character. The data looks like the following
df<-structure(list(Gene.refGene = structure(c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 4L, 4L, 4L, 4L, 4L), .Label = c("A1BG", "A1BG-AS1", "A1CF",
"A1CF;PRKG1"), class = "factor"), Chr = structure(c(2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("chr10", "chr19"
), class = "factor"), Start = c(58858232L, 58858615L, 58858676L,
58859052L, 58859055L, 58859066L, 58859510L, 58863162L, 58864479L,
58864150L, 58864867L, 58864879L, 58865857L, 52566433L, 52569637L,
52571047L, 52573510L, 52576068L, 52580561L, 52603659L, 52619845L,
52625849L, 52642500L, 52650951L, 52675605L, 52703952L, 52723140L,
52723638L), End = c(58858232L, 58858615L, 58858676L, 58859052L,
58859055L, 58859066L, 58859510L, 58863166L, 58864479L, 58864150L,
58864867L, 58864879L, 58865857L, 52566433L, 52569637L, 52571047L,
52573510L, 52576068L, 52580561L, 52603659L, 52619845L, 52625849L,
52642500L, 52650958L, 52675605L, 52703952L, 52723140L, 52723638L
), Ref = structure(c(3L, 5L, 2L, 2L, 3L, 2L, 5L, 7L, 6L, 6L,
2L, 1L, 5L, 6L, 5L, 3L, 2L, 5L, 6L, 3L, 3L, 6L, 3L, 4L, 3L, 6L,
6L, 3L), .Label = c("-", "A", "C", "CTCTCTCT", "G", "T", "TTTTT"
), class = "factor"), Alt_df1 = structure(c(1L, 1L, 4L, 4L, 1L,
4L, 5L, 1L, 3L, 3L, 4L, 4L, 3L, 1L, 2L, 5L, 1L, 2L, 1L, 5L, 5L,
2L, 5L, 1L, 4L, 3L, 4L, 2L), .Label = c("-", "A", "C", "G", "T"
), class = "factor")), class = "data.frame", row.names = c(NA,
-28L))
I want to know how many rows of the column named "Alt_df1" are missing, i.e. contain either - or NA.
Here is an answer using which and utilising base R's LETTERS data:
# Counts the rows whose Alt_df1 is not a single uppercase letter: "-" and NA
# both qualify, but so would any multi-character value.
length(which(!df$Alt_df1%in%LETTERS))
#[1] 8
Or using just which:
# Counts only the literal "-" entries; a comparison with NA yields NA, which
# which() drops, so NA values are not counted here.
length(which(df$Alt_df1=="-"))
#[1] 8
One way would be to create a logical vector using %in% and then sum over them to count the number of occurrences.
# %in% never returns NA and treats NA as matching NA, so this counts both the
# "-" entries and the missing values.
sum(df$Alt_df1 %in% c("-", NA))
#[1] 8
Or we can also subset and count the number of rows.
# Keep the rows whose Alt_df1 is "-" or NA and count them.
nrow(subset(df, Alt_df1 %in% c("-", NA)))
which can also be done in dplyr by
library(dplyr)
# Same row filter expressed as a dplyr pipeline; nrow gives the count.
df %>% filter(Alt_df1 %in% c("-", NA)) %>% nrow
Another option using grepl
# grepl() flags every value that contains a "-" anywhere (not only values that
# are exactly "-") and returns FALSE for NA, so the NAs are added separately.
with(df, sum(grepl("-", Alt_df1)) + sum(is.na(Alt_df1)))
and I am sure there are multiple other ways.

Add a column by counting words for each row in R code

I have a data frame of 2511 rows and 6 columns with candy and color items. Please see the first 15 rows as below:
structure(list(x = 1:15, iteml = structure(c(2L, 1L, 1L, 1L,
5L, 4L, 4L, 3L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), .Label = c("{dulce1_rojo",
"{dulce2_verde", "{dulce7_plata", "{miel21_amarillo", "{miel30_azul"
), class = "factor"), item2 = structure(c(4L, 2L, 2L, 2L, 1L,
5L, 5L, 4L, 3L, 3L, 4L, 1L, 4L, 4L, 1L), .Label = c("chocolate2l_amarillo",
"dulce2_verde", "dulce7_plata", "miel21_amarillo", "miel30_azul"
), class = "factor"), item3 = structure(c(1L, 1L, 3L, 3L, 2L,
2L, 1L, 2L, 2L, 3L, 2L, 2L, 2L, 1L, 2L), .Label = c("chocolate2l_amarillo",
"chocolate30_azul", "miel21_amarillo"), class = "factor"), item4 = structure(c(2L,
2L, 2L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("chocolate2l_amarillo",
"chocolate32_violeta", "cookie30_azul"), class = "factor"), item5 = structure(c(2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("cookie2l_amarillo}",
"cookie32_violeta}"), class = "factor"), item6 = structure(c(4L,
6L, 1L, 3L, 6L, 1L, 2L, 4L, 6L, 2L, 5L, 6L, 1L, 2L, 4L), .Label = c(">{chocolate2l_amarillo}",
">{chocolate30_azul}", ">{chocolate32_violeta}", ">{dulce1_rojo}",
">{dulce7_plata}", ">{miel21_amarillo}"), class = "factor")), class = "data.frame", row.names = c(NA,
-15L))
I don't know how to count, in new columns, only the kinds of candy that each row has. This is the first line of the expected output of the resulting data frame:
x iteml item2 item3 item4 item5 item6 dulce miel chocolate cookie
1 1 {dulce2_verde miel21_amarillo chocolate2l_amarillo chocolate32_violeta cookie32_violeta} >{dulce1_rojo} 2 1 2 1
I'm stuck and I'd appreciate a little help.
You can use the apply function to run grepl on each row of the initial data frame, and sapply to iterate over the four ingredients you indicated. Then use cbind to combine the initial data frame and the ingredient counts into one data frame. Please see the code below:
# initialize data frame
df <- structure(list(x = 1:15, iteml = structure(c(2L, 1L, 1L, 1L,
5L, 4L, 4L, 3L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), .Label = c("{dulce1_rojo",
"{dulce2_verde", "{dulce7_plata", "{miel21_amarillo", "{miel30_azul"
), class = "factor"), item2 = structure(c(4L, 2L, 2L, 2L, 1L,
5L, 5L, 4L, 3L, 3L, 4L, 1L, 4L, 4L, 1L), .Label = c("chocolate2l_amarillo",
"dulce2_verde", "dulce7_plata", "miel21_amarillo", "miel30_azul"
), class = "factor"), item3 = structure(c(1L, 1L, 3L, 3L, 2L,
2L, 1L, 2L, 2L, 3L, 2L, 2L, 2L, 1L, 2L), .Label = c("chocolate2l_amarillo",
"chocolate30_azul", "miel21_amarillo"), class = "factor"), item4 = structure(c(2L,
2L, 2L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("chocolate2l_amarillo",
"chocolate32_violeta", "cookie30_azul"), class = "factor"), item5 = structure(c(2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("cookie2l_amarillo}",
"cookie32_violeta}"), class = "factor"), item6 = structure(c(4L,
6L, 1L, 3L, 6L, 1L, 2L, 4L, 6L, 2L, 5L, 6L, 1L, 2L, 4L), .Label = c(">{chocolate2l_amarillo}",
">{chocolate30_azul}", ">{chocolate32_violeta}", ">{dulce1_rojo}",
">{dulce7_plata}", ">{miel21_amarillo}"), class = "factor")), class = "data.frame", row.names = c(NA,
-15L))
# counting ingredients: for every row, how many item columns mention each kind
ingredients <- c("dulce", "miel", "chocolate", "cookie")
# Search the item columns only; the id column `x` is left out so the matrix
# handed to apply() holds nothing but the item labels.
items <- as.matrix(df[setdiff(names(df), "x")])
# Number of items per row whose label contains `kind` (literal match).
count_kind <- function(kind) {
  apply(items, 1, function(row) sum(grepl(kind, row, fixed = TRUE)))
}
# vapply() fixes the result to one integer per row for every ingredient and
# names the columns after the ingredients.
counts <- vapply(ingredients, count_kind, integer(nrow(df)))
df_res <- cbind(df, counts)
head(df_res)
Output:
x iteml item2 item3 item4 item5 item6 dulce miel chocolate cookie
1 1 {dulce2_verde miel21_amarillo chocolate2l_amarillo chocolate32_violeta cookie32_violeta} >{dulce1_rojo} 2 1 2 1
2 2 {dulce1_rojo dulce2_verde chocolate2l_amarillo chocolate32_violeta cookie32_violeta} >{miel21_amarillo} 2 1 2 1
3 3 {dulce1_rojo dulce2_verde miel21_amarillo chocolate32_violeta cookie32_violeta} >{chocolate2l_amarillo} 2 1 2 1
4 4 {dulce1_rojo dulce2_verde miel21_amarillo chocolate2l_amarillo cookie32_violeta} >{chocolate32_violeta} 2 1 2 1
5 5 {miel30_azul chocolate2l_amarillo chocolate30_azul cookie30_azul cookie2l_amarillo} >{miel21_amarillo} 0 2 2 2
6 6 {miel21_amarillo miel30_azul chocolate30_azul cookie30_azul cookie2l_amarillo} >{chocolate2l_amarillo} 0 2 2 2

Compare columns and put the output in additional column

Let's start with the example of the data:
structure(list(P1 = structure(c(1L, 1L, 3L, 3L, 5L, 5L, 5L, 5L,
4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 2L, 2L), .Label = c("Apple",
"Grape", "Orange", "Peach", "Tomato"), class = "factor"), P2 = structure(c(4L,
4L, 3L, 3L, 5L, 5L, 5L, 5L, 6L, 6L, 2L, 2L, 2L, 2L, 1L, 1L, 1L,
1L, 6L, 6L), .Label = c("Banana", "Cucumber", "Lemon", "Orange",
"Potato", "Tomato"), class = "factor"), P1_location_subacon = structure(c(2L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("Fridge", "Table"), class = "factor"),
P1_location_all_predictors = structure(c(2L, 2L, 3L, 3L,
3L, 3L, 3L, 3L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L), .Label = c("Table,Desk,Bag,Fridge,Bed,Shelf,Chair",
"Table,Shelf,Cupboard,Bed,Fridge", "Table,Shelf,Fridge"), class = "factor"),
P2_location_subacon = structure(c(1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Fridge",
"Shelf"), class = "factor"), P2_location_all_predictors = structure(c(3L,
3L, 2L, 2L, 1L, 1L, 1L, 1L, 3L, 3L, 2L, 2L, 2L, 2L, 3L, 3L,
3L, 3L, 3L, 3L), .Label = c("Shelf,Fridge", "Shelf,Fridge,Bed",
"Table,Shelf,Fridge"), class = "factor")), .Names = c("P1",
"P2", "P1_location_subacon", "P1_location_all_predictors", "P2_location_subacon",
"P2_location_all_predictors"), class = "data.frame", row.names = c(NA,
-20L))
I would like to compare two pairs of columns. The first pair I would like to compare is P1_location_subacon with P2_location_subacon. The second pair is P1_location_all_predictors with P2_location_all_predictors.
How I want to compare them ? In each column you have different "locations" of the fruit/vegetable. So:
if the location is the same in the first pair (P1/2_location_subacon) I would like to put number 2 in the additional column.
if the location is the same in the second pair (P1/2_location_all_predictors) I would like to put number 1 in the additional column. That one is a bit more complicated because not all of the locations have to be the same. At least one of them has to be the same for both fruits/vegetables.
if in both cases they are different put 0. You won't see such situation in the example data.
To summarize I show you the output which I would like to achieve:
structure(list(P1 = structure(c(1L, 1L, 3L, 3L, 5L, 5L, 5L, 5L,
4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 2L, 2L), .Label = c("Apple",
"Grape", "Orange", "Peach", "Tomato"), class = "factor"), P2 = structure(c(4L,
4L, 3L, 3L, 5L, 5L, 5L, 5L, 6L, 6L, 2L, 2L, 2L, 2L, 1L, 1L, 1L,
1L, 6L, 6L), .Label = c("Banana", "Cucumber", "Lemon", "Orange",
"Potato", "Tomato"), class = "factor"), P1_location_subacon = structure(c(2L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("Fridge", "Table"), class = "factor"),
P1_location_all_predictors = structure(c(2L, 2L, 3L, 3L,
3L, 3L, 3L, 3L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L), .Label = c("Table,Desk,Bag,Fridge,Bed,Shelf,Chair",
"Table,Shelf,Cupboard,Bed,Fridge", "Table,Shelf,Fridge"), class = "factor"),
P2_location_subacon = structure(c(1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Fridge",
"Shelf"), class = "factor"), P2_location_all_predictors = structure(c(3L,
3L, 2L, 2L, 1L, 1L, 1L, 1L, 3L, 3L, 2L, 2L, 2L, 2L, 3L, 3L,
3L, 3L, 3L, 3L), .Label = c("Shelf,Fridge", "Shelf,Fridge,Bed",
"Table,Shelf,Fridge"), class = "factor"), X = c(NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA), Correct = c(1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L)), .Names = c("P1",
"P2", "P1_location_subacon", "P1_location_all_predictors", "P2_location_subacon",
"P2_location_all_predictors", "X", "Correct"), class = "data.frame", row.names = c(NA,
-20L))
EDIT: using feedback from the question "Test two columns of strings for match row-wise in R", I have improved my answer.
Where DT is your table:
# Load data.table and turn DT into a data.table.
library(data.table)
setDT(DT)
# Rebuild DT with every column coerced to character, so factor columns become
# plain strings.
DT <- data.table(sapply(DT,as.character))
# Rewrite the comma-separated P1 location lists as regex alternations
# ("A,B" -> "A|B") so they can be used as grepl() patterns.
DT[, P1_location_all_predictors := gsub(",","|",P1_location_all_predictors)]
DT[, P1_location_subacon := gsub(",","|",P1_location_subacon)]
# Grouping by the P1 column gives grepl() one pattern per group; the flag is
# TRUE when any P1 location occurs in the P2 string.  `+ 0` stores it as 0/1.
DT[, match_all_pred := grepl(P1_location_all_predictors, P2_location_all_predictors) + 0, by = P1_location_all_predictors]
# Same test for the subacon pair; this flag stays logical (no `+ 0`).
DT[, match_subacon := grepl(P1_location_subacon, P2_location_subacon), by = P1_location_subacon]
# Put the commas back so the P1 columns look as they did before.
DT[, P1_location_all_predictors := gsub("\\|",",",P1_location_all_predictors)]
DT[, P1_location_subacon := gsub("\\|",",",P1_location_subacon)]
I opted for two columns instead of your 0/1/2 notation, because the 0/1/2 notation makes the code less straightforward: you have to rely on nested ifs. I also think that multiple columns are better, as you can clearly see the F/F, T/F, F/T, and T/T cases.
If you must create the 0/1/2, you can call
# Collapse the two flags into one code: 2 whenever match_subacon is TRUE,
# otherwise the value of match_all_pred (1 or 0).
DT[, MyCol := match_all_pred - match_subacon*match_all_pred+match_subacon*2]
which assumes that subacon supersedes the all location.
Here is another way:
# Convert every column to character in place.  Assigning into myData[] keeps
# the data-frame shape even for a single-row input, where sapply() would
# collapse the result to a plain vector and data.frame() would then build a
# one-column frame from it.
myData[] <- lapply(myData, as.character)

# TRUE when the two location sets share at least one element.
doesIntersect <- function(setA, setB) {
  length(intersect(setA, setB)) > 0
}

# Split one comma-separated location column into a list of location sets.
splitLocations <- function(column) {
  strsplit(myData[[column]], ",", fixed = TRUE)
}

# Apply `test` to the i-th elements of two lists; always returns a logical
# vector, also for zero rows (where mapply() would return an empty list).
compareRows <- function(left, right, test) {
  vapply(seq_along(left), function(i) test(left[[i]], right[[i]]), logical(1))
}

sharesPredictor <- compareRows(splitLocations("P1_location_all_predictors"),
                               splitLocations("P2_location_all_predictors"),
                               doesIntersect)
sameSubacon <- compareRows(splitLocations("P1_location_subacon"),
                           splitLocations("P2_location_subacon"),
                           setequal)

# 0 = no match, 1 = at least one predictor location in common, 2 = identical
# subacon location.  The subacon match is assigned last so it takes
# precedence over the predictor match.
myData$Correct <- rep(0, nrow(myData))
myData$Correct[sharesPredictor] <- 1
myData$Correct[sameSubacon] <- 2
> myData$Correct
[1] 1 1 2 2 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2

Print a-priori contrasts with type III sums of squares using Anova() in R

I am trying to print a-priori contrasts with type III sums of squares results. (Please don't speak about type I vs. type III. That's not the point of my question.) I can print the contrasts like I need using summary.aov(), however that uses type I SS. When I use the Anova() function from library(car) to get type III SS, it doesn't print the contrasts. I have also tried using drop1() with the lm() model, but this just prints the same results as Anova() (without the contrasts).
Please advise on a way to print the results of the contrasts with type III SS. An example follows.
Sample data:
DF <- structure(list(Code = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L,
3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L, 8L, 8L, 8L, 9L, 9L,
9L, 10L, 10L, 10L, 11L, 11L, 11L, 12L, 12L, 12L), .Label = c("A",
"B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L"), class =
"factor"), GzrTreat = structure(c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), contrasts = structure(c(1,
-2, 1, 1, 0, -1), .Dim = c(3L, 2L), .Dimnames = list(c("I",
"N", "R"), NULL)), .Label = c("I", "N", "R"), class = "factor"),
BugTreat = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label =
c("Immigration", "Initial", "None"), class = "factor"), TempTreat =
structure(c(2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = c("Not Warm", "Warmed"), class =
"factor"), ShadeTreat = structure(c(2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L,
2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L,
1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L), .Label = c("Light",
"Shaded"), class = "factor"), EpiChla = c(0.268482353, 0.423119608,
0.579507843, 0.738839216, 0.727856863, 0.523960784, 0.405801961,
0.335964706, 0.584441176, 0.557543137, 0.436456863, 0.563909804,
0.432398039, 0.344956863, 0.340309804, 0.992884314, 0.938390196,
0.663270588, 0.239833333, 0.62875098, 0.466011765, 0.536182353,
0.340309804, 0.721172549, 0.752082353, 0.269372549, 0.198180392,
1.298882353, 0.298354902, 0.913139216, 0.846129412, 0.922317647,
0.727033333, 1.187662745, 0.35622549, 0.073547059), log_EpiChla =
c(0.10328443, 0.153241402, 0.198521787, 0.240259426, 0.237507762,
0.182973791, 0.147924145, 0.125794985, 0.19987612, 0.192440084,
0.157292589, 0.194211702, 0.156063718, 0.128708355, 0.127205194,
0.299482089, 0.287441205, 0.220962908, 0.093363308, 0.21185469,
0.166137456, 0.186442772, 0.127205194, 0.235824411, 0.243554515,
0.103589102, 0.078522208, 0.361516746, 0.113393422, 0.281746574,
0.266262141, 0.283825153, 0.23730072, 0.339980371, 0.132331903,
0.030821087), MeanZGrowthAFDM_g = c(0.00665, 0.003966667, 0.004466667,
0.01705, 0.0139, 0.0129, 0.0081, 0.003833333, 0.00575, 0.011266667,
0.0103, 0.009, 0.0052, 0.00595, 0.0105, 0.0091, 0.00905, 0.0045, 0.0031,
0.006466667, 0.0053, 0.009766667, 0.0181, 0.00725, 0, 0.0012, 5e-04,
0.0076, 0.00615, 0.0814, NA, 0.0038, 0.00165, 0.0046, 0, 0.0015)),
.Names = c("Code", "GzrTreat", "BugTreat", "TempTreat", "ShadeTreat",
"EpiChla", "log_EpiChla", "MeanZGrowthAFDM_g"), class = "data.frame",
row.names = c(NA, -36L))
Code:
## a-priori contrasts
# NOTE(review): stats is normally attached by default, so this call is
# presumably redundant.
library(stats)
# Two user-defined contrasts for the three GzrTreat levels (I, N, R):
# column 1 = c(1, -2, 1), column 2 = c(1, 0, -1).
contrasts(DF$GzrTreat) <- cbind(c(1,-2,1), c(1,0,-1))
# Cross-product of the contrast matrix; zero off-diagonal entries mean the
# two contrasts are orthogonal.
round(crossprod(contrasts(DF$GzrTreat)))
# Labels for contrast columns 1 and 2, passed as `split` below.
c_labels <- list(GzrTreat=list('presence'=1, 'immigration'=2))
## model
library(car)
# Full three-way factorial model on the log-transformed response.
EpiLM <- lm(log_EpiChla~TempTreat*GzrTreat*ShadeTreat, DF)
summary.aov(EpiLM, split=c_labels) # prints the contrast results, but
# uses Type I SS
Anova(EpiLM, split=c_labels, type="III") # uses Type III SS, but prints
# no contrasts
drop1(EpiLM, ~., test="F") # again, this does not print contrasts
# Wanted: contrast results like those from summary.aov(), AND Type III SS
# like those from Anova()

Resources