calculate median by groups with creating categorical variable in R - r

Here example of my data
dput(mydat)
structure(list(ID.group = c(NA, 10150591L, NA, 10150591L, NA,
10150591L, NA, 68837296L, NA, 68837296L, NA, 68837296L, NA, 124771228L,
NA, 124771228L), UserID = c(NA, 181078814L, NA, 88578209L, NA,
30240768L, NA, 334686951L, NA, 297170412L, NA, 265332359L, NA,
216632504L, NA, 5272133L), countlike = c(NA, 44L, NA, 50L, NA,
99L, NA, 1L, NA, 1L, NA, 15L, NA, 41L, NA, 20L), statistics.snt = structure(c(1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("",
"fb"), class = "factor"), statistics.created_at = structure(c(1L,
8L, 1L, 4L, 1L, 7L, 1L, 2L, 1L, 2L, 1L, 5L, 1L, 3L, 1L, 6L), .Label = c("",
"10.04.2020 9:14", "11.04.2020 0:01", "11.04.2020 19:22", "12.04.2020 19:45",
"12.04.2020 6:54", "13.04.2020 20:47", "17.04.2020 23:02"), class = "factor"),
statistics.updated_at = structure(c(1L, 8L, 1L, 7L, 1L, 6L,
1L, 3L, 1L, 3L, 1L, 4L, 1L, 5L, 1L, 2L), .Label = c("", "22.04.2020 12:27",
"22.04.2020 12:51", "22.04.2020 14:19", "22.04.2020 5:41",
"22.04.2020 6:18", "22.04.2020 7:37", "30.04.2020 16:55"), class = "factor"),
statistics.is_recount = structure(c(1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("", "False"
), class = "factor")), class = "data.frame", row.names = c(NA,
-16L))
I want to calculate the median of countlike for each ID group:
library(psych)
describeBy(mydat,mydat$ID.group)
But I didn't get the result I need; I get all of the descriptive statistics instead.
How can i get results like
ID group median countlike
10150591 50
68837296 1
Then how calculate categorical variable for UserID?
For example, the median for ID group 10150591 is 50. If a UserID's countlike is more than 25% above the median of its group, it should be labelled "red".
25% of 50 is 50/100*25 = 12.5, so the threshold is 50 + 12.5 = 62.5: if userid=30240768 has a countlike greater than 62.5, it is "red".
In fact userid=30240768 has the value 99, so it is "red".
If a UserID's countlike is more than 25% below the median of its group, it is "green": 50 - 12.5 = 37.5, and there is no such value here. Finally, if the value is within ±24% of the group median, it is "orange": 24% of 50 is 50/100*24 = 12, so a countlike of 50 ± 12 (38-62) is "orange".
So desired output
ID group UserID countlike median countlike
10150591 181078814 44 orange
10150591 88578209 50 orange
10150591 30240768 99 red
68837296 334686951 1 green
68837296 297170412 1 green
68837296 265332359 15 red
How do I comply with such conditions?

Here is an answer using dplyr. We aggregate the data to medians, merge the medians with the original data, and then calculate color.
First, we read the dput() data from the OP and remove rows that are missing.
data <- structure(list(ID.group = c(NA, 10150591L, NA, 10150591L, NA,
10150591L, NA, 68837296L, NA, 68837296L, NA, 68837296L, NA, 124771228L,
NA, 124771228L), UserID = c(NA, 181078814L, NA, 88578209L, NA,
30240768L, NA, 334686951L, NA, 297170412L, NA, 265332359L, NA,
216632504L, NA, 5272133L), countlike = c(NA, 44L, NA, 50L, NA,
99L, NA, 1L, NA, 1L, NA, 15L, NA, 41L, NA, 20L), statistics.snt = structure(c(1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("",
"fb"), class = "factor"), statistics.created_at = structure(c(1L,
8L, 1L, 4L, 1L, 7L, 1L, 2L, 1L, 2L, 1L, 5L, 1L, 3L, 1L, 6L), .Label = c("",
"10.04.2020 9:14", "11.04.2020 0:01", "11.04.2020 19:22", "12.04.2020 19:45",
"12.04.2020 6:54", "13.04.2020 20:47", "17.04.2020 23:02"), class = "factor"),
statistics.updated_at = structure(c(1L, 8L, 1L, 7L, 1L, 6L,
1L, 3L, 1L, 3L, 1L, 4L, 1L, 5L, 1L, 2L), .Label = c("", "22.04.2020 12:27",
"22.04.2020 12:51", "22.04.2020 14:19", "22.04.2020 5:41",
"22.04.2020 6:18", "22.04.2020 7:37", "30.04.2020 16:55"), class = "factor"),
statistics.is_recount = structure(c(1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("", "False"
), class = "factor")), class = "data.frame", row.names = c(NA,
-16L))
# Keep only the rows that actually carry a group id (the NA rows are padding).
has_group <- !is.na(data[["ID.group"]])
data <- data[has_group, ]
Next, we load dplyr and calculate the desired output.
library(dplyr)
# 1. Aggregate: one median of countlike per ID.group.
# 2. Join the medians back onto the original rows. The key is spelled out so the
#    join does not silently depend on whichever column names happen to match.
# 3. Classify each row relative to its group median:
#    more than 25% above -> "red", more than 25% below -> "green",
#    otherwise (within +/-25%, bounds included) -> "orange".
mergedData <- data %>%
  group_by(ID.group) %>%
  summarise(mdn_countlike = median(countlike)) %>%
  inner_join(data, by = "ID.group") %>%
  mutate(color = case_when(
    countlike > 1.25 * mdn_countlike ~ "red",
    countlike < 0.75 * mdn_countlike ~ "green",
    countlike >= 0.75 * mdn_countlike &
      countlike <= 1.25 * mdn_countlike ~ "orange"
  ))
mergedData[, c("ID.group", "UserID", "countlike", "mdn_countlike", "color")]
...and the output:
> mergedData[,c("ID.group","UserID","countlike","mdn_countlike","color")]
# A tibble: 8 x 5
ID.group UserID countlike mdn_countlike color
<int> <int> <int> <dbl> <chr>
1 10150591 181078814 44 50 orange
2 10150591 88578209 50 50 orange
3 10150591 30240768 99 50 red
4 68837296 334686951 1 1 orange
5 68837296 297170412 1 1 orange
6 68837296 265332359 15 1 red
7 124771228 216632504 41 30.5 red
8 124771228 5272133 20 30.5 green
>

Related

Renaming levels in multiple factors in a dataframe

I have multiple factors in a dataframe that each contain levels with names "Very long text 1" and "Very long text 2". I want to rename the levels to "1" and "2". I can easily do it for each individual factor with
# Rename a single level in place: index the levels vector by the label to replace.
levels(df$factorname1)[levels(df$factorname1) == "Very long text 1"] <- "1"
But it's cumbersome to repeat it for a few hundred factors. Is there a way to rename the level for a range of factors or simply locate "Very long text 1" anywhere in the dataframe and rename it?
Example data:
structure(list(Q5.2.01 = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("",
"A whole different level\n"), class = "factor"), Q5.2.02 = structure(c(2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
1L), .Label = c("", "Very long text 2\n"), class = "factor"),
Q5.2.03 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), Q5.2.04 = structure(c(1L, 2L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), .Label = c("", "Very long text 2\n"), class = "factor"),
Q5.2.05 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 1L), .Label = c("", "A whole different level\n",
"Very long text 2 blablabla\n"), class = "factor"), Q5.2.06 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
1L, 1L), .Label = c("", "Very long text 2\n"), class = "factor"),
Q5.2.07 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), Q5.2.08 = c(NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Q5.2.09 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = c("", "Very long text 1\n"), class = "factor"),
Q5.2.10 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), Q5.2.11 = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), .Label = c("", "Very long text 2\n"), class = "factor"),
Q5.2.12 = structure(c(1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", "Very long text 2\n"
), class = "factor"), Q5.2.13 = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("",
"Very long text 1\n"), class = "factor"), Q5.2.14 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
1L, 1L), .Label = c("", "Very long text 1\n", "Very long text 2\n"
), class = "factor"), Q5.2.15 = structure(c(1L, 1L, 1L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("",
"Very long text 2\n"), class = "factor"), Q5.2.16 = structure(c(1L,
1L, 3L, 1L, 1L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L,
1L, 2L), .Label = c("", "Very long text 1\n", "Very long text 2\n"
), class = "factor"), respondentID = structure(c(8L, 8L,
8L, 6L, 7L, 7L, 5L, 5L, 5L, 4L, 4L, 4L, 3L, 3L, 3L, 2L, 2L,
1L), .Label = c("EO13", "EO15", "EO17", "EO19", "EO21", "Eo23",
"EO23", "EO24"), class = "factor")), .Names = c("Q5.2.01",
"Q5.2.02", "Q5.2.03", "Q5.2.04", "Q5.2.05", "Q5.2.06", "Q5.2.07",
"Q5.2.08", "Q5.2.09", "Q5.2.10", "Q5.2.11", "Q5.2.12", "Q5.2.13", "Q5.2.14", "Q5.2.15", "Q5.2.16", "respondentID"), class = "data.frame", row.names = c(NA,
-18L))
You can use the revalue function from the package plyr. (It can also be done with base R, but I like this solution).
Here's an example
> DF <- data.frame(V1 = factor(c("A", "B", "C", "A", "D", "E")),
V2=factor(c("A", "A", "A", "A", "D", "E")))
> DF
V1 V2
1 A A
2 B A
3 C A
4 A A
5 D D
6 E E
Now let's assume that the factor level D is the one we wish to replace. Then we can use lapply to iterate over columns in the data frame (remember to only select the relevant factors), and revalue to specify the replacement(s). Wrap everything in as.data.frame to convert back to a data frame.
> library("plyr")
> as.data.frame(lapply(DF, function(x) { revalue(x, c("D"="YAY")) }))
V1 V2
1 A A
2 B A
3 C A
4 A A
5 YAY YAY
6 E E
Update
You can restrict attention to factors by adding a subset selection to the data frame
# Select the factor columns with is.factor() so that columns with more than one
# class (e.g. ordered factors) are handled, and subset list-style (DF[mask]) so a
# single matching column stays a data frame instead of collapsing to a vector.
is_factor_col <- vapply(DF, is.factor, logical(1))
as.data.frame(lapply(DF[is_factor_col], function(x) revalue(x, c("D" = "YAY"))))

Counting number of words, separated by comma ",", in each column of a data frame in R

I have a dataset I downloaded from The Human Protein Atlas which has annotations for the subcellular localization of 12,004 proteins. I've subset this file to include only "Gene name" and 4 columns describing how reliable that location is (based on immunofluorescently stained cells). These are "Validated">"Supported">"Approved">"Uncertain".
I've come up with a scoring system that I would like to apply to an LC-MS spectral count dataset I have by 1) weighting the quality of the annotation and 2) penalizing the number of locations the protein is found in (image of proposed scoring system).
The TL;DR is that I need to count how many terms there are in each column of the following data set and get a data frame of this information.
df <- read.csv("proteinAtlas.csv")
dput(df)
structure(list(Gene_symbol = structure(1:49, .Label = c("AAAS",
"AAMP", "AAR2", "AARD", "AARS", "AARS2", "AARSD1", "ABCA13",
"ABCB6", "ABCB7", "ABCB8", "ABCC1", "ABCC4", "ABCD3", "ABCE1",
"ABCF1", "ABCF2", "ABCF3", "ABHD10", "ABHD14B", "ABHD6", "ABI1",
"ABI2", "ABL2", "ACAA1", "ACAA2", "ACACA", "ACAD9", "ACADM",
"ACADS", "ACADVL", "ACAP1", "ACAP2", "ACAT1", "ACAT2", "ACBD3",
"ACBD5", "ACIN1", "ACLY", "ACO2", "ACOT1", "ACOT13", "ACOT2",
"ACOT7", "ACOT8", "ACOT9", "ACOX1", "ACP1", "ACP5"), class = "factor"),
Validated = structure(c(1L, 2L, 1L, 1L, 2L, 4L, 1L, 1L, 3L,
1L, 1L, 1L, 1L, 5L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
5L, 1L, 1L, 4L, 4L, 1L, 1L, 1L, 1L, 4L, 1L, 1L, 5L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 6L, 1L, 1L), .Label = c("", "Cytosol",
"Golgi apparatus", "Mitochondria", "Peroxisomes", "Vesicles"
), class = "factor"), Supported = structure(c(1L, 9L, 1L,
1L, 1L, 1L, 1L, 1L, 5L, 10L, 10L, 12L, 1L, 1L, 1L, 1L, 4L,
1L, 1L, 6L, 1L, 3L, 1L, 11L, 1L, 10L, 2L, 1L, 1L, 10L, 10L,
1L, 1L, 1L, 4L, 8L, 1L, 11L, 7L, 10L, 1L, 1L, 1L, 4L, 13L,
1L, 1L, 1L, 1L), .Label = c("", "Actin filaments;Cytosol",
"Cell Junctions;Plasma membrane", "Cytosol", "Cytosol;Mitochondria;Nucleoplasm;Plasma membrane",
"Cytosol;Nucleoli;Nucleus", "Cytosol;Nucleoplasm;Plasma membrane",
"Golgi apparatus", "Microtubules", "Mitochondria", "Nucleoplasm",
"Plasma membrane", "Vesicles"), class = "factor"), Approved = structure(c(3L,
1L, 5L, 12L, 1L, 1L, 6L, 4L, 1L, 1L, 17L, 1L, 8L, 1L, 1L,
1L, 1L, 7L, 13L, 1L, 16L, 1L, 15L, 1L, 1L, 1L, 14L, 1L, 1L,
15L, 17L, 18L, 11L, 1L, 17L, 1L, 1L, 1L, 1L, 1L, 13L, 2L,
13L, 15L, 13L, 9L, 17L, 10L, 5L), .Label = c("", "Cell Junctions",
"Centrosome;Cytosol;Nuclear membrane", "Centrosome;Cytosol;Vesicles",
"Cytosol", "Cytosol;Nuclear membrane", "Cytosol;Nucleoli",
"Cytosol;Nucleoli;Plasma membrane", "Cytosol;Nucleoplasm;Plasma membrane",
"Cytosol;Nucleus", "Endosomes", "Lipid droplets", "Mitochondria",
"Nucleoli fibrillar center", "Nucleoplasm", "Nucleoplasm;Vesicles",
"Nucleus", "Vesicles"), class = "factor"), Uncertain = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("", "Cytosol;Plasma membrane", "Nucleoli"
), class = "factor")), .Names = c("Gene_symbol", "Validated",
"Supported", "Approved", "Uncertain"), class = "data.frame", row.names = c(NA,
-49L))
So the ideal output would look like this figure or, if you prefer, dput():
structure(list(Gene_symbol = structure(1:29, .Label = c("AAAS",
"AAMP", "AAR2", "AARD", "AARS", "AARS2", "AARSD1", "ABCA13",
"ABCB6", "ABCB7", "ABCB8", "ABCC1", "ABCC4", "ABCD3", "ABCE1",
"ABCF1", "ABCF2", "ABCF3", "ABHD10", "ABHD14B", "ABHD6", "ABI1",
"ABI2", "ABL2", "ACAA1", "ACAA2", "ACACA", "ACAD9", "ACADM"), class = "factor"),
Validated = c(NA, 1L, NA, NA, 1L, 1L, NA, NA, 1L, NA, NA,
NA, NA, 1L, 1L, 1L, NA, NA, NA, NA, NA, NA, NA, NA, 1L, NA,
NA, 1L, 1L), Supported = c(NA, 1L, NA, NA, NA, NA, NA, NA,
4L, 1L, 1L, 1L, NA, NA, NA, NA, 1L, NA, NA, 3L, NA, 2L, NA,
1L, NA, 1L, 2L, NA, NA), Approved = c(3L, NA, 1L, 1L, NA,
NA, 2L, 3L, NA, NA, 1L, NA, 3L, NA, NA, NA, NA, 2L, 1L, NA,
2L, NA, 1L, NA, NA, NA, 1L, NA, NA), Uncertain = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), .Names = c("Gene_symbol",
"Validated", "Supported", "Approved", "Uncertain"), class = "data.frame", row.names = c(NA,
-29L))
For the most part, each column holds a string separated by ";"; however, in some cases there are terms like "Nucleoli fibrillar center" or "Lipid droplets" which contain spaces and should be counted as one word/term.
I've found examples of counting the number of words in a string in R where:
# Count the comma-separated terms in a single character string.
d <- "foo,bar,fun"
length(strsplit(d, ",", fixed = TRUE)[[1]])
class(d)
But this only works on the "character" class and not "data.frame".
Can anyone suggest how to do this in R?
Many thanks!
We can use str_count. Loop over the columns except the first one (lapply(df[-1], ...)), get the count of ";", add 1 to it, then check for cases where there is an empty string and replace those elements with NA.
library(stringr)
# For every column except the first, the number of terms is the number of ";"
# separators plus one; cells holding an empty string are reported as NA.
df[-1] <- lapply(df[-1], function(cell) {
  n_terms <- str_count(cell, ";") + 1
  n_terms[which(as.character(cell) == "")] <- NA
  n_terms
})
A solution using base:
# Base R: count the ";"-separated terms in every column after the first.
# Working column-wise keeps the counts as integers; a row-wise apply() would
# coerce each row to character and return the counts as text. as.character()
# guards against factor columns, and an empty cell splits into zero pieces, so
# it is counted as 0.
result_df <- data.frame(
  as.character(df[[1]]),
  lapply(df[-1], function(column) {
    lengths(strsplit(as.character(column), ";", fixed = TRUE))
  }),
  stringsAsFactors = FALSE
)
names(result_df) <- c("Gene_symbol", "Validated", "Supported", "Approved", "Uncertain")

Retain only unique/distinct columns for each row of an input table

I have a very large data frame (nrow=~273,000) which I've subset as an example below: Each row is a protein name(s) and has various numbers of columns that lists the subcellular structures in which they can be found in human cells. 1) I would like to remove duplicate entries for each row and am struggling with this (code below). 2) I would then like to be able to count how many columns (subcellular structures) each gene can be found in.
Background: I got this data from Uniprot and cleaned it up as best as I could using regex but there are still some cases where there are rows with duplicate entries (e.g. FMR1 lists Chromosome 2x, Cytoplasm 3x and Plasma Membrane 2x - furthermore there are some blank columns in between them)
dput(df1)
structure(list(FMR1 = structure(c(41L, 3L, 17L, 63L, 16L, 24L,
35L, 33L, 52L, 6L, 49L, 5L, 71L, 72L, 42L, 58L, 22L, 20L, 19L,
80L, 9L, 51L, 66L, 64L, 23L, 14L, 60L, 45L, 28L, 54L, 7L, 30L,
29L, 44L, 53L, 8L, 69L, 79L, 10L, 11L, 26L, 37L, 39L, 40L, 82L,
73L, 18L, 21L, 27L, 47L, 4L, 46L, 1L, 13L, 36L, 70L, 74L, 67L,
78L, 77L, 61L, 62L, 31L, 56L, 34L, 57L, 25L, 81L, 75L, 59L, 2L,
65L, 55L, 38L, 50L, 68L, 32L, 12L, 43L, 15L, 48L, 76L), .Label = c("AAMP",
"ADCY10 SAC", "AIMP1 EMAP2 SCYE1", "ANTXR2 CMG2", "APBB1 FE65 RIR",
"APC DP2", "APLP1", "ARHGAP26 GRAF KIAA0621 OPHN1L", "ARL4A ARL4",
"ATP6V0D1 ATP6D VPATPD", "ATP6V1D ATP6M VATD", "AZIN2 ADC KIAA1945 ODCP",
"CACNB2 CACNLB2 MYSB", "CAMK2D CAMKD", "CDCA8 PESCRG3", "CDK1 CDC2 CDC28A CDKN1 P34CDC2",
"CEMIP KIAA1199", "CIB1 CIB KIP PRKDCIP", "CLTA", "CLTB", "CMTM8 CKLFSF8",
"DMD", "DSP", "ECT2", "EHD2 PAST2", "ENTPD2 CD39L1", "ERBB2 HER2 MLN19 NEU NGL",
"EVPL", "FCHO1 KIAA0290", "FCHO2", "FGR SRC2", "GPER1 CEPR CMKRL2 DRY12 GPER GPR30",
"HDAC6 KIAA0901 JM21", "ITCH", "ITGB1BP1 ICAP1", "KCTD7", "KIFC3",
"MFN1", "MISP C19orf21", "MYOT TTID", "NGDN C14orf120", "NISCH IRAS KIAA0975",
"NR1D1 EAR1 HREV THRAL", "PGM5 PGMRP", "PKP4", "PLA2G6 PLPLA9",
"PNKD KIAA1184 MR1 TAHCCP2 FKSG19 UNQ2491/PRO5778", "POP7 RPP20",
"PPL KIAA0568", "PRDX3 AOP1", "PTOV1 ACID2 PP642 UNQ6127/PRO20092",
"PTPN23 KIAA1471", "PTPRE", "PTPRR ECPTP PTPRQ", "RAB13 GIG4",
"RAB23 HSPC137", "RAB29 RAB7L1", "RAB30", "RAB38", "RAB40AL RLGP",
"RAB8A MEL RAB8", "RAB9A RAB9", "RACGAP1 KIAA1478 MGCRACGAP",
"RAP1B OK/SW-cl", "RGS8", "RPSA LAMBR LAMR1", "SGIP1", "SHMT2",
"SHROOM3 KIAA1481 SHRML MSTP013", "SLC28A3 CNT3", "SNTA1 SNT1",
"SNTB1 SNT2B1", "SNX11", "SNX12", "STOM BND7 EPB72", "TEX10 L18 Nbla10363",
"TNFRSF8 CD30 D1S166E", "TNS4 CTEN PP14434", "TRIM72 MG53", "USP6 HRP1 TRE2",
"VCL", "YES1 YES"), class = "factor"), Nucleus = structure(c(3L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L), .Label = c("Mitochondrion ", "Nucleus", "Nucleus ", "Plasma membrane",
"Plasma membrane "), class = "factor"), Chromosome = structure(c(1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L), .Label = c("Chromosome", "Cytoplasm", "Cytoplasm "), class = "factor"),
Chromosome.1 = structure(c(4L, 5L, 7L, 5L, 14L, 12L, 20L,
18L, 5L, 20L, 20L, 2L, 1L, 1L, 8L, 10L, 19L, 1L, 1L, 8L,
16L, 16L, 17L, 19L, 20L, 21L, 15L, 13L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 9L, 10L,
16L, 16L, 16L, 22L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 11L,
7L, 14L, 9L, 17L, 11L, 9L, 2L, 6L, 6L, 17L, 18L, 10L, 1L,
1L, 17L, 19L, 19L, 1L, 3L, 5L, 1L), .Label = c("", " ", "Chromosome",
"Cytoplasm ", "Cytoplasmic vesicle", "Cytoplasmic vesicle ",
"Endoplasmic reticulum", "Endosome", "Endosome ", "Golgi apparatus",
"Golgi apparatus ", "Midbody", "Midbody ", "Mitochondrion",
"Mitochondrion ", "Nucleus", "Nucleus ", "Perikaryon ",
"Plasma membrane", "Plasma membrane ", "Sarcoplasmic reticulum ",
"Secreted"), class = "factor"), Cytoplasm = structure(c(1L,
15L, 13L, 10L, 1L, 13L, 1L, 1L, 5L, 2L, 11L, 1L, 1L, 1L,
5L, 8L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 14L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 5L, 9L, 2L, 3L, 6L, 7L, 2L, 1L, 2L, 4L, 11L, 12L,
5L, 1L, 1L, 1L, 7L, 3L, 1L, 2L, 2L, 2L), .Label = c("", " ",
"Cytoplasmic vesicle", "Endoplasmic reticulum", "Endosome",
"Endosome ", "Golgi apparatus", "Golgi apparatus ", "Golgi appartus",
"Midbody", "Mitochondrion ", "Nucleus ", "Plasma membrane",
"Plasma membrane ", "Secreted "), class = "factor"), Cytoplasm.1 = structure(c(1L,
4L, 7L, 7L, 1L, 1L, 1L, 1L, 5L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
6L, 3L, 2L, 1L, 1L, 1L), .Label = c("", " ", "Endoplasmic reticulum",
"Endoplasmic reticulum ", "Endosome", "Mitochondrion", "Plasma membrane"
), class = "factor"), Cytoplasmic.vesicle = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 1L, 1L, 1L, 1L), .Label = c("", "Golgi apparatus"
), class = "factor"), Perikaryon = structure(c(2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L,
1L, 1L, 1L, 1L), .Label = c("", " ", "Golgi apparatus"), class = "factor"),
X = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L), .Label = c("",
"Cytoplasmic granule"), class = "factor"), X.1 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 1L, 1L, 1L, 1L), .Label = c("", "Perikaryon"), class = "factor"),
X.2 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA), X.3 = c(NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA), Plasma.membrane = c(NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA), Plasma.membrane.1 = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
)), .Names = c("FMR1", "Nucleus", "Chromosome", "Chromosome.1",
"Cytoplasm", "Cytoplasm.1", "Cytoplasmic.vesicle", "Perikaryon",
"X", "X.1", "X.2", "X.3", "Plasma.membrane", "Plasma.membrane.1"
), class = "data.frame", row.names = c(NA, -82L))
I've tried getting only unique columns for each row with no luck, for example:
unique(df1) # Original data with repeats removed
dplyr::distinct(df1) # Retain only unique/distinct rows from an input tb
I think the problem is that the above functions are looking for row names which are identical which is not what I want. I want distinct columns for each row. I was thinking of using the melt function but since there is an odd number of columns for each row this won't work.
I would expect the output to look like this newDF
structure(list(FMR1 = structure(c(7L, 1L, 3L, 9L, 2L, 4L, 6L,
5L, 8L), .Label = c("AIMP1 EMAP2 SCYE1", "CDK1 CDC2 CDC28A CDKN1 P34CDC2",
"CEMIP KIAA1199", "ECT2", "HDAC6 KIAA0901 JM21", "ITGB1BP1 ICAP1",
"NGDN C14orf120", "PTPN23 KIAA1471", "RACGAP1 KIAA1478 MGCRACGAP"
), class = "factor"), Nucleus = structure(c(2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = c("Nucleus", "Nucleus "), class = "factor"),
Chromosome = structure(c(1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L), .Label = c("Chromosome", "Cytoplasm"), class = "factor"),
Cytoplasmic.vesicle = structure(c(1L, 8L, 2L, 4L, 5L, 4L,
7L, 6L, 3L), .Label = c("Cytoplasm ", "Endoplasmic reticulum",
"Endosome", "Midbody", "Mitochondrion", "Perikaryon ", "Plasma membrane ",
"Secreted "), class = "factor"), Perikaryon = structure(c(1L,
2L, 3L, 3L, 1L, 3L, 1L, 1L, 1L), .Label = c("", "Endoplasmic reticulum ",
"Plasma membrane"), class = "factor"), Plasma.membrane = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA)), .Names = c("FMR1", "Nucleus",
"Chromosome", "Cytoplasmic.vesicle", "Perikaryon", "Plasma.membrane"
), class = "data.frame", row.names = c(NA, -9L))
From here I would like to get a rowSums(df1) so I was thinking of coercing each term to a number (e.g. Cytoplasmic vesicle=1, Nucleus=1, Endoplasmic reticiulum=1, etc.) but run into a problem on this dummy-dataset.
df2 <- as.numeric(newDF)
Error: (list) object cannot be coerced to type 'double'
df2 <- as.numeric(newDF[,2:n])
Error in 2:n : NA/NaN argument
Thank you for your help.
EDIT
I would like to get a count for each row of how many unique columns in the newDF like so:
FMR1 5
NGDN C14orf120 3
AIMP1 EMAP2 SCYE1 4
CEMIP KIAA1199 4
RACGAP1 KIAA1478 MGCRACGAP 4
CDK1 CDC2 CDC28A CDKN1 P34CDC2 3
ECT2 4
ITGB1BP1 ICAP1 3
HDAC6 KIAA0901 JM21 3
PTPN23 KIAA1471 3
This may be one way to go. Since your expected result is a character vector, I cannot visualize the final output. Yet, you said you want to check how many columns each protein appears in in the data. I hope the outcome I have is what you are after.
First, I converted all columns to character. Then, I converted the data to long format one using gather(). For each subcellular structure group (i.e., subcellular), I added row indices (e.g., 1 means the 1st row in your original data), and trim white space. Then, remove any rows with NA in protein. Remove any rows with "" and " ". Now tidying up is done. For each row (i.e., row.index), remove duplicated protein types.
Ungroup the data, and finally count how many columns each protein appears in (i.e., subcellular structure). Basically, by this point you want to count how many times each protein appears in the data set.
With your sample data, I got the following result. But I am not sure if this is what you want. (I am off to bed now. So I cannot help you for some hours. If anybody can jump in, please do so.)
# Count, for every distinct cell value, the number of rows it occurs in.
mydf %>%
  mutate_all(as.character) %>%
  # Long format: one record per cell, tagged with the column it came from.
  gather(key = subcellular, value = protein) %>%
  group_by(subcellular) %>%
  # gather() stacks each column top to bottom, so the position within a column
  # is the row number in the original data.
  mutate(
    row.index = row_number(),
    protein = trimws(protein)
  ) %>%
  # After trimws() a blank cell is "", so one test covers "" and " ".
  filter(!is.na(protein), protein != "") %>%
  # Within each original row keep only the first occurrence of a value.
  group_by(row.index) %>%
  filter(!duplicated(protein)) %>%
  ungroup() %>%
  count(protein, sort = TRUE)
# protein n
# <chr> <int>
# 1 Cytoplasm 82
# 2 Plasma membrane 70
# 3 Nucleus 25
# 4 Endosome 9
# 5 Mitochondrion 9
# 6 Cytoplasmic vesicle 8
# 7 Golgi apparatus 7
# 8 Endoplasmic reticulum 5
# 9 Midbody 3
#10 Perikaryon 3
# ... with 87 more rows
Given jjl's comment, I did the following. Instead of counting how many columns each protein appears in, I counted how many protein names exist for each row.
# Count, for every original row, how many distinct non-empty values it holds.
mydf %>%
  mutate_all(as.character) %>%
  # Long format: one record per cell, tagged with the column it came from.
  gather(key = subcellular, value = protein) %>%
  group_by(subcellular) %>%
  # gather() stacks each column top to bottom, so the position within a column
  # is the row number in the original data.
  mutate(
    row.index = row_number(),
    protein = trimws(protein)
  ) %>%
  # After trimws() a blank cell is "", so one test covers "" and " ".
  filter(!is.na(protein), protein != "") %>%
  # Within each original row keep only the first occurrence of a value.
  group_by(row.index) %>%
  filter(!duplicated(protein)) %>%
  ungroup() %>%
  count(row.index)
# row.index n
# <int> <int>
# 1 1 4
# 2 2 6
# 3 3 5
# 4 4 6
# 5 5 4
# 6 6 5
# 7 7 4
# 8 8 4
# 9 9 5
#10 10 3
# ... with 72 more rows
EDIT
If you want to remove the 1st column (i.e, FMR1), you can do that by filtering that column. I added filter(subcellular != "FMR1") to my code before I used count() at the end.
# Count, for every original row, how many distinct non-empty values it holds,
# ignoring the first column (FMR1).
mydf %>%
  mutate_all(as.character) %>%
  # Long format: one record per cell, tagged with the column it came from.
  gather(key = subcellular, value = protein) %>%
  group_by(subcellular) %>%
  # gather() stacks each column top to bottom, so the position within a column
  # is the row number in the original data.
  mutate(
    row.index = row_number(),
    protein = trimws(protein)
  ) %>%
  # After trimws() a blank cell is "", so one test covers "" and " ".
  filter(!is.na(protein), protein != "") %>%
  # Within each original row keep only the first occurrence of a value.
  group_by(row.index) %>%
  filter(!duplicated(protein)) %>%
  ungroup() %>%
  # Drop the records that came from the FMR1 column before counting.
  filter(subcellular != "FMR1") %>%
  count(row.index)
# A tibble: 9 x 2
# row.index n
# <int> <int>
#1 1 3
#2 2 4
#3 3 4
#4 4 4
#5 5 3
#6 6 4
#7 7 3
#8 8 3
#9 9 3

Using geom_text & facet_wrap: Error in unit(x, default.units) : 'x' and 'units' must have length > 0

I am experiencing this weird error..
Some test data:
library(ggplot2)
library(dplyr)
test <- structure(list(group = structure(c(1L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 3L, 3L, 3L, 3L, 3L), .Label = c("G", "N", "P"), class = "factor"),
var = structure(c(1L, 1L, 2L, 3L, 4L, 4L, 4L, 4L, 2L, 2L,
2L, 3L, 2L, 2L, 2L, 5L, 5L, 5L, 4L, 5L, 5L, 5L, 5L, 1L, 1L,
2L, 1L, 1L, 1L, 3L, 3L, 3L, 5L, 3L, 3L, 3L, 4L, 4L, 4L, 1L,
1L, 2L, 2L, 2L, 3L), .Label = c("a", "b", "c", "d", "e"), class = "factor"),
group2 = structure(c(3L, 1L, 3L, 3L, 1L, 2L, 3L, 4L, 1L,
2L, 3L, 4L, 1L, 2L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 2L,
3L, 4L, 1L, 2L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 4L, 1L, 2L, 3L,
4L, 1L, 1L, 2L, 3L, 4L), .Label = c("O", "P", "Q", "R"), class = "factor"),
cor = c(0.270075198428616, 0.262097140096646, -0.331312784846655,
-0.343984945812309, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA)), row.names = c(NA, -45L), .Names = c("group", "var",
"group2", "cor"), class = "data.frame")
I'd like to make this plot:
test %>%
ggplot(., aes(x=group2, y=cor)) +
geom_bar(stat="identity", position="dodge", aes(fill=var, group=var)) +
geom_text(aes(label = round(cor,2),
vjust = ifelse(cor >= 0, 0, 1),
group=var),
position = position_dodge(width=1)) +
theme_bw(base_size=18) +
facet_wrap(~group, scales="free_x")
.. which results in: Error in unit(x, default.units) : 'x' and 'units' must have length > 0
However, without geom_text it works:
And also only using rows 1:40 works:
test %>% slice(1:40) %>%
ggplot(., aes(x=group2, y=cor)) +
geom_bar(stat="identity", position="dodge", aes(fill=var, group=var)) +
geom_text(aes(label = round(cor,2),
vjust = ifelse(cor >= 0, 0, 1),
group=var),
position = position_dodge(width=1)) +
theme_bw(base_size=18) +
facet_wrap(~group, scales="free_x")
So up to row 40 there are only two levels for group, and from row 41 on there is a third level. But how can that cause this error? Or is there something else I don't see?
The error also disappears if you erase facet_wrap or if not all of your P-group is NA (for example change the last entry for cor in 0.2). So apparently, if a facet is empty (contains only NA's), geom_text can't handle that.
If you really want to include the empty facet, here's a workaround:
1. replace at least 1 (or all?) NA's in the empty facets with a 0. The bar will have 0 length, so no problem there
2. Now you have a zero label in your graph. By using alpha=ifelse(cor == 0, 0, 1) you'll make that label fully transparent.
Code:
library(ggplot2)
library(dplyr)
# Example data: 45 rows with the factor columns group, var and group2 and the
# numeric column cor. The last cor value (a row of group "P", whose other cor
# values are all NA) is 0 instead of NA, so that facet is not empty.
test <- structure(list(group = structure(c(1L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 3L, 3L, 3L, 3L, 3L), .Label = c("G", "N", "P"), class = "factor"),
var = structure(c(1L, 1L, 2L, 3L, 4L, 4L, 4L, 4L, 2L, 2L,
2L, 3L, 2L, 2L, 2L, 5L, 5L, 5L, 4L, 5L, 5L, 5L, 5L, 1L, 1L,
2L, 1L, 1L, 1L, 3L, 3L, 3L, 5L, 3L, 3L, 3L, 4L, 4L, 4L, 1L,
1L, 2L, 2L, 2L, 3L), .Label = c("a", "b", "c", "d", "e"), class = "factor"),
group2 = structure(c(3L, 1L, 3L, 3L, 1L, 2L, 3L, 4L, 1L,
2L, 3L, 4L, 1L, 2L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 2L,
3L, 4L, 1L, 2L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 4L, 1L, 2L, 3L,
4L, 1L, 1L, 2L, 3L, 4L), .Label = c("O", "P", "Q", "R"), class = "factor"),
cor = c(0.270075198428616, 0.262097140096646, -0.331312784846655,
-0.343984945812309, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
0)), row.names = c(NA, -45L), .Names = c("group", "var",
"group2", "cor"), class = "data.frame")
# Faceted, dodged bar chart of `cor` per `group2` (filled by `var`, one facet
# per `group`) with a rounded value label on every bar. The placeholder 0 that
# keeps the otherwise empty facet alive gets an alpha of 0 so its label is
# hidden; all real values get an alpha of 1.
test %>%
  ggplot(aes(x = group2, y = cor)) +
  geom_bar(
    aes(fill = var, group = var),
    stat = "identity",
    position = "dodge"
  ) +
  geom_text(
    aes(
      label = round(cor, 2),
      vjust = ifelse(cor >= 0, 0, 1),
      group = var,
      # 0 = invisible (placeholder value), 1 = fully opaque (real values)
      alpha = ifelse(cor == 0, 0, 1)
    ),
    position = position_dodge(width = 1)
  ) +
  # Take the 0/1 alpha values literally. The default alpha scale would rescale
  # them into its 0.1-1 output range and leave the placeholder label faintly
  # visible; the identity scale also draws no legend for alpha.
  scale_alpha_identity() +
  theme_bw(base_size = 18) +
  facet_wrap(~group, scales = "free_x")
If you want to remove the legend created by geom_text, simply add show.legend = FALSE to geom_text() (in ggplot2 versions before 2.0.0 this argument was called show_guide).
EDIT: If the transparency doesn't work right, @beetroot came up with this nice alternative: replace alpha=ifelse(cor == 0, 0, 1) with size=ifelse(cor == 0, NA, 3).

Merge / match two variables with one group of variables from another dataframe

I have two data.frames, df.1 and df.2, that I would like to merge (or otherwise select data from) to create a new data.frame. df.1 contains information about each individual (ID), sampling event (Event), Site and sample number (Sample). The tricky part for me is that the Site and the corresponding Sample differ between ID-Event pairings. For example, F3-3 has Site "Plum" for Sample "1" and M6-3 has Site "Pear" for Sample "1".
df.2 has Sample1 and Sample2, which correspond to the Sample information in df.1 by way of the ID-Event pairing.
I'd like to match/merge the information between these two data.frames: essentially, get the "word" from Site in df.1 that matches each Sample number. An example of the desired result (df.3) is below.
Within each ID-Event pairing, each Site has only one corresponding Sample (e.g. "Apple" will correspond to "1", not to both "1" and "4"). I know I could use merge if I were only matching, for example, Sample1 or Sample2, but I am not sure how to do this with both to populate Site1 and Site2 with the correctly matched word.
# df.1: 37 rows, one per sample, with the individual (ID), its Sex, the
# sampling Event, the Site and the Sample number.
df.1 <- structure(list(ID = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("F1",
"F3", "M6"), class = "factor"), Sex = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L), .Label = c("F", "M"), class = "factor"), Event = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L,
4L, 4L, 4L, 4L), Site = structure(c(1L, 3L, 9L, 7L, 8L, 10L,
2L, 6L, 4L, 5L, 1L, 9L, 7L, 8L, 10L, 5L, 10L, 2L, 6L, 4L, 5L,
1L, 9L, 2L, 6L, 4L, 5L, 1L, 8L, 3L, 10L, 4L, 2L, 6L, 4L, 5L,
1L), .Label = c("Apple", "Banana", "Grape", "Guava", "Kiwi",
"Mango", "Orange", "Peach", "Pear", "Plum"), class = "factor"),
Sample = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L,
3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L)), .Names = c("ID",
"Sex", "Event", "Site", "Sample"), class = "data.frame", row.names = c(NA,
-37L))
#
# df.2: 12 rows, each a pair of sample numbers (Sample1, Sample2) with the
# columns V1 and V2 and the ID, Sex and Event the pair belongs to.
df.2 <- structure(list(Sample1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L), Sample2 = c(2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
3L, 4L, 5L), V1 = c(0.12, 0.497, 0.715, 0, 0.001, 0, 0.829, 0,
0, 0.001, 0, 0.829), V2 = c(0.107, 0.273, 0.595, 0, 0.004, 0,
0.547, 0.001, 0.001, 0.107, 0.273, 0.595), ID = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L), .Label = c("F1",
"M6"), class = "factor"), Sex = structure(c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L), .Label = c("F", "M"), class = "factor"),
Event = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L)), .Names = c("Sample1",
"Sample2", "V1", "V2", "ID", "Sex", "Event"), class = "data.frame", row.names = c(NA,
-12L))
#
# df.3: the desired result - the columns of df.2 plus Site1 and Site2, the
# Site names from df.1 that match Sample1 and Sample2.
df.3 <- structure(list(Sample1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L), Sample2 = c(2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
3L, 4L, 5L), V1 = c(0.12, 0.497, 0.715, 0, 0.001, 0, 0.829, 0,
0, 0.001, 0, 0.829), V2 = c(0.107, 0.273, 0.595, 0, 0.004, 0,
0.547, 0.001, 0.001, 0.107, 0.273, 0.595), Site1 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L), .Label = c("Apple",
"Banana"), class = "factor"), Site2 = structure(c(2L, 8L, 6L,
7L, 9L, 1L, 5L, 3L, 4L, 5L, 3L, 4L), .Label = c("Banana", "Grape",
"Guava", "Kiwi", "Mango", "Orange", "Peach", "Pear", "Plum"), class = "factor"),
ID = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L), .Label = c("F1", "M6"), class = "factor"), Sex = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L), .Label = c("F",
"M"), class = "factor"), Event = c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 3L, 3L, 3L)), .Names = c("Sample1", "Sample2",
"V1", "V2", "Site1", "Site2", "ID", "Sex", "Event"), class = "data.frame", row.names = c(NA, -12L))
Two merges should do it:
# Look up the Site for Sample1 and then for Sample2 with two left joins.
#
# Sample numbers are only unique within an ID-Event pairing (e.g. F3 and M6
# both have an Event 3 with a Sample 1, at different Sites), so ID has to be
# part of the join key. Joining on Sample and Event alone would match several
# Sites and duplicate rows as soon as two individuals share an Event/Sample
# combination. Columns are selected by name rather than by position so the
# lookup does not depend on the column order of df.1.
site_lookup <- unique(df.1[, c("ID", "Event", "Sample", "Site")])

# Site.x (after the second merge): Site belonging to Sample1.
first <- merge(
  df.2, site_lookup,
  by.x = c("Sample1", "Event", "ID"),
  by.y = c("Sample", "Event", "ID"),
  all.x = TRUE
)

# Site.y: Site belonging to Sample2.
second <- merge(
  first, site_lookup,
  by.x = c("Sample2", "Event", "ID"),
  by.y = c("Sample", "Event", "ID"),
  all.x = TRUE
)

# merge() puts the key columns first; restore the column layout that the
# two-key version of this join produced.
second <- second[, c(
  "Sample2", "Event", "Sample1", "V1", "V2", "ID", "Sex", "Site.x", "Site.y"
)]
print(second)
Sample2 Event Sample1 V1 V2 ID Sex Site.x Site.y
1 10 1 1 0.000 0.001 F1 F Apple Kiwi
2 2 1 1 0.120 0.107 F1 F Apple Grape
3 3 1 1 0.497 0.273 F1 F Apple Pear
4 3 3 2 0.001 0.107 M6 M Banana Mango
5 4 1 1 0.715 0.595 F1 F Apple Orange
6 4 3 2 0.000 0.273 M6 M Banana Guava
7 5 1 1 0.000 0.000 F1 F Apple Peach
8 5 3 2 0.829 0.595 M6 M Banana Kiwi
9 6 1 1 0.001 0.004 F1 F Apple Plum
10 7 1 1 0.000 0.000 F1 F Apple Banana
11 8 1 1 0.829 0.547 F1 F Apple Mango
12 9 1 1 0.000 0.001 F1 F Apple Guava

Resources