Related
I have a folder which serves as a container for a standardized report from a system. This report is run on a daily basis. However, the report may require re-run for a certain date or range of dates depending on user preferences and asks. Thus file content may change significantly.
I would like to create a script that would group the unique dates together in one dataframe based on the latest run time, and another dataframe for the dates that are being revised.
Here is a simplified version of the table:
structure(list(Source = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
2L, 3L, 3L, 3L, 3L), Date = structure(c(1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L), .Label = c("11-Feb-20", "12-Feb-20"
), class = "factor"), FarmType = structure(c(3L, 4L, 5L, 1L,
2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L), .Label = c("AJSKJA",
"ASKJKA", "GHDGH", "KLKIUK", "KLSAKJ"), class = "factor"), FarmName = structure(c(1L,
2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L), .Label = c("",
"JJHGH", "JKJKK", "JUISO", "SDLLS"), class = "factor"), Perform = c(13.04144378,
1.230474165, 1.230474165, 13.9407486, 13.9407486, 13.04144378,
1.230474165, 1.230474165, 13.9407486, 13.9407486, 13.04144378,
15.26566, 1.230474165, 13.9407486), RunDate = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), .Label = c("02/14/2020",
"02/15/2020"), class = "factor")), class = "data.frame", row.names = c(NA,
-14L))
Please note that the number of columns does not change, however, after each re-run the number of rows may increase/decrease.
The idea is -- the first group of data that is based on the most recent run would represent the up-to-date information (corrections, revisions, etc.), while the second group essentially looks at what is being revised and how the numbers and data are changing.
Expected output for the first group:
structure(list(Source = c(3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L),
Date = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("11-Feb-20",
"12-Feb-20"), class = "factor"), FarmType = structure(c(3L,
4L, 5L, 1L, 3L, 4L, 5L, 1L, 2L), .Label = c("AJSKJA", "ASKJKA",
"GHDGH", "KLKIUK", "KLSAKJ"), class = "factor"), FarmName = structure(c(1L,
2L, 3L, 4L, 1L, 2L, 3L, 4L, 5L), .Label = c("", "JJHGH",
"JKJKK", "JUISO", "SDLLS"), class = "factor"), Perform = c(13.04144378,
15.26566, 1.230474165, 13.9407486, 13.04144378, 1.230474165,
1.230474165, 13.9407486, 13.9407486), RunDate = structure(c(2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L), .Label = c("02/14/2020",
"02/15/2020"), class = "factor")), class = "data.frame", row.names = c(NA,
-9L))
Expected output for the second group:
structure(list(Source = c(1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L),
Date = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "11-Feb-20", class = "factor"),
FarmType = structure(c(3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L
), .Label = c("AJSKJA", "ASKJKA", "GHDGH", "KLKIUK", "KLSAKJ"
), class = "factor"), FarmName = structure(c(1L, 2L, 3L,
4L, 5L, 1L, 2L, 3L, 4L), .Label = c("", "JJHGH", "JKJKK",
"JUISO", "SDLLS"), class = "factor"), Perform = c(13.04144378,
1.230474165, 1.230474165, 13.9407486, 13.9407486, 13.04144378,
15.26566, 1.230474165, 13.9407486), RunDate = structure(c(1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), .Label = c("02/14/2020",
"02/15/2020"), class = "factor")), class = "data.frame", row.names = c(NA,
-9L))
Thank you for your time. Please let me know if you have questions.
We could group by 'Date' and filter those groups where the 'RunDate' is the latest after converting to Date class
library(lubridate)
library(dplyr)
# Latest-run view: for every report Date keep only the rows that came from
# its most recent run. RunDate holds "mm/dd/yyyy" text, so it is parsed with
# mdy() before comparing; comparing the raw strings would order them
# lexically, not chronologically.
new1 <- df1 %>%
  group_by(Date) %>%
  filter(mdy(RunDate) == max(mdy(RunDate))) %>%
  # Drop the grouping so later verbs on new1 do not silently run per Date.
  ungroup()
and for the second set, we can check if the number of distinct elements of 'RunDate' is more than 1
# Revision view: keep every row of each report Date that was produced by
# more than one distinct run, i.e. the dates that have been re-run.
new2 <- df1 %>%
  group_by(Date) %>%
  filter(n_distinct(RunDate) > 1) %>%
  # Drop the grouping so later verbs on new2 do not silently run per Date.
  ungroup()
I have a data frame of 2511 rows and 6 columns with candy and color items. Please see the first 15 rows as below:
structure(list(x = 1:15, iteml = structure(c(2L, 1L, 1L, 1L,
5L, 4L, 4L, 3L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), .Label = c("{dulce1_rojo",
"{dulce2_verde", "{dulce7_plata", "{miel21_amarillo", "{miel30_azul"
), class = "factor"), item2 = structure(c(4L, 2L, 2L, 2L, 1L,
5L, 5L, 4L, 3L, 3L, 4L, 1L, 4L, 4L, 1L), .Label = c("chocolate2l_amarillo",
"dulce2_verde", "dulce7_plata", "miel21_amarillo", "miel30_azul"
), class = "factor"), item3 = structure(c(1L, 1L, 3L, 3L, 2L,
2L, 1L, 2L, 2L, 3L, 2L, 2L, 2L, 1L, 2L), .Label = c("chocolate2l_amarillo",
"chocolate30_azul", "miel21_amarillo"), class = "factor"), item4 = structure(c(2L,
2L, 2L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("chocolate2l_amarillo",
"chocolate32_violeta", "cookie30_azul"), class = "factor"), item5 = structure(c(2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("cookie2l_amarillo}",
"cookie32_violeta}"), class = "factor"), item6 = structure(c(4L,
6L, 1L, 3L, 6L, 1L, 2L, 4L, 6L, 2L, 5L, 6L, 1L, 2L, 4L), .Label = c(">{chocolate2l_amarillo}",
">{chocolate30_azul}", ">{chocolate32_violeta}", ">{dulce1_rojo}",
">{dulce7_plata}", ">{miel21_amarillo}"), class = "factor")), class = "data.frame", row.names = c(NA,
-15L))
I don't know how to add new columns that count, for each row, how many items of each kind of candy it contains. The first line of the expected output of the resulting data frame is:
x iteml item2 item3 item4 item5 item6 dulce miel chocolate cookie
1 1 {dulce2_verde miel21_amarillo chocolate2l_amarillo chocolate32_violeta cookie32_violeta} >{dulce1_rojo} 2 1 2 1
I'm stuck and I'd appreciate a little help.
You can apply grepl to each row of the initial data frame to count the matching cells, iterate over the four ingredients you indicated, and then use cbind to concatenate the initial data frame and the table of ingredient counts into one. Please see the code below:
# initialize data frame
df <- structure(list(x = 1:15, iteml = structure(c(2L, 1L, 1L, 1L,
5L, 4L, 4L, 3L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), .Label = c("{dulce1_rojo",
"{dulce2_verde", "{dulce7_plata", "{miel21_amarillo", "{miel30_azul"
), class = "factor"), item2 = structure(c(4L, 2L, 2L, 2L, 1L,
5L, 5L, 4L, 3L, 3L, 4L, 1L, 4L, 4L, 1L), .Label = c("chocolate2l_amarillo",
"dulce2_verde", "dulce7_plata", "miel21_amarillo", "miel30_azul"
), class = "factor"), item3 = structure(c(1L, 1L, 3L, 3L, 2L,
2L, 1L, 2L, 2L, 3L, 2L, 2L, 2L, 1L, 2L), .Label = c("chocolate2l_amarillo",
"chocolate30_azul", "miel21_amarillo"), class = "factor"), item4 = structure(c(2L,
2L, 2L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("chocolate2l_amarillo",
"chocolate32_violeta", "cookie30_azul"), class = "factor"), item5 = structure(c(2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("cookie2l_amarillo}",
"cookie32_violeta}"), class = "factor"), item6 = structure(c(4L,
6L, 1L, 3L, 6L, 1L, 2L, 4L, 6L, 2L, 5L, 6L, 1L, 2L, 4L), .Label = c(">{chocolate2l_amarillo}",
">{chocolate30_azul}", ">{chocolate32_violeta}", ">{dulce1_rojo}",
">{dulce7_plata}", ">{miel21_amarillo}"), class = "factor")), class = "data.frame", row.names = c(NA,
-15L))
# counting ingredients
ingridients <- c("dulce", "miel", "chocolate", "cookie")
# Only the item* columns hold candy names; searching them column-wise avoids
# apply() coercing the whole data frame (including the numeric id column `x`)
# to a character matrix.
item_cols <- grep("^item", names(df), value = TRUE)
# For one ingredient, sum the per-column logical hit vectors to obtain the
# number of matching cells in each row. fixed = TRUE matches the ingredient
# name literally instead of interpreting it as a regular expression.
count_cells <- function(ingredient) {
  hits <- lapply(df[item_cols], function(col) grepl(ingredient, col, fixed = TRUE))
  as.integer(Reduce(`+`, hits))
}
# One column per ingredient. do.call(cbind, ...) always yields a matrix,
# even for a one-row data frame, unlike sapply() whose shape depends on input.
x <- do.call(cbind, lapply(setNames(ingridients, ingridients), count_cells))
df_res <- cbind(df, x)
head(df_res)
Output:
x iteml item2 item3 item4 item5 item6 dulce miel chocolate cookie
1 1 {dulce2_verde miel21_amarillo chocolate2l_amarillo chocolate32_violeta cookie32_violeta} >{dulce1_rojo} 2 1 2 1
2 2 {dulce1_rojo dulce2_verde chocolate2l_amarillo chocolate32_violeta cookie32_violeta} >{miel21_amarillo} 2 1 2 1
3 3 {dulce1_rojo dulce2_verde miel21_amarillo chocolate32_violeta cookie32_violeta} >{chocolate2l_amarillo} 2 1 2 1
4 4 {dulce1_rojo dulce2_verde miel21_amarillo chocolate2l_amarillo cookie32_violeta} >{chocolate32_violeta} 2 1 2 1
5 5 {miel30_azul chocolate2l_amarillo chocolate30_azul cookie30_azul cookie2l_amarillo} >{miel21_amarillo} 0 2 2 2
6 6 {miel21_amarillo miel30_azul chocolate30_azul cookie30_azul cookie2l_amarillo} >{chocolate2l_amarillo} 0 2 2 2
This question already has answers here:
Count number of rows within each group
(17 answers)
Closed 5 years ago.
I have a dataframe like this:
df <- structure(list(col1 = structure(c(1L, 1L, 2L, 3L, 1L, 3L, 1L,
3L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 4L), .Label = c("stock1",
"stock2", "stock3", "stock4"), class = "factor"), col2 = structure(c(4L,
5L, 7L, 6L, 5L, 5L, 5L, 6L, 6L, 8L, 8L, 4L, 3L, 3L, 1L, 2L, 3L
), .Label = c("comapny1", "comapny1+comapny4", "comapny4", "company1",
"company2", "company2+company1", "company3", "company4"), class = "factor"),
col3 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("predictor1", "predictor2"
), class = "factor")), .Names = c("col1", "col2", "col3"), class = "data.frame", row.names = c(NA,
-17L))
I would like to take the frequency from the three columns.
Expected output
df2 <- structure(list(col1 = structure(c(1L, 1L, 1L, 2L, 4L, 1L, 1L,
3L, 3L, 1L, 2L, 1L), .Label = c("stock1", "stock2", "stock3",
"stock4"), class = "factor"), col2 = structure(c(1L, 2L, 3L,
3L, 3L, 4L, 5L, 5L, 6L, 6L, 7L, 8L), .Label = c("comapany1",
"comapany1+comapany4", "comapany4", "company1", "company2", "company2+company1",
"company3", "company4"), class = "factor"), col3 = structure(c(2L,
2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("predictor1",
"predictor2"), class = "factor"), frequency = c(1L, 1L, 1L, 1L,
1L, 2L, 3L, 1L, 2L, 1L, 1L, 2L)), .Names = c("col1", "col2",
"col3", "frequency"), class = "data.frame", row.names = c(NA,
-12L))
How is it possible to make it?
We can use count
library(dplyr)
# Tally how many rows share each (col1, col2, col3) combination; the tally
# is returned in a column named `n`.
df %>%
  count(col1, col2, col3)
# A tibble: 12 x 4
# col1 col2 col3 n
# <fctr> <fctr> <fctr> <int>
# 1 stock1 comapny1 predictor2 1
# 2 stock1 comapny1+comapny4 predictor2 1
# 3 stock1 comapny4 predictor2 1
# 4 stock1 company1 predictor1 2
# 5 stock1 company2 predictor1 3
# 6 stock1 company2+company1 predictor1 1
# 7 stock1 company4 predictor1 2
# 8 stock2 comapny4 predictor2 1
# 9 stock2 company3 predictor1 1
#10 stock3 company2 predictor1 1
#11 stock3 company2+company1 predictor1 2
#12 stock4 comapny4 predictor2 1
Or with data.table
library(data.table)
# Count rows per (col1, col2, col3) combination; .N is data.table's per-group
# row count and is returned in a column named `N`.
# as.data.table() works on a copy, whereas setDT() would convert the caller's
# `df` into a data.table by reference as a side effect.
as.data.table(df)[, .N, by = .(col1, col2, col3)]
I have two data frames for two stacks that give information about potential emissions. One data frame gives the hours at which the system turns on and off in each of the four seasons; each season starts on a specific date. The second data frame gives the details of each stack.
I am working with some sample files to test how to do this. So far, following a Stack Overflow example, I have managed to write a function that creates a data frame with the dates I need and a column giving the season for each date. I am now really struggling with the programming concept: how do I combine the three data frames to create the output template that I am trying to set up?
To show you an example my sample input are:
Stack_info File:
example seasonal Profile that shows when the system is on or off:
and the output I am after should create data frames for each year in the following format (only the black font and the red text to just explain what the values are):
The part I find most difficult is that the output file for each year has a unique first row, then a second row that repeats for each pollutant, and, from the third row on, the hourly data for all 8760 hours. This block then needs to repeat for the next pollutant.
So far I have managed to create a function that helps me to assign season to each day of the year. For example:
# Build a "Mon-dd" -> season lookup table (lut) and use it to tag every date
# from 2010-01-01 to 2012-12-31 with its season.

# Format dates as "Mon-dd" keys (e.g. "Jan-01"). The English month.abb
# constants are used instead of "%b" so the keys always match the literals
# below, whatever the session locale is.
month_day_key <- function(x) {
  paste0(month.abb[as.integer(format(x, "%m"))], "-", format(x, "%d"))
}

# Row index in `lut` of a "Mon-dd" key. `lut` is looked up when d() is
# called, so defining d() before lut is fine.
d <- function(month_day) which(lut$month_day == month_day)

# One row per calendar day of a leap year (2012) so that Feb-29 has an entry.
# A Date sequence is used rather than POSIXct + n * 86400 seconds: in time
# zones where daylight saving is in effect on 1 January the latter lands on
# 23:00 of the previous day after the clock change, which duplicates one
# month-day key and skips another.
lut <- data.frame(
  all_dates = seq(as.Date("2012-01-01"), by = "day", length.out = 366L),
  season = NA_character_
)
lut$month_day <- month_day_key(lut$all_dates)

# Season boundaries (inclusive); winter wraps around the turn of the year.
lut[c(d("Jan-01"):d("Mar-15"), d("Nov-08"):d("Dec-31")), "season"] <- "winter"
lut[d("Mar-16"):d("Apr-30"), "season"] <- "spring"
lut[d("May-01"):d("Sep-27"), "season"] <- "summer"
lut[d("Sep-28"):d("Nov-07"), "season"] <- "autumn"
# Row names act as the lookup key for lut["Mon-dd", "season"].
rownames(lut) <- lut$month_day

## create date data frame and assign seasons
dates <- data.frame(
  dates = seq(as.Date("2010-01-01"), as.Date("2012-12-31"), by = 1)
)
dates$season <- lut[month_day_key(dates$dates), "season"]
This gives me a dates data frame and my other 2 samples data frames are (as shown in the image):
structure(list(`Source no` = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), Source = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L), .Label = c("Stack 1", "Stack 2"), class = "factor"),
Period = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), Day = structure(c(2L,
6L, 7L, 5L, 1L, 3L, 4L, 2L, 6L, 7L, 5L, 1L, 3L, 4L, 2L, 6L,
7L, 5L, 1L, 3L, 4L), .Label = c("Fri", "Mon", "Sat", "Sun",
"Thu", "Tue", "Wed"), class = "factor"), `Spring On` = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 15L,
15L, 15L, 15L, 15L, 15L, 15L), `Spring Off` = c(23L, 23L,
23L, 23L, 23L, 23L, 23L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 18L,
18L, 18L, 18L, 18L, 18L, 18L), `Summer On` = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = "off", class = "factor"), `Summer Off` = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = "off", class = "factor"), `Autumn On` = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = "off", class = "factor"), `Autumn Off` = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = "off", class = "factor"), `Winter On` = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L), .Label = c("0", "off"), class = "factor"),
`Winter Off` = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("23",
"off"), class = "factor")), .Names = c("Source no", "Source",
"Period", "Day", "Spring On", "Spring Off", "Summer On", "Summer Off",
"Autumn On", "Autumn Off", "Winter On", "Winter Off"), class = "data.frame", row.names = c(NA,
-21L)) -> profile
structure(list(SNAME = structure(1:2, .Label = c("Stack 1", "Stack 2"
), class = "factor"), ISVARY = c(1L, 4L), VELVOL = c(1L, 4L),
TEMPDENS = c(0L, 2L), `DUM 1` = c(999L, 999L), `DUM 2` = c(999L,
999L), NPOL = c(2L, 2L), `EXIT VEL` = c(26.2, 22.4), TEMP = c(341L,
328L), `STACK DIAM` = c(1.5, 2.5), W = c(0L, 15L), Nox = c(39,
33.3), Sox = c(15.5, 17.9)), .Names = c("SNAME", "ISVARY",
"VELVOL", "TEMPDENS", "DUM 1", "DUM 2", "NPOL", "EXIT VEL", "TEMP",
"STACK DIAM", "W", "Nox", "Sox"), class = "data.frame", row.names = c(NA,
-2L)) -> stack_info
If anyone could give me any guidance of how I can proceed with the programming part would be really useful as I am just not sure how I can approach this to create separate output files as data frame for year 2010, 2011 and 2012.
The way your data is organised isn't ideal for processing. You may want to have a look at Hadley Wickham's paper on tidy data.
According to your desired output you need a dataframe with the number of lines equal to the number of hours a specific machine (stack n) is switched on. Therefore I suggest you create a dataframe containing every hour of a given year:
# Build one row per hour of the simulated year, with calendar fields and the
# season of each hour.
sim_year <- 2010L

# The sequence runs up to 23:00 on 31 December so the last day is complete
# (8760 rows for a non-leap year). UTC is used so that daylight-saving
# changes cannot skip or repeat an hour-of-day value.
d.out <- data.frame(
  dates = seq(
    from = as.POSIXct(sprintf("%d-01-01 00:00", sim_year), tz = "UTC"),
    to = as.POSIXct(sprintf("%d-12-31 23:00", sim_year), tz = "UTC"),
    by = "hour"
  )
)
d.out$year <- as.numeric(format(d.out$dates, "%Y"))
d.out$month <- as.numeric(format(d.out$dates, "%m"))
d.out$day <- as.numeric(format(d.out$dates, "%d"))
d.out$hour <- as.numeric(format(d.out$dates, "%H"))
# English weekday abbreviations independent of the session locale
# (format "%a" would return localized names). POSIXlt wday is 0 = Sunday.
d.out$weekday <- c("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat")[
  as.POSIXlt(d.out$dates)$wday + 1L
]
d.out$doj <- as.numeric(format(d.out$dates, "%j"))

# Seasons are decided on calendar month/day (encoded as month * 100 + day)
# rather than day-of-year, so the boundaries stay on the same dates in leap
# years: spring 16 Mar - 30 Apr, summer 1 May - 27 Sep, autumn 28 Sep - 7 Nov,
# winter otherwise.
month_day_num <- d.out$month * 100 + d.out$day
d.out$season <- "Winter"
d.out$season[month_day_num >= 316 & month_day_num <= 430] <- "Spring"
d.out$season[month_day_num >= 501 & month_day_num <= 927] <- "Summer"
d.out$season[month_day_num >= 928 & month_day_num <= 1107] <- "Autumn"
The goal is to join this dataframe with your profile dataframe. Before joining, the profile-df has to be rearranged:
library(dplyr)
library(tidyr)
# Reshape `profile` from one column per "<Season> On"/"<Season> Off" pair into
# one row per source / period / weekday / season with numeric On and Off hours.
profile_new =
profile %>%
# Stack every column except the four id columns into key/value pairs:
# `season` holds the former column name, `hour` the cell value.
gather(season, hour, -c(`Source no`, Source, Period, Day)) %>%
# Split the former column name at the whitespace, e.g. "Spring On" becomes
# season = "Spring", status = "On".
extract(season, c("season", "status"), "(\\w+?)\\s(\\w+)") %>%
# Drop the entries marked "off"; they carry no hour value.
filter(hour != "off") %>%
# Day becomes plain text and the remaining hour values become numbers.
mutate(Day = as.character(Day), hour=as.numeric(hour)) %>%
# Spread status back out so each remaining row has an On and an Off column.
spread(status, hour)
Now it's easy to join the three dataframes to put together all the information you need to create your output:
# Combine the hourly calendar (d.out), the reshaped on/off profile
# (profile_new) and the stack parameters (stack_info) into one hourly table.
d.out %>%
# Attach the profile rows matching each hour's weekday and season. Being an
# inner join, hours whose weekday/season has no row in profile_new are dropped.
# NOTE(review): `weekday` must use the same labels as `Day` ("Mon", "Tue", ...)
# for this join to match -- confirm for the session locale.
inner_join(profile_new, by=c("weekday"="Day", "season"="season")) %>%
# One group per source and hour; a source can match several profile rows
# (one per Period) for the same hour.
group_by(Source, dates, year, day, weekday, season, hour) %>%
# status is TRUE when the hour falls inside any matched On..Off window
# (both bounds inclusive).
summarise(status = any(hour >= On & hour <= Off)) %>%
# Add the per-stack parameters, matching Source to SNAME.
inner_join(stack_info, by=c("Source"="SNAME")) %>%
# Set the Nox and Sox values to 0 for hours where status is FALSE.
mutate(Nox = ifelse(status, Nox, 0),
Sox = ifelse(status, Sox, 0)) %>%
arrange(Source, year, dates, hour) %>%
# Keep only the columns listed here, in this order.
select(Source, year, day, weekday, season, hour, `EXIT VEL`, TEMP, `STACK DIAM`, W, Nox, Sox)
Obviously it's not quite the format you posted. From here you could write your dataframe to a csv (stack by stack by using append = TRUE).
I have two data.frames df.1 and df.2 that I would merge or otherwise select data from to create a new data.frame. df.1 contains information about each individual (ID), sampling event (Event), Site and sample number (Sample). The tricky part for me is that Site and the corresponding Sample for each ID-Event pairing is different. For example, F3-3 has Site "plum" for Sample "1" and M6-3 has Site "pear" for Sample "1".
df.2 has Sample1 and Sample2 which corresponds to the Sample information in df.1 by way of the ID-Event pairing.
I'd like to match/merge the information between these two data.frames. Essentially, get the "word" from Site in df.1 that matches the Sample number. An example (df.3) is below.
Each ID-Event pairing will only have one Site and corresponding Sample (e.g. "Apple" will correspond to "1", not to both "1" and "4"). I know I could use merge if I were only matching, for example, Sample1 or Sample2, but I am not sure how to do this with both to populate Site1 and Site2 with the correctly matched word.
df.1 <- structure(list(ID = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("F1",
"F3", "M6"), class = "factor"), Sex = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L), .Label = c("F", "M"), class = "factor"), Event = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L,
4L, 4L, 4L, 4L), Site = structure(c(1L, 3L, 9L, 7L, 8L, 10L,
2L, 6L, 4L, 5L, 1L, 9L, 7L, 8L, 10L, 5L, 10L, 2L, 6L, 4L, 5L,
1L, 9L, 2L, 6L, 4L, 5L, 1L, 8L, 3L, 10L, 4L, 2L, 6L, 4L, 5L,
1L), .Label = c("Apple", "Banana", "Grape", "Guava", "Kiwi",
"Mango", "Orange", "Peach", "Pear", "Plum"), class = "factor"),
Sample = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L,
3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L)), .Names = c("ID",
"Sex", "Event", "Site", "Sample"), class = "data.frame", row.names = c(NA,
-37L))
#
df.2 <- structure(list(Sample1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L), Sample2 = c(2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
3L, 4L, 5L), V1 = c(0.12, 0.497, 0.715, 0, 0.001, 0, 0.829, 0,
0, 0.001, 0, 0.829), V2 = c(0.107, 0.273, 0.595, 0, 0.004, 0,
0.547, 0.001, 0.001, 0.107, 0.273, 0.595), ID = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L), .Label = c("F1",
"M6"), class = "factor"), Sex = structure(c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L), .Label = c("F", "M"), class = "factor"),
Event = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L)), .Names = c("Sample1",
"Sample2", "V1", "V2", "ID", "Sex", "Event"), class = "data.frame", row.names = c(NA,
-12L))
#
df.3 <- structure(list(Sample1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L), Sample2 = c(2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
3L, 4L, 5L), V1 = c(0.12, 0.497, 0.715, 0, 0.001, 0, 0.829, 0,
0, 0.001, 0, 0.829), V2 = c(0.107, 0.273, 0.595, 0, 0.004, 0,
0.547, 0.001, 0.001, 0.107, 0.273, 0.595), Site1 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L), .Label = c("Apple",
"Banana"), class = "factor"), Site2 = structure(c(2L, 8L, 6L,
7L, 9L, 1L, 5L, 3L, 4L, 5L, 3L, 4L), .Label = c("Banana", "Grape",
"Guava", "Kiwi", "Mango", "Orange", "Peach", "Pear", "Plum"), class = "factor"),
ID = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L), .Label = c("F1", "M6"), class = "factor"), Sex = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L), .Label = c("F",
"M"), class = "factor"), Event = c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 3L, 3L, 3L)), .Names = c("Sample1", "Sample2",
"V1", "V2", "Site1", "Site2", "ID", "Sex", "Event"), class = "data.frame", row.names = c(NA, -12L))
Two merges should do it:
# Look up the Site for Sample1 and then for Sample2 with two left merges.
# The lookup is keyed on ID as well as Event and Sample: the same
# Event/Sample pair can belong to different individuals with different sites
# (e.g. F3 and M6 both have Event 3, Sample 1), so matching on Event and
# Sample alone would mix individuals and duplicate rows.
site_lookup <- unique(df.1[, c("ID", "Event", "Sample", "Site")])
first <- merge(df.2, site_lookup,
               by.x = c("ID", "Event", "Sample1"),
               by.y = c("ID", "Event", "Sample"),
               all.x = TRUE)
# `first` already has a Site column, so this merge yields Site.x (the site of
# Sample1) and Site.y (the site of Sample2).
second <- merge(first, site_lookup,
                by.x = c("ID", "Event", "Sample2"),
                by.y = c("ID", "Event", "Sample"),
                all.x = TRUE)
# merge() puts the key columns first; restore the column layout shown below.
second <- second[, c("Sample2", "Event", "Sample1", "V1", "V2",
                     "ID", "Sex", "Site.x", "Site.y")]
print(second)
Sample2 Event Sample1 V1 V2 ID Sex Site.x Site.y
1 10 1 1 0.000 0.001 F1 F Apple Kiwi
2 2 1 1 0.120 0.107 F1 F Apple Grape
3 3 1 1 0.497 0.273 F1 F Apple Pear
4 3 3 2 0.001 0.107 M6 M Banana Mango
5 4 1 1 0.715 0.595 F1 F Apple Orange
6 4 3 2 0.000 0.273 M6 M Banana Guava
7 5 1 1 0.000 0.000 F1 F Apple Peach
8 5 3 2 0.829 0.595 M6 M Banana Kiwi
9 6 1 1 0.001 0.004 F1 F Apple Plum
10 7 1 1 0.000 0.000 F1 F Apple Banana
11 8 1 1 0.829 0.547 F1 F Apple Mango
12 9 1 1 0.000 0.001 F1 F Apple Guava