create data frame from nested entries - r

I have a data frame test like this:
dput(test)
structure(list(X = 1L, entityId = structure(1L, .Label = "HOST-123", class = "factor"),
displayName = structure(1L, .Label = "server1", class = "factor"),
discoveredName = structure(1L, .Label = "server1", class = "factor"),
firstSeenTimestamp = 1593860000000, lastSeenTimestamp = 1603210000000,
tags = structure(1L, .Label = "c(\"CONTEXTLESS\", \"CONTEXTLESS\", \"CONTEXTLESS\", \"CONTEXTLESS\", \"CONTEXTLESS\", \"CONTEXTLESS\", \"CONTEXTLESS\", \"CONTEXTLESS\"), c(\"app1\", \"client\", \"org\", \"app1\", \"DATA_CENTER\", \"PURPOSE\", \"REGION\", \"Test\"), c(NA, \"NONE\", \"Host:Environment:test123\", \"111\", \"222\", \"GENERAL\", \"444\", \"555\")", class = "factor")), .Names = c("X",
"entityId", "displayName", "discoveredName", "firstSeenTimestamp",
"lastSeenTimestamp", "tags"), class = "data.frame", row.names = c(NA,
-1L))
There is a column called tags which should become a data frame. I need to get rid of the first row in tags (which keeps saying CONTEXTLESS), expand the second column in tags (make its values the column names), and lastly insert the 3rd column's values in tags under each of the expanded columns.
For example, it needs to look like this:
structure(list(entityId = structure(1L, .Label = "HOST-123", class = "factor"),
displayName = structure(1L, .Label = "server1", class = "factor"),
discoveredName = structure(1L, .Label = "server1", class = "factor"),
firstSeenTimestamp = 1593860000000, lastSeenTimestamp = 1603210000000,
app1 = NA, client = structure(1L, .Label = "None", class = "factor"),
org = structure(1L, .Label = "Host:Environment:test123", class = "factor"),
app1.1 = 111L, data_center = 222L, purppose = structure(1L, .Label = "general", class = "factor"),
region = 444L, test = 555L), .Names = c("entityId", "displayName",
"discoveredName", "firstSeenTimestamp", "lastSeenTimestamp",
"app1", "client", "org", "app1.1", "data_center", "purppose",
"region", "test"), class = "data.frame", row.names = c(NA, -1L
))
I need to remove the 1st vector that keeps saying "contextless", add the second vector the columns. Each 2nd vector value should be a column name. Last vector should be values of the newly added columns.

If you are willing to drop the first "row" of garbage and then do a little cleanup of the parse side effects, then this might be a good place to start:
# Break the string after every "c(...)," so each vector sits on its own line.
# skip = 1 then discards the first line and header = TRUE turns the next line
# into the column names.
read.table(
  header = TRUE,
  skip = 1,
  sep = ",",
  text = gsub(pattern = "\\),", replacement = ")\n", x = test$tags[1])
)
c.app1 client org app1 DATA_CENTER PURPOSE REGION Test.
1 c(NA NONE Host:Environment:test123 111 222 GENERAL 444 555)
The read.table function uses the scan function, which doesn't know that "c(" and ")" are meaningful. The other alternative might be to try eval(parse(text= .)) (which would know that they are enclosing vectors) on the second and third lines, but I couldn't see a clean way to do that. I initially tried to separate the lines using strsplit, but that caused me to lose the parens.
Here's a stab at some cleanup via the addition of some more gsub operations:
# Two gsub passes: the inner one turns every ")," boundary into a line break,
# the outer one strips the remaining "c(" and ")" wrappers, leaving plain
# comma-separated lines for read.table.
read.table(
  text = gsub(
    pattern = "c\\(|\\)",
    replacement = "",
    x = gsub(pattern = "\\),", replacement = "\n", x = test$tags[1])
  ),
  header = TRUE, # first line that is kept becomes the column names
  skip = 1,      # drop the first line
  sep = ","      # split fields on commas
)
app1 client org app1.1 DATA_CENTER PURPOSE REGION Test
1 NA NONE Host:Environment:test123 111 222 GENERAL 444 555
The reason for the added ".1" in the second instance of app1 is that R colnames in dataframes need to be unique unless you override that with check.names=FALSE

Here is a tidyverse approach
library(dplyr)
library(tidyr)
#' Parse a string of comma-separated `c(...)` vectors into a data frame.
#'
#' @param txt A string (or factor) holding R vector literals of equal length,
#'   e.g. `'c("a", "b"), c("x", "y")'`. The text is evaluated as R code, so
#'   only pass text from a trusted source.
#' @param keep Columns to return: `"all"` (default) for every column, or any
#'   index accepted by `[` (names such as `"X2"`, positions, or negative
#'   positions such as `-1L` to drop the first column).
#' @return The selected columns, named `X1`, `X2`, ... in the order the
#'   vectors appear in `txt`.
str2dataframe <- function(txt, keep = "all") {
  # NOTE(review): eval(parse()) executes whatever `txt` contains.
  cols <- eval(parse(text = paste0("list(", as.character(txt), ")")))
  # stringsAsFactors = FALSE keeps text columns as character on R < 4.0 too,
  # which character-only helpers such as make.unique() need.
  out <- do.call(data.frame, c(cols, list(stringsAsFactors = FALSE)))
  # Rename columns as X1, X2, ...
  nms <- make.names(seq_along(out), unique = TRUE)
  names(out) <- nms
  # identical() rather than `==`: `keep` may be numeric or longer than one,
  # and `if` needs a single TRUE/FALSE.
  if (identical(keep, "all")) {
    keep <- nms
  }
  out[, keep]
}
# For every row: parse the tags string without its first vector, make
# duplicated keys in X2 unique, then spread the X2/X3 pairs into columns.
df %>%
  mutate(
    tags = lapply(tags, function(txt) {
      parsed <- str2dataframe(txt, -1L)
      parsed$X2 <- make.unique(parsed$X2)
      parsed
    })
  ) %>%
  unnest(tags) %>%
  pivot_wider(names_from = "X2", values_from = "X3")
df looks like this
> df
X entityId displayName discoveredName firstSeenTimestamp lastSeenTimestamp
1 1 HOST-123 server1 server1 1.59386e+12 1.60321e+12
tags
1 c("CONTEXTLESS", "CONTEXTLESS", "CONTEXTLESS", "CONTEXTLESS", "CONTEXTLESS", "CONTEXTLESS", "CONTEXTLESS", "CONTEXTLESS"), c("app1", "client", "org", "app1", "DATA_CENTER", "PURPOSE", "REGION", "Test"), c(NA, "NONE", "Host:Environment:test123", "111", "222", "GENERAL", "444", "555")
Output looks like this
# A tibble: 1 x 14
X entityId displayName discoveredName firstSeenTimestamp lastSeenTimestamp app1 client org app1.1 DATA_CENTER PURPOSE REGION Test
<int> <fct> <fct> <fct> <dbl> <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 1 HOST-123 server1 server1 1593860000000 1603210000000 NA NONE Host:Environment:test123 111 222 GENERAL 444 555

Related

Converting empty values to NULL in R - Handling date column

I have a simple dataframe as: dput(emp)
structure(list(name = structure(1L, .Label = "Alex", class = "factor"),
job = structure(1L, .Label = "", class = "factor"), Mgr = structure(1L, .Label = "", class = "factor"),
update = structure(18498, class = "Date")), class = "data.frame", row.names = c(NA,
-1L))
I want to convert all empty rows to NULL
The simplest way to achieve is:
emp[emp==""] <- NA
Which of course would have worked, but I get this error for the date column:
Error in charToDate(x) :
character string is not in a standard unambiguous format
How can I convert all other empty rows to NULL without having to deal with the date column? Please note that the actual data frame has 30000+ rows.
Try formatting the date variable as character, make the change, and then transform it back to a date:
# Convert the Date column to character so that the emp == '' comparison
# below does not try to parse '' as a date
emp$update <- as.character(emp$update)
# Replace every empty string in the data frame with NA
emp[emp=='']<-NA
# Convert the column back to Date
emp$update <- as.Date(emp$update)
Output:
name job Mgr update
1 Alex <NA> <NA> 2020-08-24
You can try type.convert like below
type.convert(emp,as.is = TRUE)
such that
name job Mgr update
1 Alex NA NA 2020-08-24
You may try this using dplyr:
library(dplyr)
# Turn the Date column and every factor column into character, then replace
# empty strings with NA. na_if() is applied column by column because it
# operates on vectors; current dplyr rejects a whole data frame as input.
emp %>%
  mutate(across(c(update, where(is.factor)), as.character)) %>%
  mutate(across(where(is.character), ~ na_if(.x, "")))
As mentioned by @Duck, you have to convert the date variable to character first.
Afterwards you can transform it back to a date if you need to:
library(dplyr)
# Same clean-up as a round trip: text-ify the Date and factor columns,
# replace empty strings with NA column by column (na_if() operates on
# vectors, not on a whole data frame), then restore the Date type.
emp %>%
  mutate(across(c(update, where(is.factor)), as.character)) %>%
  mutate(across(where(is.character), ~ na_if(.x, ""))) %>%
  mutate(across(update, as.Date))
See if this works:
> library(dplyr)
> library(purrr)
> emp <- structure(list(name = structure(1L, .Label = "Alex", class = "factor"),
+ job = structure(1L, .Label = "", class = "factor"), Mgr = structure(1L, .Label = "", class = "factor"),
+ update = structure(18498, class = "Date")), class = "data.frame", row.names = c(NA,
+ -1L))
> emp
name job Mgr update
1 Alex 2020-08-24
> emp %>% mutate(update = as.character(update)) %>% map_df(~gsub('^$',NA, .x)) %>% mutate(update = as.Date(update)) %>% mutate(across(1:3, as.factor))
# A tibble: 1 x 4
name job Mgr update
<fct> <fct> <fct> <date>
1 Alex NA NA 2020-08-24
>

How do I unnest a nested df and use the column name as part of the new column name?

I realize my title is probably a little confusing. I have some JSON that is a little confusing to unnest. I am trying to use the tidyverse.
Sample Data
# Two teams; `owners` is a list column holding one single-row data frame per
# team, whose `name` and `id` columns clash with the outer `name` and `id`.
df <- structure(
  list(
    long_abbr = c("Team11", "BBS"),
    short_name = c("Ac ", "BK"),
    division = c("", ""),
    name = c("AC Slaters Muscles", "Broken Bats"),
    abbr = c("T1", "T1"),
    owners = list(
      structure(
        list(
          commissioner = 0L, name = "Chris Liss",
          id = "300144F8-79F4-11EA-8F25-9AE405472731"
        ),
        class = "data.frame", row.names = 1L
      ),
      structure(
        list(
          commissioner = 1L, name = "Mark Ortin",
          id = "90849EF6-7427-11EA-95AA-4EEEAC7F8CD2"
        ),
        class = "data.frame", row.names = 1L
      )
    ),
    id = c("1", "2"),
    logged_in_team = c(NA_integer_, NA_integer_)
  ),
  row.names = 1:2, class = "data.frame"
)
# Unnest Owners Information
df <- df %>%
unnest(owners)
I get the following error since I have duplicate columns that use name.
Error: Column names `name` and `id` must not be duplicated.
Is there an easy way to unnest the columns with a naming convention that takes the prefix owners (or in my case, I'd want it to take whatever the name of the column that hold the nested df is) before the nested columns. I.E. owners.commissioner, owners.name, owners.id. I'd also be interested in solutions that use camel case, and an underscore. I.E. ownersName, or owners_name.
set the argument names_sep:
df <- structure(
list(long_abbr = c("Team11", "BBS"),
short_name = c("Ac ", "BK"),
division = c("", ""),
name = c("AC Slaters Muscles", "Broken Bats"),
abbr = c("T1", "T1"),
owners = list(
structure(list(commissioner = 0L, name = "Chris Liss",
id = "300144F8-79F4-11EA-8F25-9AE405472731"),
class = "data.frame", row.names = 1L),
structure(list(commissioner = 1L, name = "Mark Ortin",
id = "90849EF6-7427-11EA-95AA-4EEEAC7F8CD2"),
class = "data.frame", row.names = 1L)),
id = c("1", "2"),
logged_in_team = c(NA_integer_, NA_integer_)),
row.names = 1:2, class = "data.frame"
)
tidyr::unnest(df, owners, names_sep = "_")
#> # A tibble: 2 x 10
#> long_abbr short_name division name abbr owners_commissi… owners_name
#> <chr> <chr> <chr> <chr> <chr> <int> <chr>
#> 1 Team11 "Ac " "" AC S… T1 0 Chris Liss
#> 2 BBS "BK" "" Brok… T1 1 Mark Ortin
#> # … with 3 more variables: owners_id <chr>, id <chr>, logged_in_team <int>
Created on 2020-04-26 by the reprex package (v0.3.0)
Does this solve your problem?

Is there a limit to the number of levels in R?

I am wondering if there is a limitation on the number of levels for a factor?
I am trying to restructure some curriculums from Xing. The selectable industries are around 135 different ones.
My code looks like that, as I mentioned there are 135 different industries in my actual code.
companyIndustryLevels <- c("","ACADEMIA", "ACCOUNTING", "AEROSPACE")
levels(samples[[1]]$Industry) <- companyIndustryLevels
The following combinations work fine and are selectable when filtering the list.
genderLevels <- c("M","F")
companySizeLevels <- c("","1","1-10","11-50","51-200","201-500","501-1000","1001-5000","5001-10000","10001+")
levels(samples[[1]]$Gender) <- genderLevels
levels(samples[[1]]$CompanySize) <- companySizeLevels
So the problem is, that when viewing the list, the industry column only shows factor with 1 level, not with 135 levels.
EDIT:
I am using RStudio Version 11.1.383 and R Version 3.4.3.
As you can see in the reproducible example below, the other columns like "Gender", "Beschäftigungsart", "Position", "Unternehmensgroesse" also got levels.
When selecting the Filter in the View Window in RStudio I am able to filter all of the columns by their levels, except the "Industrie" column.
View(structure(
list(
ID = 1,
Gender = structure(1L, .Label = c("M",
"F"), class = "factor"),
Bildungseinrichtungen = structure(1L, .Label = "", class = "factor"),
Abschluss = structure(1L, .Label = "", class = "factor"),
Studienfach = structure(1L, .Label = "", class = "factor"),
Beschäftigungsart = structure(
1L,
.Label = c(
"",
"FULL_TIME_EMPLOYEE",
"PART_TIME_EMPLOYEE",
"INTERN",
"FREELANCER",
"OWNER",
"PARTNER",
"BOARD_MEMBER",
"VOLUNTEER"
),
class = "factor"
),
Station.Start = NA,
Station.Ende = NA,
Bezeichnung = NA,
Position = structure(
1L,
.Label = c(
"",
"STUDENT_INTERN",
"ENTRY_LEVEL",
"PROFESSIONAL_EXPERIENCED",
"MANAGER_SUPERVISOR",
"EXECUTIVE",
"SENIOR_EXECUTIVE"
),
class = "factor"
),
Unternehmen = structure(1L, .Label = "AMA", class = "factor"),
Unternehmensgroesse = structure(
1L,
.Label = c(
"",
"1",
"1-10",
"11-50",
"51-200",
"201-500",
"501-1000",
"1001-5000",
"5001-10000",
"10001+"
),
class = "factor"
),
Industrie = structure(
1L,
.Label = c(
"ACADEMIA",
"ACCOUNTING",
"AEROSPACE",
"AGRICULTURE",
"AIRLINES",
"ALTERNATIVE_MEDICINE",
"APPAREL_AND_FASHION",
"ARCHITECTURE_AND_PLANNING",
"ARTS_AND_CRAFTS",
"AUTOMOTIVE",
"BANKING",
"BIOTECHNOLOGY",
"BROADCAST_MEDIA",
"BUILDING_MATERIALS",
"BUSINESS_SUPPLIES_AND_EQUIPMENT",
"CHEMICALS",
"CIVIC_AND_SOCIAL_ORGANIZATIONS",
"CIVIL_ENGINEERING",
"CIVIL_SERVICE",
"COMPOSITES",
"COMPUTER_AND_NETWORK_SECURITY",
"COMPUTER_GAMES",
"COMPUTER_HARDWARE",
"COMPUTER_NETWORKING",
"COMPUTER_SOFTWARE",
"CONSTRUCTION",
"CONSULTING",
"CONSUMER_ELECTRONICS",
"CONSUMER_GOODS",
"CONSUMER_SERVICES",
"COSMETICS",
"DAYCARE",
"DEFENSE_MILITARY",
"DESIGN",
"EDUCATION",
"ELEARNING",
"ELECTRICAL_ENGINEERING",
"ENERGY",
"ENTERTAINMENT",
"ENVIRONMENTAL_SERVICES",
"EVENTS_SERVICES",
"FACILITIES_SERVICES",
"FACILITY_MANAGEMENT",
"FINANCIAL_SERVICES",
"FISHERY",
"FOOD",
"FUNDRAISING",
"FURNITURE",
"GARDENING_LANDSCAPING",
"GEOLOGY",
"GLASS_AND_CERAMICS",
"GRAPHIC_DESIGN",
"HEALTH_AND_FITNESS",
"HOSPITALITY",
"HUMAN_RESOURCES",
"IMPORT_AND_EXPORT",
"INDUSTRIAL_AUTOMATION",
"INFORMATION_SERVICES",
"INFORMATION_TECHNOLOGY_AND_SERVICES",
"INSURANCE",
"INTERNATIONAL_AFFAIRS",
"INTERNATIONAL_TRADE_AND_DEVELOPMENT",
"INTERNET",
"INVESTMENT_BANKING",
"JOURNALISM",
"LEGAL_SERVICES",
"LEISURE_TRAVEL_AND_TOURISM",
"LIBRARIES",
"LOGISTICS_AND_SUPPLY_CHAIN",
"LUXURY_GOODS_AND_JEWELRY",
"MACHINERY",
"MANAGEMENT_CONSULTING",
"MARITIME",
"MARKETING_AND_ADVERTISING",
"MARKET_RESEARCH",
"MECHANICAL_INDUSTRIAL_ENGINEERING",
"MEDIA_PRODUCTION",
"MEDICAL_DEVICES",
"MEDICAL_SERVICES",
"MEDICINAL_PRODUCTS",
"METAL_METALWORKING",
"METROLOGY_CONTROL_ENGINEERING",
"MINING_AND_METALS",
"MOTION_PICTURES",
"MUSEUMS_AND_CULTURAL_INSTITUTIONS",
"MUSIC",
"NANOTECHNOLOGY",
"NON_PROFIT_ORGANIZATION",
"NURSING_AND_PERSONAL_CARE",
"OIL_AND_ENERGY",
"ONLINE_MEDIA",
"OTHERS",
"OUTSOURCING_OFFSHORING",
"PACKAGING_AND_CONTAINERS",
"PAPER_AND_FOREST_PRODUCTS",
"PHOTOGRAPHY",
"PLASTICS",
"POLITICS",
"PRINTING",
"PRINT_MEDIA",
"PROCESS_MANAGEMENT",
"PROFESSIONAL_TRAINING_AND_COACHING",
"PSYCHOLOGY_PSYCHOTHERAPY",
"PUBLIC_HEALTH",
"PUBLIC_RELATIONS_AND_COMMUNICATIONS",
"PUBLISHING",
"RAILROAD",
"REAL_ESTATE",
"RECREATIONAL_FACILITIES_AND_SERVICES",
"RECYCLING_AND_WASTE_MANAGEMENT",
"RENEWABLES_AND_ENVIRONMENT",
"RESEARCH",
"RESTAURANTS_AND_FOOD_SERVICE",
"RETAIL",
"SECURITY_AND_INVESTIGATIONS",
"SEMICONDUCTORS",
"SHIPBUILDING",
"SPORTS",
"STAFFING_AND_RECRUITING",
"TAX_ACCOUNTANCY_AUDITING",
"TELECOMMUNICATION",
"TEXTILES",
"THEATER_STAGE_CINEMA",
"TIMBER",
"TRAFFIC_ENGINEERING",
"TRANSLATION_AND_LOCALIZATION",
"TRANSPORT",
"VENTURE_CAPITAL_AND_PRIVATE_EQUITY",
"VETERINARY",
"WELFARE_AND_COMMUNITY_HEALTH",
"WHOLESALE",
"WINE_AND_SPIRITS",
"WRITING_AND_EDITING",
"PHARMACEUTICALS"
),
class = "factor"
)
),
.Names = c(
"ID",
"Gender",
"Bildungseinrichtungen",
"Abschluss",
"Studienfach",
"Beschäftigungsart",
"Station.Start",
"Station.Ende",
"Bezeichnung",
"Position",
"Unternehmen",
"Unternehmensgroesse",
"Industrie"
),
row.names = 1L,
class = "data.frame"
))
It seems as if the Filtering option in RStudio's Data Viewer (View()) offers a drop down menu for a factor, when its number of levels (nlevels()) is less than 65. Otherwise it defaults to a search field:
df <- data.frame(x=as.factor(1:64))
View(df)
# "filter" yields a drop down menu
df <- data.frame(x=as.factor(1:65))
View(df)
# "filter" yields a search field
RStudio.Version()$version
# [1] ‘1.0.143’
Note that this has nothing to do with R itself, as already mentioned in the comments.

making the first row a header in a dataframe in r

I've seen this asked here: Create header of a dataframe from the first row in the data frame
and here: assign headers based on existing row in dataframe in R
and the solutions offered don't work for me.
When I transpose my dataframe (p1), the header of DF.transpose (p1t) is something new and annoying. and the first row of the p1t is what I would like to use as the header, I tried:
colnames(p1t) = p1t[1, ]
and it doesn't work!
here is how the original df appears:
File Fp1.PD_ShortSOA_FAM Fp1.PD_LongSOA_FAM Fp1.PD_ShortSOA_SEMplus_REAL Fp1.PD_ShortSOA_SEMplus_FICT
sub0001 0,446222 2,524,804 0,272959 1,281,349
sub0002 1,032,688 2,671,048 1,033,278 1,217,817
And here is how the transpose appears:
row.names V1 V2
File sub0001 sub0002
Fp1.PD_ShortSOA_FAM 0,446222 1,032,688
Fp1.PD_LongSOA_FAM 2,524,804 2,671,048
Fp1.PD_ShortSOA_SEMplus_REAL 0,272959 1,033,278
Fp1.PD_ShortSOA_SEMplus_FICT 1,281,349 1,217,817
Fp1.PD_ShortSOA_SEMminus_REAL 0,142739 1,405,100
Fp1.PD_ShortSOA_SEMminus_FICT 1,515,577 -1,990,458
How can I make "File", "sub0001","sub0002" etc... as the header?
Thanks!
Works for me (with a little trick).
# Read the sample and transpose it into a character matrix, then use the
# first row (the File values) as column names and drop that row.
x <- t(read.table(
  header = TRUE,
  text = "File Fp1.PD_ShortSOA_FAM Fp1.PD_LongSOA_FAM Fp1.PD_ShortSOA_SEMplus_REAL Fp1.PD_ShortSOA_SEMplus_FICT
sub0001 0,446222 2,524,804 0,272959 1,281,349
sub0002 1,032,688 2,671,048 1,033,278 1,217,817"
))
dimnames(x) <- list(rownames(x), x[1, ])
x <- x[-1, ]
x
sub0001 sub0002
Fp1.PD_ShortSOA_FAM "0,446222" "1,032,688"
Fp1.PD_LongSOA_FAM "2,524,804" "2,671,048"
Fp1.PD_ShortSOA_SEMplus_REAL "0,272959" "1,033,278"
Fp1.PD_ShortSOA_SEMplus_FICT "1,281,349" "1,217,817"
We can make use of transpose from data.table
library(janitor)
data.table::transpose(x, keep.names = 'File') %>%
row_to_names(1)
# File sub0001 sub0002
#2 Fp1.PD_ShortSOA_FAM 0,446222 1,032,688
#3 Fp1.PD_LongSOA_FAM 2,524,804 2,671,048
#4 Fp1.PD_ShortSOA_SEMplus_REAL 0,272959 1,033,278
#5 Fp1.PD_ShortSOA_SEMplus_FICT 1,281,349 1,217,817
data
x <- structure(list(File = structure(1:2, .Label = c("sub0001", "sub0002"
), class = "factor"), Fp1.PD_ShortSOA_FAM = structure(1:2, .Label = c("0,446222",
"1,032,688"), class = "factor"), Fp1.PD_LongSOA_FAM = structure(1:2, .Label = c("2,524,804",
"2,671,048"), class = "factor"), Fp1.PD_ShortSOA_SEMplus_REAL = structure(1:2, .Label = c("0,272959",
"1,033,278"), class = "factor"), Fp1.PD_ShortSOA_SEMplus_FICT = structure(2:1, .Label = c("1,217,817",
"1,281,349"), class = "factor")), class = "data.frame", row.names = c(NA,
-2L))

Using apply with a user-defined function in R

I have defined the following function in r:
#' Look up the colour code for each tag colour / Julian date pair.
#'
#' A row of `lookup` matches when its `color_match` equals the tag colour and
#' the date lies within `julian_start` to `julian_cut_off_date` (both ends
#' inclusive). Vectorised over `color` and `date`, so call it once on whole
#' columns, e.g.
#' `Data$color_code <- getTagColor(Data$tag_color, Data$julian_date)`,
#' rather than through `apply()`.
#'
#' @param color Character vector or factor of tag colours.
#' @param date Vector of Julian dates, the same length as `color`.
#' @param lookup Data frame with columns `color_match`, `julian_start`,
#'   `julian_cut_off_date` and `color_code`; defaults to `TwistTieFix`.
#' @return Character vector with one element per input pair: the matching
#'   `color_code` (the last matching row wins if several match), or `NA`
#'   when no row matches.
getTagColor <- function(color, date, lookup = TwistTieFix) {
  stopifnot(length(color) == length(date))
  # Compare as character so factors with different level sets still match.
  color <- as.character(color)
  match_color <- as.character(lookup$color_match)
  codes <- as.character(lookup$color_code)
  vapply(seq_along(color), function(k) {
    # The date window is bounded by the numeric julian_start and
    # julian_cut_off_date columns; color_match only holds colour names.
    hits <- which(
      match_color == color[k] &
        date[k] >= lookup$julian_start &
        date[k] <= lookup$julian_cut_off_date
    )
    if (length(hits) > 0L) codes[hits[length(hits)]] else NA_character_
  }, character(1))
}
I then used apply() in an attempt to apply the function to each row.
#Apply the above function to the data set
# NOTE(review): apply() takes a function as its third argument, but here
# getTagColor(...) is called right away with the full tag_color and
# julian_date columns, so inside it `color` and `date` are whole vectors
# rather than the values of a single row.
# NOTE(review): the trailing backtick looks like leftover markup and is not
# valid R.
testData <- apply(Data, 1, getTagColor(Data$tag_color,Data$julian_date))`
The goal of the code is to use two variables in Data and find another value to put into a new column in Data (color_code) that will be based on the information in TwistTieFix. When I run the code, I get a list of warnings saying
In if ((color == TwistTieFix$color_match[i]) & (date > ... :
the condition has length > 1 and only the first element will be used
I cannot determine why the function does not use the date and color from each row and use it in the function (at least that is what I think is going wrong here). Thanks!
Here are examples of the data frames being used:
TwistTieFix
color_name date color_code cut_off_date color_match julian_start julian_cut_off_date
yellow 2013-08-12 y1 2001-07-02 yellow 75 389
blue 2000-09-28 b1 2001-08-12 blue 112 430
Data
coll_date julian_date tag_color
2013-08-13 76 yellow
2013-08-14 76 yellow
2000-09-29 112 blue
Data has a lot more columns of different variables, but I am not allowed to include all of the columns. However, I have included the columns in Data that I am referencing in function. The data sets are loaded into r using read.csv and are from Excel csv files.
To me, it seems like you want to join Data and TwistTieFix where tag_color=color_match and julian_start <= julian_date <= julian_cut_off_date. Here are your sample data.sets in dput form
TwistTieFix <- structure(list(color_name = structure(c(2L, 1L), .Label = c("blue",
"yellow"), class = "factor"), date = structure(c(2L, 1L), .Label = c("2000-09-28",
"2013-08-12"), class = "factor"), color_code = structure(c(2L,
1L), .Label = c("b1", "y1"), class = "factor"), cut_off_date = structure(1:2, .Label = c("2001-07-02",
"2001-08-12"), class = "factor"), color_match = structure(c(2L,
1L), .Label = c("blue", "yellow"), class = "factor"), julian_start = c(75L,
112L), julian_cut_off_date = c(389L, 430L)), .Names = c("color_name",
"date", "color_code", "cut_off_date", "color_match", "julian_start",
"julian_cut_off_date"), class = "data.frame", row.names = c(NA,
-2L))
Data <- structure(list(coll_date = structure(c(2L, 3L, 1L), .Label = c("2000-09-29",
"2013-08-13", "2013-08-14"), class = "factor"), julian_date = c(76L,
76L, 112L), tag_color = structure(c(2L, 2L, 1L), .Label = c("blue",
"yellow"), class = "factor")), .Names = c("coll_date", "julian_date",
"tag_color"), class = "data.frame", row.names = c(NA, -3L))
An easy way to perform this merge would be using the data.table library. You can do
# Convert to data.table (by reference) and key both tables on colour + date
ttf <- setDT(TwistTieFix)
setkey(ttf, color_match, julian_start)
dt <- setDT(Data)
setkey(dt, tag_color, julian_date)
# Rolling join: for each row of dt, pick the ttf row of the same colour with
# the latest julian_start that is not after julian_date. In the joined result
# the key columns carry dt's values, so julian_start holds the julian_date.
# Keep rows up to and including the cut-off date, then rename the columns.
ttf[dt, roll = TRUE][
  julian_start <= julian_cut_off_date,
  list(
    coll_date,
    julian_date = julian_start,
    tag_color = color_match,
    color_code
  )
]
to get
coll_date julian_date tag_color color_code
1: 2000-09-29 112 blue b1
2: 2013-08-13 76 yellow y1
3: 2013-08-14 76 yellow y1

Resources