Cartesian Rolling Join using Data.table - r

I have two tables:
dat: contains the data
dates: contains the table of dates
library(data.table)
dates = structure(list(date = structure(c(17562, 17590, 17621, 17651,
17682, 17712, 17743, 17774, 17804, 17835, 17865, 17896), class = "Date")),
row.names = c(NA, -12L), class = "data.frame")
dat = structure(list(date = structure(c(17546, 17743, 17778, 17901,
17536, 17806, 17901, 17981, 17532, 17722, 17969, 18234), class = "Date"),
country = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L,
3L, 3L, 3L), .Label = c("AAA", "BBB", "CCC"), class = "factor"),
state = structure(c(1L, 1L, 2L, 3L, 4L, 1L, 2L, 5L, 6L, 1L,
2L, 2L), .Label = c("S1", "S2", "S3", "S4", "S5", "S6"), class = "factor"),
item = structure(c(1L, 2L, 4L, 6L, 3L, 5L, 3L, 2L, 2L, 4L,
5L, 7L), .Label = c("M1", "M2", "M3", "M4", "M5", "M6", "M7"
), class = "factor"), value = c(67L, 10L, 50L, 52L, 93L,
50L, 62L, 46L, 6L, 30L, 30L, 14L)), row.names = c(NA, -12L
), class = "data.frame")
dates = data.table(dates)
dat = data.table(dat)
setkey(dates, date)
setkey(dat, date)
The result I'm after is below, i.e. doing a rolling join with each individual row of dat and then combining the results.
rbind(
dat[1,][dates, roll = 90],
dat[2,][dates, roll = 90],
dat[3,][dates, roll = 90],
...
dat[12,][dates, roll = 90]
)
My actual dataset is much larger, so it's not practical to list every row of dat. Is there a shorthand way of doing the same thing without a loop?

If I understand your intent correctly, you want to rollover the records for 90 days.
I used a cross join and then used the rollover criteria to subset
Your original tables:
library(data.table)
dates = structure(list(date = structure(c(17562, 17590, 17621, 17651,
17682, 17712, 17743, 17774, 17804, 17835, 17865, 17896), class = "Date")),
row.names = c(NA, -12L), class = "data.frame")
dat = structure(list(date = structure(c(17546, 17743, 17778, 17901,
17536, 17806, 17901, 17981, 17532, 17722, 17969, 18234), class = "Date"),
country = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L,
3L, 3L, 3L), .Label = c("AAA", "BBB", "CCC"), class = "factor"),
state = structure(c(1L, 1L, 2L, 3L, 4L, 1L, 2L, 5L, 6L, 1L,
2L, 2L), .Label = c("S1", "S2", "S3", "S4", "S5", "S6"), class = "factor"),
item = structure(c(1L, 2L, 4L, 6L, 3L, 5L, 3L, 2L, 2L, 4L,
5L, 7L), .Label = c("M1", "M2", "M3", "M4", "M5", "M6", "M7"
), class = "factor"), value = c(67L, 10L, 50L, 52L, 93L,
50L, 62L, 46L, 6L, 30L, 30L, 14L)), row.names = c(NA, -12L
), class = "data.frame")
dates = data.table(dates)
dat = data.table(dat)
Note, I haven't setkey.
I am using a cross join function from the reference: How to do cross join in R?
# Cross join of two data.tables: every row of X is paired with every row of Y.
# A constant helper column `k` is added to new tables built from X and Y, the
# tables are joined on `k`, and `k` is dropped again from the result.
# Columns of Y whose names also occur in X come back with an "i." prefix.
CJ.table.1 <- function(X, Y) {
  x_with_key <- setkey(X[, c(k = 1, .SD)], k)
  y_with_key <- Y[, c(k = 1, .SD)]
  crossed <- x_with_key[y_with_key, allow.cartesian = TRUE]
  crossed[, k := NULL]
}
Then I cross join, subset for the roll join, rename columns and sort
# Cross join dat with dates, keep the pairs where the lookup date (i.date,
# taken from `dates`) falls 0 to 90 days after the record date, then
# rename/select the output columns and sort.
dsn1 <- CJ.table.1(dat, dates)[
  i.date - date >= 0 & i.date - date <= 90
][
  , .(date = i.date, country, state, item, value)
][
  order(country, state, item, value, date),
]

This is not necessarily the best way to do it, but you could simply write a loop here to iterate through your data:
# Rolling-join each row of `dat` onto `dates` separately (roll = 90), then
# stack the per-row results in a single step.
# - The pieces are collected in a list and bound once with rbindlist(), so the
#   accumulated table is not re-copied on every iteration as it is with
#   rbind() inside a loop.
# - seq_len(nrow(dat)) yields no iterations when `dat` has zero rows, whereas
#   1:nrow(dat) would iterate over c(1, 0).
df <- rbindlist(
  lapply(seq_len(nrow(dat)), function(i) dat[i, ][dates, roll = 90])
)
head(df)
date country state item value
1: 2018-01-31 CCC S6 M2 6
2: 2018-02-28 CCC S6 M2 6
3: 2018-03-31 CCC S6 M2 6
4: 2018-04-30 <NA> <NA> <NA> NA
5: 2018-05-31 <NA> <NA> <NA> NA
Edit: just saw you said "without a loop", it's been a long day. This is one way to solve the problem though.

Related

R replace minimum date values per group

I have a df with observations on different groups for a year. However, date of the first observation can differ slightly per group (generally within the first days of the year). I'm planning to show these groups in one lineplot and I want them all to start on "2021-01-01".
How can I recode my date variable as such that the first occurrence (min(Date)?) per group is set to "2021-01-01"?
Here is a small subset, with the X, Y, Z having different starting dates. Thanks!
structure(list(Date = structure(c(18637, 18644, 18651, 18658,
18665, 18672, 18679, 18686, 18693, 18700, 18707, 18714, 18721,
18728, 18735, 18636, 18643, 18651, 18656, 18665, 18672, 18676,
18686, 18693, 18700, 18707, 18714, 18720, 18727, 18735, 18635,
18643, 18649, 18658, 18662, 18670, 18677, 18684, 18692, 18700,
18707, 18713, 18718, 18728, 18735), class = "Date"), Maand = structure(c(1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L), .Label = c("jan",
"feb", "mrt", "apr", "mei", "jun", "jul", "aug", "sep", "okt",
"nov", "dec"), class = c("ordered", "factor")), UPV2 = c(339L,
69L, 59L, 48L, 77L, 95L, 54L, 61L, 99L, 95L, 67L, 71L, 54L, 98L,
98L, 8L, 6L, 11L, 7L, 15L, 7L, 5L, 4L, 22L, 13L, 4L, 5L, 14L,
14L, 7L, 6L, 7L, 8L, 13L, 2L, 9L, 9L, 13L, 4L, 9L, 8L, 8L, 4L,
14L, 4L), VAR = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L), .Label = c("X", "Y", "Z"), class = "factor")), row.names = c(NA,
-45L), groups = structure(list(VAR = structure(1:3, .Label = c("X",
"Y", "Z"), class = "factor"), .rows = structure(list(1:15, 16:30,
31:45), ptype = integer(0), class = c("vctrs_list_of", "vctrs_vctr",
"list"))), row.names = c(NA, -3L), class = c("tbl_df", "tbl",
"data.frame"), .drop = TRUE), class = c("grouped_df", "tbl_df",
"tbl", "data.frame"))
This solution with dplyr (and lubridate) will target every occurrence of the minimum Date for each group, and replace it with your common starting date of DEFAULT_DATE. As of my recent revision, it will also update the custom month abbreviation in Maand.
library(dplyr)
library(lubridate)
# ...
# Code to generate your data.frame "df".
# ...
DEFAULT_DATE <- as.Date("2021-01-01")
df <- df %>%
group_by(VAR) %>%
mutate(# Update the custom month abbreviation for every "min(Date)" in each group.
Maand = if_else(Date == min(Date),
# Pick out the corresponding level of the factor.
ordered(levels(Maand)[month(DEFAULT_DATE)], levels = levels(Maand)),
Maand),
# Replace every "min(Date)" in each group.
Date = if_else(Date == min(Date), DEFAULT_DATE, Date)) %>%
ungroup()
Keep in mind that most of the complication here arises from your custom abbreviations for month names, as factorized (with ordering) in the Maand column.
Fortunately, my revised solution addresses this challenge. If a new group "A" were added to the mix, and its earliest Date were 2021-03-07, then its Maand would be your custom abbreviation for "March", which in this case is "mrt". When applying my transformation, that date would be updated to DEFAULT_DATE, which in this case is 2021-01-01. Furthermore, the mutate() would also ensure that the Maand is updated (here to "jan"): to the level of the factor (here the 1st level) that corresponds to the month of the DEFAULT_DATE (here the 1st month of the year).

Reorder geom_col by grouping variable (Error: Column `` can't be modified because it's a grouping variable)

I have this df,
df <- structure(list(Gender = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("", "Female", "Male",
"Q6 - OBS: Sex of Respondent"), class = "factor"), Incident = c("Death",
"Detention", "Extortion", "Kidnapping", "Physical_abuse", "Robbery",
"Sexual_assault", "Death", "Detention", "Extortion", "Kidnapping",
"Physical_abuse", "Robbery", "Sexual_assault"), Victim = structure(c(5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), .Label = c("",
"No", "Q54 - Did you witness any migrant deaths during your journey?",
"Refused", "Yes", "Q69 - Did you experience any physical abuse or harassment (of a non-sexual nature) during your journey?",
"Q62 - Did you witness or experience any sexual assault or harassment during your journey?",
"Q75 - Have you been kidnapped or otherwise held against your will during your journey?",
"Q96 - Have you been detained by the police, military, militia or immigration officials during your journey?",
"Q84 - Have you ever been robbed during your journey?", "Q90 - Did you have to give government officials gifts, services or bribes during your journey?"
), class = "factor"), n = c(253L, 300L, 1978L, 73L, 740L, 646L,
553L, 436L, 816L, 4052L, 194L, 1196L, 1059L, 259L), Percent = c(8,
10, 65, 2, 24, 21, 18, 6, 12, 59, 3, 17, 15, 4)), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -14L), groups = structure(list(
Gender = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L,
3L, 3L, 3L, 3L, 3L), .Label = c("", "Female", "Male", "Q6 - OBS: Sex of Respondent"
), class = "factor"), Incident = c("Death", "Detention",
"Extortion", "Kidnapping", "Physical_abuse", "Robbery", "Sexual_assault",
"Death", "Detention", "Extortion", "Kidnapping", "Physical_abuse",
"Robbery", "Sexual_assault"), .rows = list(1L, 2L, 3L, 4L,
5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L)), row.names = c(NA,
-14L), class = c("tbl_df", "tbl", "data.frame"), .drop = TRUE))
which I plotted like this:
df %>%
ggplot(aes(x=Incident, y=Percent, fill=Gender))+
geom_col(position = "dodge", width=0.72)
Now I need to sort Incident from higher total percentage to lower total percentage, so that Extortion comes first on the left, followed by Physical abuse, etc. I have tried:
df %>%
mutate(Incident=reorder(Incident, -Percent)) %>%
ggplot(aes(x=Incident, y=Percent, fill=Gender))+
geom_col(position = "dodge", width=0.72)
But I get the error:
Error: Column `Incident` can't be modified because it's a grouping variable
I have then tried ungroup, or fct_rev, but I cannot make it work! The only thing that works is to export the df as csv, to then import it again, and then it works. But of course that is not very efficient... Anybody please help!

Finding and Filling Missing Observations (Whole Rows not NA Values) with Mean Values

I am hoping to get some help identifying:
The location of missing observations when no NA values are present
(entire row is missing).
Create a row for the missing data with mean values based on mean values from certain categories.
My df:
Numerical load data from hockey players during practices over the season
Occasionally the accelerometers do not work during practice even though the player practiced. So to make sure we can still track the work they did during practice, I would like to insert the average mean value from their position (forward, defense, or goalie) from that practice. (i.e. if a goalie’s accelerometer does not work I would like to take the average loads of the other goalies and insert it into that player’s observational row for the practice).
This would be a simpler task if there were rows for ALL players each practice, with NA values for loads when the accelerometer doesn’t work, BUT when I download the data as a csv from the online cloud there are only rows for players who had working units. So that part is out of my control.
> head(DummyLoads)
Name Date Load Position
1 Jim 2019-10-19 900 2.100 Forward
2 Bob 2019-10-19 900 2.100 Forward
3 Dave 2019-10-19 900 2.100 Forward
4 Steve 2019-10-19 850 2.312 Forward
5 Fred 2019-10-19 850 2.312 Defense
6 Ray 2019-10-19 850 2.312 Defense
DummyLoads <- structure(list(Name = structure(c(4L, 1L, 2L, 6L, 3L, 5L, 4L, 1L, 2L, 3L, 5L, 4L, 1L, 2L, 6L, 3L, 5L, 2L, 6L, 3L, 5L),
.Label = c("Bob", "Dave", "Fred", "Jim", "Ray", "Steve"),
class = "factor"),
Date = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L),
.Label = c("2019-10-19", "2019-10-20", "2019-10-21", "2019-10-22"), class = "factor"),
Load = c(900L, 900L, 900L, 850L, 850L, 850L, 789L, 789L, 789L, 960L, 960L, 909L, 909L, 909L, 991L, 991L, 991L, 720L, 717L, 717L, 717L),
Load.Min = c(2.1, 2.1, 2.1, 2.312, 2.312, 2.312, 2.22, 2.22, 2.22, 2, 2, 1.88, 1.88, 1.88, 1.99, 1.99, 1.99, 2.1, 2.3, 2.3, 2.3), Position = structure(c(2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L),
.Label = c("Defense", "Forward"), class = "factor")), class = "data.frame", row.names = c(NA, -21L))
ggplot(DummyLoads, aes(x = Name, y = Load, fill = Position))+
geom_bar(stat = "identity")+
facet_grid(~Date)
Here is a chart showing players with missing data.
Missing Player Loads
Ideally I would like to be able to identify those missing data points without having to plot it first. I also want to avoid having to manually calculate means each time and then input. Hoping to find an automated solution because I will have a full season of practices to do this with, but understand that might be tricky!
Thank you in advance for any suggestions. I apologize if I didn’t explain things clearly.
Updated to the current question:
# Drop any existing grouping so the steps below operate on the whole table.
DummyLoads <- DummyLoads %>%
  ungroup()

# Every (Athlete, Date) combination, including those with no recorded row.
full_data <- expand.grid(
  Athlete = DummyLoads %>%
    pull(Athlete) %>%
    unique(),
  Date = DummyLoads %>%
    pull(Date) %>%
    unique())

full_data %>%
  # join the incomplete data onto the full grid; combinations without a
  # recorded row get NA in the joined columns
  left_join(DummyLoads, by = c("Athlete", "Date")) %>%
  # assign the position to each player
  # in the example data, some players do have different positions
  # if this is true, then it would be unclear which average should be
  # considered. Therefore, I assumed their position is constant
  # (the first Position listed for each Athlete is used)
  left_join(DummyLoads %>%
              select(Athlete, Position) %>%
              distinct(Athlete, .keep_all = TRUE),
            by = "Athlete") %>%
  # keep both to check the differences: Position.x is the recorded position,
  # Position (renamed from Position.y) is the per-athlete constant one
  rename(Position = Position.y) %>%
  group_by(Date, Position) %>%
  # if PL_Avg is missing, use the mean PL_Avg of the same Date/Position group
  mutate(Load2 = coalesce(PL_Avg,
                          mean(PL_Avg, na.rm = TRUE)))
Generic Base R solution:
# Flag NA cells row by row and collapse each row's flags into one string
# (one TRUE/FALSE per column, in column order):
is_val_na <- apply(is.na(DummyLoads), 1, paste, collapse = ", ")

# Attach the flags BEFORE splitting: split()/rbind() below reorders the rows by
# Position, so binding the flags on afterwards would put them on the wrong rows.
DummyLoads_flagged <- cbind(DummyLoads, row_na = as.factor(is_val_na))

# Replace the NAs in every numeric column of one group with that column's
# group mean; non-numeric columns are returned unchanged.
impute_group_means <- function(group_df) {
  is_num <- vapply(group_df, is.numeric, logical(1))
  group_df[is_num] <- lapply(group_df[is_num], function(col) {
    replace(col, is.na(col), mean(col, na.rm = TRUE))
  })
  group_df
}

# Split up using the grouping var "Position", impute column by column within
# each group (is.numeric() must be tested on the columns, not on the group's
# data.frame, which is never numeric), and coerce the list back to a df:
DummyLoads_imputed <- do.call(
  "rbind",
  lapply(
    split(DummyLoads_flagged, DummyLoads_flagged$Position),
    impute_group_means
  )
)
Data Used:
DummyLoads <- structure(list(Name = structure(c(4L, 1L, 2L, 6L, 3L, 5L, 4L, 1L, 2L, 3L, 5L, 4L, 1L, 2L, 6L, 3L, 5L, 2L, 6L, 3L, 5L),
.Label = c("Bob", "Dave", "Fred", "Jim", "Ray", "Steve"),
class = "factor"), Date = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L,
3L, 3L, 3L, 4L, 4L, 4L, 4L), .Label = c("2019-10-19", "2019-10-20", "2019-10-21", "2019-10-22"),
class = "factor"), Load = c(900L, 900L, 900L, 850L, 850L, 850L, 789L, 789L, 789L, 960L,
960L, 909L, 909L, 909L, 991L, 991L, 991L, 720L, 717L, 717L, 717L),
Position = structure(c(2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L),
.Label = c("Defense", "Forward"), class = "factor")), row.names = c(NA, -21L), class = "data.frame")

Select observations in R based on maximum number listed in a column

I hope I've done this correctly! I have two data frames:
teachers = structure(list(Teacher = c(123L, 123L, 123L, 123L, 124L),
tStudents = c(3L, 3L, 4L, 3L, 4L), Term = c(1801L, 1802L, 1801L, 1803L, 1802L),
Course = structure(c(5L, 6L, 7L, 6L, 8L), .Label = c("ENGG",
"ENGG2", "LITT", "LITT2", "MATH", "MATH2", "PHYS", "SCIE"
), class = "factor")), .Names = c("Teacher", "tStudents", "Term", "Course"), row.names = c(NA, 5L), class = "data.frame")
enrols = structure(list(UniqueStudent = structure(c(3L, 2L, 1L, 5L, 4L),
.Label = c("1801-ENGG-N1-abcd1#abc.edu.au", "1801-MATH-C1-abcd1#abc.edu.au","1801-PHYS-L1-abcd1#abc.edu.au", "1802-MATH2-G1-abcd1#abc.edu.au", "1802-SCIE-K2-abcd1#abc.edu.au"), class = "factor"), Term = c(1801L,1801L, 1801L, 1802L, 1802L), Student.Email.Addresses = structure(c(1L, 1L, 1L, 1L, 1L), .Label = "abcd1#abc.edu.au", class = "factor"), ID = structure(c(1L, 1L, 1L, 1L, 1L), .Label = "s12344", class = "factor"),
Gender.Description = structure(c(1L, 1L, 1L, 1L, 1L), .Label = "M", class = "factor"),
Age = c(12L, 12L, 12L, 12L, 12L), Program.Short.Description = structure(c(1L,
1L, 1L, 1L, 1L), .Label = "LSC1", class = "factor"), Term.CC.CN = structure(c(3L,
2L, 1L, 5L, 4L), .Label = c("1801-ENGG-N1", "1801-MATH-C1",
"1801-PHYS-L1", "1802-MATH2-G1", "1802-SCIE-K2"), class = "factor"),
Course.Code = structure(c(4L, 2L, 1L, 5L, 3L), .Label = c("ENGG",
"MATH", "MATH2", "PHYS", "SCIE"), class = "factor"), Class.Number = structure(c(4L,
1L, 5L, 3L, 2L), .Label = c("C1", "G1", "K2", "L1", "N1"), class = "factor"),
Teacher = c(123L, 123L, 125L, 124L, 123L)), .Names = c("UniqueStudent", "Term", "Student.Email.Addresses", "ID", "Gender.Description", "Age", "Program.Short.Description", "Term.CC.CN", "Course.Code", "Class.Number", "Teacher"), row.names = c(NA, 5L), class = "data.frame")
teachers$tStudents lists the maximum number of students allowed to be allocated to a teacher per Term and Course. I've also pre-merged the Course enrolments in the "enrols" data to list the Teachers for each course.
So, what I need to do is create class lists from the enrols data using the teachers data by c("teacher", "Term", "Course") but my class lists can only select a maximum value of students based on the number listed in teachers$tStudents. Ideally, I'd also like to select a representative distribution of students so that the new class lists have both genders, different ages and are from different Program.Short.Description.
I've tried merging in different ways in dplyr and can create full lists with all students but haven't been able to use the teachers$tStudents column to limit the number of observations to select. Is this possible?

Remove a table from a dataset [duplicate]

This question already has answers here:
Delete rows that exist in another data frame? [duplicate]
(3 answers)
Closed 4 years ago.
I am having trouble removing selected data from a data set. I have an example of the set, and I have another table of selected rows (toremove). I am trying to remove (toremove) from the original set.
I tried to use setdiff, but while some rows were cut down (according to the environment variables), it was not the selected data that was removed.
Prod1<- Prod[setdiff(rownames(Prod),rownames(toremove )),]
Example of entire dataset in dput:
Prod <- structure(list(CountryCode = c(5000L, 5300L, 5300L, 5000L, 5400L,
5300L, 5400L, 5200L, 5200L, 5200L, 5000L, 5000L), Country = structure(c(4L,
2L, 2L, 4L, 3L, 2L, 3L, 1L, 1L, 1L, 4L, 4L), .Label = c("Americas + (Total)",
"Asia + (Total)", "Europe + (Total)", "World + (Total)"), class = "factor"),
ItemCode = c(1814L, 1717L, 1817L, 116L, 1717L, 1817L, 1817L,
156L, 1717L, 1817L, 1735L, 1800L), Item = structure(c(3L,
2L, 1L, 4L, 2L, 1L, 1L, 5L, 2L, 1L, 6L, 7L), .Label = c("Cereals (Rice Milled Eqv) + (Total)",
"Cereals,Total + (Total)", "Coarse Grain, Total + (Total)",
"Potatoes", "Sugar cane", "Vegetables Primary + (Total)",
"Vegetables&Melons, Total + (Total)"), class = "factor"),
ElementGroup = c(31L, 31L, 31L, 51L, 51L, 51L, 51L, 51L,
51L, 51L, 51L, 51L), ElementCode = c(5312L, 5312L, 5312L,
5510L, 5510L, 5510L, 5510L, 5510L, 5510L, 5510L, 5510L, 5510L
), Element = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L), .Label = c("Area harvested", "Production"
), class = "factor"), Unit = structure(c(1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Ha", "tonnes"
), class = "factor"), Y1961 = c(3.29e+08, 2.72e+08, 2.72e+08,
2.71e+08, 2.64e+08, 2.63e+08, 2.63e+08, 2.36e+08, 2.28e+08,
2.24e+08, 2.23e+08, 2.23e+08), Y1962 = c(3.27e+08, 2.76e+08,
2.76e+08, 2.53e+08, 2.81e+08, 2.78e+08, 2.81e+08, 2.22e+08,
2.4e+08, 2.36e+08, 2.23e+08, 2.23e+08), Y1963 = c(3.33e+08,
2.76e+08, 2.76e+08, 2.7e+08, 2.5e+08, 2.95e+08, 2.49e+08,
2.26e+08, 2.62e+08, 2.58e+08, 2.23e+08, 2.23e+08), Y1964 = c(3.29e+08,
2.82e+08, 2.82e+08, 2.85e+08, 2.96e+08, 3.1e+08, 2.96e+08,
2.43e+08, 2.49e+08, 2.45e+08, 2.26e+08, 2.26e+08)), .Names = c("CountryCode",
"Country", "ItemCode", "Item", "ElementGroup", "ElementCode",
"Element", "Unit", "Y1961", "Y1962", "Y1963", "Y1964"), class = "data.frame", row.names = c(NA,
-12L))
Selected data to remove:
toremove <- structure(list(CountryCode = c(5000L, 5400L, 5300L, 5400L, 5200L
), Country = structure(c(4L, 3L, 2L, 3L, 1L), .Label = c("Americas + (Total)",
"Asia + (Total)", "Europe + (Total)", "World + (Total)"), class = "factor"),
ItemCode = c(116L, 1717L, 1817L, 1817L, 1717L), Item = structure(c(3L,
2L, 1L, 1L, 2L), .Label = c("Cereals (Rice Milled Eqv) + (Total)",
"Cereals,Total + (Total)", "Potatoes"), class = "factor"),
ElementGroup = c(51L, 51L, 51L, 51L, 51L), ElementCode = c(5510L,
5510L, 5510L, 5510L, 5510L), Element = structure(c(1L, 1L,
1L, 1L, 1L), .Label = "Production", class = "factor"), Unit = structure(c(1L,
1L, 1L, 1L, 1L), .Label = "tonnes", class = "factor"), Y1961 = c(2.71e+08,
2.64e+08, 2.63e+08, 2.63e+08, 2.28e+08), Y1962 = c(2.53e+08,
2.81e+08, 2.78e+08, 2.81e+08, 2.4e+08), Y1963 = c(2.7e+08,
2.5e+08, 2.95e+08, 2.49e+08, 2.62e+08), Y1964 = c(2.85e+08,
2.96e+08, 3.1e+08, 2.96e+08, 2.49e+08)), .Names = c("CountryCode",
"Country", "ItemCode", "Item", "ElementGroup", "ElementCode",
"Element", "Unit", "Y1961", "Y1962", "Y1963", "Y1964"), class = "data.frame", row.names = c(NA,
-5L))
# Answer #1 ---------------------------------------------------------------
# Match rows on their contents rather than on row names: `toremove` carries its
# own row names (1..5), so a row-name match would drop the first five rows of
# Prod instead of the rows whose values appear in `toremove`.
# row_key() builds one string per row from all columns (factors are compared by
# label, so differing level sets between the two tables do not matter).
row_key <- function(d) {
  do.call(paste, c(unname(lapply(d, as.character)), sep = "\r"))
}
AnswerinComments <- Prod[!(row_key(Prod) %in% row_key(toremove)), ]
Also found here: Delete rows that exist in another data frame?
# Answer #2 ---------------------------------------------------------------
# library() fails loudly if sqldf is missing; require() would only return FALSE.
library(sqldf)
# Return the rows of Prod that do not occur in toremove, comparing all columns.
# A "DELETE ... JOIN" statement is not valid for sqldf's default SQLite
# backend, and joining on CountryCode/ElementCode alone would also match rows
# that are not in toremove (several Prod rows share the same pair of codes).
# Note: EXCEPT also collapses duplicate rows and does not guarantee row order.
AnotherWay <- sqldf("select * from Prod except select * from toremove")
# Answer #3 ---------------------------------------------------------------
# Stack both tables and drop every row whose contents occur more than once.
# (`combined` rather than `all`, so base::all() is not masked.)
combined <- rbind(Prod, toremove)
appears_twice <- duplicated(combined, fromLast = FALSE) |
  duplicated(combined, fromLast = TRUE)
# The first nrow(Prod) rows of `combined` come from Prod; restricting to them
# keeps rows that exist only in `toremove` out of the result.
from_prod <- seq_len(nrow(combined)) <= nrow(Prod)
# Caveat: rows that are duplicated within Prod itself are dropped as well.
YetAnother <- combined[!appears_twice & from_prod, ]
The popular dplyr package has a setdiff function, too. However, it needs identical data structures - in your case: same factor levels:
## factors to character vectors if needed...
# idx <- sapply(Prod, class) == "factor"
# Prod[idx] <- sapply(Prod[idx], as.character)
# toremove[idx] <- sapply(toremove[idx], as.character)
library(dplyr)
setdiff(Prod, toremove)

Resources