Remove a table from a dataset [duplicate] - r

This question already has answers here:
Delete rows that exist in another data frame? [duplicate]
(3 answers)
Closed 4 years ago.
I am having trouble removing selected data from a data set. I have an example of the set, and I have another table of selected rows (toremove). I am trying to remove (toremove) from the original set.
I tried to use setdiff, but while some rows were cut (according to the environment variables), the rows that were removed were not the selected data.
Prod1<- Prod[setdiff(rownames(Prod),rownames(toremove )),]
Example of entire dataset in dput:
Prod <- structure(list(CountryCode = c(5000L, 5300L, 5300L, 5000L, 5400L,
5300L, 5400L, 5200L, 5200L, 5200L, 5000L, 5000L), Country = structure(c(4L,
2L, 2L, 4L, 3L, 2L, 3L, 1L, 1L, 1L, 4L, 4L), .Label = c("Americas + (Total)",
"Asia + (Total)", "Europe + (Total)", "World + (Total)"), class = "factor"),
ItemCode = c(1814L, 1717L, 1817L, 116L, 1717L, 1817L, 1817L,
156L, 1717L, 1817L, 1735L, 1800L), Item = structure(c(3L,
2L, 1L, 4L, 2L, 1L, 1L, 5L, 2L, 1L, 6L, 7L), .Label = c("Cereals (Rice Milled Eqv) + (Total)",
"Cereals,Total + (Total)", "Coarse Grain, Total + (Total)",
"Potatoes", "Sugar cane", "Vegetables Primary + (Total)",
"Vegetables&Melons, Total + (Total)"), class = "factor"),
ElementGroup = c(31L, 31L, 31L, 51L, 51L, 51L, 51L, 51L,
51L, 51L, 51L, 51L), ElementCode = c(5312L, 5312L, 5312L,
5510L, 5510L, 5510L, 5510L, 5510L, 5510L, 5510L, 5510L, 5510L
), Element = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L), .Label = c("Area harvested", "Production"
), class = "factor"), Unit = structure(c(1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Ha", "tonnes"
), class = "factor"), Y1961 = c(3.29e+08, 2.72e+08, 2.72e+08,
2.71e+08, 2.64e+08, 2.63e+08, 2.63e+08, 2.36e+08, 2.28e+08,
2.24e+08, 2.23e+08, 2.23e+08), Y1962 = c(3.27e+08, 2.76e+08,
2.76e+08, 2.53e+08, 2.81e+08, 2.78e+08, 2.81e+08, 2.22e+08,
2.4e+08, 2.36e+08, 2.23e+08, 2.23e+08), Y1963 = c(3.33e+08,
2.76e+08, 2.76e+08, 2.7e+08, 2.5e+08, 2.95e+08, 2.49e+08,
2.26e+08, 2.62e+08, 2.58e+08, 2.23e+08, 2.23e+08), Y1964 = c(3.29e+08,
2.82e+08, 2.82e+08, 2.85e+08, 2.96e+08, 3.1e+08, 2.96e+08,
2.43e+08, 2.49e+08, 2.45e+08, 2.26e+08, 2.26e+08)), .Names = c("CountryCode",
"Country", "ItemCode", "Item", "ElementGroup", "ElementCode",
"Element", "Unit", "Y1961", "Y1962", "Y1963", "Y1964"), class = "data.frame", row.names = c(NA,
-12L))
Selected data to remove:
toremove <- structure(list(CountryCode = c(5000L, 5400L, 5300L, 5400L, 5200L
), Country = structure(c(4L, 3L, 2L, 3L, 1L), .Label = c("Americas + (Total)",
"Asia + (Total)", "Europe + (Total)", "World + (Total)"), class = "factor"),
ItemCode = c(116L, 1717L, 1817L, 1817L, 1717L), Item = structure(c(3L,
2L, 1L, 1L, 2L), .Label = c("Cereals (Rice Milled Eqv) + (Total)",
"Cereals,Total + (Total)", "Potatoes"), class = "factor"),
ElementGroup = c(51L, 51L, 51L, 51L, 51L), ElementCode = c(5510L,
5510L, 5510L, 5510L, 5510L), Element = structure(c(1L, 1L,
1L, 1L, 1L), .Label = "Production", class = "factor"), Unit = structure(c(1L,
1L, 1L, 1L, 1L), .Label = "tonnes", class = "factor"), Y1961 = c(2.71e+08,
2.64e+08, 2.63e+08, 2.63e+08, 2.28e+08), Y1962 = c(2.53e+08,
2.81e+08, 2.78e+08, 2.81e+08, 2.4e+08), Y1963 = c(2.7e+08,
2.5e+08, 2.95e+08, 2.49e+08, 2.62e+08), Y1964 = c(2.85e+08,
2.96e+08, 3.1e+08, 2.96e+08, 2.49e+08)), .Names = c("CountryCode",
"Country", "ItemCode", "Item", "ElementGroup", "ElementCode",
"Element", "Unit", "Y1961", "Y1962", "Y1963", "Y1964"), class = "data.frame", row.names = c(NA,
-5L))

# Answer #1 ---------------------------------------------------------------
# Match rows by content rather than by row name: `toremove` carries its own
# automatic row names (1..5), which do not identify the corresponding rows of
# `Prod`, so a row-name filter would drop the wrong rows.
# Each row is pasted into a single key (factors contribute their labels), which
# lets %in% compare whole rows even though the factor levels differ between the
# two data frames. Assumes both frames have the same columns in the same order.
prod_keys <- do.call(paste, c(unname(as.list(Prod)), sep = "\r"))
remove_keys <- do.call(paste, c(unname(as.list(toremove)), sep = "\r"))
AnswerinComments <- Prod[!(prod_keys %in% remove_keys), ]
Also found here: Delete rows that exist in another data frame?
# Answer #2 ---------------------------------------------------------------
# sqldf is required for this answer, so load it with library(): unlike
# require(), it fails immediately if the package is missing.
library(sqldf)
# sqldf returns the result of a query; it does not modify `Prod` in place, and
# its default SQLite backend has no "DELETE ... JOIN" form. EXCEPT returns the
# rows of Prod that have no identical row in toremove.
# NOTE(review): EXCEPT also collapses exact duplicate rows of Prod and does not
# guarantee the original row order - confirm that is acceptable.
AnotherWay <- sqldf("SELECT * FROM Prod EXCEPT SELECT * FROM toremove")
# Answer #3 ---------------------------------------------------------------
# Stack both data frames, then keep only the rows that occur exactly once in
# the stacked result: a row present in both Prod and toremove is flagged as a
# duplicate from one direction or the other and is dropped.
all <- rbind(Prod, toremove)
# Printed for inspection: TRUE marks a row that repeats an earlier one.
duplicated(all)
YetAnother <- all[!(duplicated(all, fromLast = FALSE) |
  duplicated(all, fromLast = TRUE)), ]

The popular dplyr package has a setdiff function, too. However, it needs identical data structures - in your case: same factor levels:
## setdiff() on two data frames needs identical data structures. If the factor
## levels differ between Prod and toremove, uncomment the lines below to
## convert the factor columns of both frames to character vectors first.
## factors to character vectors if needed...
# idx <- sapply(Prod, class) == "factor"
# Prod[idx] <- sapply(Prod[idx], as.character)
# toremove[idx] <- sapply(toremove[idx], as.character)
library(dplyr)
## dplyr's data-frame setdiff (masks base::setdiff once dplyr is attached)
setdiff(Prod, toremove)

Related

Having issues with or_gam function in oddsratio package

Data
Here is the dput of my data:
heart <- structure(list(died = structure(c(2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L), levels = c("Survive",
"Died"), class = c("labelled", "factor"), label = "Death", format = "%8.0g", value.label.table = structure(0:1, names = c("Survive",
"Died"))), procedure = structure(c(2L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L,
1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L), levels = c("PTCA",
"CABG"), class = c("labelled", "factor"), label = "1=CABG; 0=PTCA", format = "%8.0g", value.label.table = structure(0:1, names = c("PTCA",
"CABG"))), age = structure(c(65L, 69L, 76L, 65L, 69L, 67L, 69L,
66L, 74L, 67L, 76L, 76L, 68L, 77L, 77L, 70L, 68L, 69L, 68L, 70L,
71L, 75L, 69L, 72L, 66L, 77L, 68L, 78L, 77L, 69L, 65L, 70L, 65L,
70L), label = "PATIENT AGE", class = c("labelled", "integer"), format = "%8.0g"),
gender = structure(c(2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L,
2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L), levels = c("Female",
"Male"), class = c("labelled", "factor"), label = "Gender", format = "%8.0g", value.label.table = structure(0:1, names = c("Female",
"Male"))), los = structure(c(10L, 7L, 7L, 8L, 1L, 7L, 2L,
9L, 3L, 1L, 6L, 14L, 4L, 13L, 10L, 4L, 2L, 2L, 10L, 2L, 6L,
12L, 4L, 2L, 2L, 14L, 3L, 2L, 5L, 9L, 3L, 3L, 3L, 2L), label = "LOS", class = c("labelled",
"integer"), format = "%8.0g"), type = structure(c(1L, 2L,
2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L,
2L, 1L), levels = c("Elective", "Emer/Urg"), class = c("labelled",
"factor"), label = "Severity", format = "%8.0g", value.label.table = structure(0:1, names = c("Elective",
"Emer/Urg")))), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-34L), stata.info = list(datalabel = "AZ 1991; CABG(106,107) & PTCA(112)",
version = 12L, time.stamp = "17 Feb 2015 12:45", val.labels = c("diedlb",
"pxllb", "", "sexllb", "", "typellb"), label.table = list(
pxllb = structure(0:1, names = c("PTCA", "CABG")), typellb = structure(0:1, names = c("Elective",
"Emer/Urg")), sexllb = structure(0:1, names = c("Female",
"Male")), diedlb = structure(0:1, names = c("Survive",
"Died")))))
Problem
So far I have fit multiple models like this using tensor splines to encode main and interaction effects:
#### Load Libraries ####
library(oddsratio)
library(mgcv)
library(splines)
library(tidyverse)
#### Fit GAM With Tensor Splines ####
# Binomial GAM for `died`: ti(age) and ti(los) are the main-effect smooths and
# ti(age, los) is their interaction, all fitted on the `heart` data.
fit <- gam(factor(died)
~ ti(age)
+ ti(los)
+ ti(age,los),
data = heart,
family = binomial)
However, when I try to plot the main effects:
plot_gam(model = fit,
pred = "age")
I get this odd error:
Error in plot_df[[set_pred]]$x : $ operator is invalid for atomic vectors
I tried printing the source of the plot_gam function, which looks like this, but I'm a bit fuzzy on why this code is causing the issue. Looking at gam_to_df, whatever that is, also didn't seem to clarify the problem:
# Plot one smooth term of a fitted GAM as a line with a standard-error band.
#
# Arguments:
#   model               fitted model, passed on to gam_to_df().
#   pred                name of the predictor whose smooth is plotted.
#   col_line            colour of the smooth line.
#   sm_fun_size         size of the smooth line.
#   ci_line_col, ci_line_type, ci_line_size
#                       colour, line type and size of the band's border lines.
#   ci_fill, ci_alpha   fill colour and transparency of the band.
#   title, xlab, ylab   optional plot title and axis labels.
#   limits_y, breaks_y  optional y-axis limits and breaks.
#
# Returns the ggplot object.
function (model = NULL, pred = NULL, col_line = "blue", ci_line_col = "black",
    ci_line_type = "dashed", ci_fill = "grey", ci_alpha = 0.4,
    ci_line_size = 0.8, sm_fun_size = 1.1, title = NULL, xlab = NULL,
    ylab = NULL, limits_y = NULL, breaks_y = NULL)
{
  df <- gam_to_df(model, pred)

  # NOTE(review): the default labels are looked up under `df[[pred]]`. If `df`
  # is a data frame with columns x, y, se_upr and se_lwr (as gam_to_df() in this
  # file builds it), `df[[pred]]` is NULL for a typical predictor name and both
  # defaults resolve to NULL - confirm where the labels are meant to come from.
  if (is.null(xlab)) {
    xlab <- df[[pred]]$xlab
  }
  if (is.null(ylab)) {
    ylab <- df[[pred]]$ylab
  }

  # Smooth line, upper and lower band borders, then the filled band.
  plot_gam <- ggplot(df, aes_(~x, ~y)) +
    geom_line(colour = col_line, size = sm_fun_size) +
    geom_line(aes_(~x, ~se_upr), linetype = ci_line_type,
      colour = ci_line_col, size = ci_line_size) +
    geom_line(aes_(~x, ~se_lwr), linetype = ci_line_type,
      colour = ci_line_col, size = ci_line_size) +
    geom_ribbon(aes_(x = ~x, ymin = ~se_lwr, ymax = ~se_upr),
      fill = ci_fill, alpha = ci_alpha) +
    ylab(ylab) + xlab(xlab)

  # Scalar conditions: use && (short-circuit) rather than elementwise &.
  has_limits <- !is.null(limits_y)
  has_breaks <- !is.null(breaks_y)
  if (has_limits && has_breaks) {
    plot_gam <- plot_gam + scale_y_continuous(breaks = c(breaks_y),
      limits = c(limits_y))
  } else if (has_limits) {
    plot_gam <- plot_gam + scale_y_continuous(limits = limits_y)
  } else if (has_breaks) {
    plot_gam <- plot_gam + scale_y_continuous(breaks = c(breaks_y))
  }

  if (!is.null(title)) {
    plot_gam <- plot_gam + ggtitle(title)
  }
  plot_gam
}
I would greatly appreciate a resolution to this issue. The plot_gam function is super useful for what I'm trying to achieve.
I inspected the code. The error is caused by a fairly naïve implementation of the oddsratio::gam_to_df function.
# Build a data frame with the plotting data of one smooth term of a fitted
# model: columns x, y (the fit) and se_upr / se_lwr (fit plus / minus se).
#
# model: fitted model, forwarded to no_plot().
# pred:  name of the predictor to look up in the plot data.
gam_to_df <- function (model = NULL, pred = NULL)
{
# List returned by plot(model, pages = 1), obtained via no_plot().
plot_df <- no_plot(model)
# Indices of the list elements whose text contains `pred` as a whole word.
# NOTE(review): more than one element can match (e.g. a main-effect term and
# an interaction term that both involve `pred`). `[[` with an index vector of
# length > 1 indexes recursively instead of picking one element, so the `$`
# calls below then fail with "$ operator is invalid for atomic vectors".
set_pred <- grep(paste0("\\b", pred, "\\b"), plot_df)
# Upper / lower band = fit +/- se; y is the fit itself.
df <- data.frame(x = plot_df[[set_pred]]$x, se_upr = plot_df[[set_pred]]$fit +
plot_df[[set_pred]]$se, se_lwr = plot_df[[set_pred]]$fit -
plot_df[[set_pred]]$se, y = plot_df[[set_pred]]$fit)
return(df)
}
# Run plot() on `model` without displaying or saving any graphics and return
# the value of plot() invisibly.
#
# model: object handed to plot(model, pages = 1).
#
# The plot is drawn on a null PDF device (file = NULL creates no file), so
# nothing is written to the working directory. The device is closed through
# on.exit(), which also runs when plot() signals an error, so no graphics
# device is left open.
no_plot <- function (model = NULL)
{
  grDevices::pdf(file = NULL)
  # Remember which device was just opened so exactly that one gets closed.
  null_dev <- grDevices::dev.cur()
  on.exit(grDevices::dev.off(null_dev), add = TRUE)
  plot_df <- plot(model, pages = 1)
  invisible(plot_df)
}
The function first creates a plot and returns its data (a list), then looks for the element that contains "age" somewhere in its contents, searching with grep. Since two of the three elements contain "age", grep returns a vector of two indices, c(1, 3). Thus plot_df[[set_pred]], instead of selecting a single element, selects the first element of the list and then the third element inside it, which is a vector.
Hence the error "operator is invalid for atomic vectors"
Appears to be a bug.
EDIT
Filed an issue at https://github.com/pat-s/oddsratio/issues/54

How can I easily add one colour to each bar and sort the bars in descending order? [duplicate]

This question already has answers here:
Reorder bars in geom_bar ggplot2 by value
(3 answers)
Change bar plot colour in geom_bar with ggplot2 in r
(2 answers)
Closed last year.
How can I easily add one color to each bar and sort the bars in descending order?
QG4 %>%
filter(value=="Yes") %>%
ggplot(aes(y=Freq, x=variable))+
geom_bar(position = "dodge", stat = "identity")+
theme_bw()+
coord_flip()+
labs(x="Mode", y=NULL, title = "What is your usual (or most frequently used) mode of travel to work/place of study?")
I used dput(QG4) to avoid using a picture of the dataset:
structure(list(variable = structure(c(1L, 2L, 3L, 4L, 5L, 6L,
7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L), .Label = c("Bicycle",
"Bicycle (Yélo)", "Bus", "Car", "Car (Yélo)", "Carpool", "Motorcycle/scooter",
"On foot", "Scooter (trottinette)", "Train"), class = "factor"),
value = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("No",
"Yes"), class = "factor"), Freq = c(1634L, 2143L, 1781L,
1532L, 2281L, 2202L, 2267L, 1331L, 2265L, 2172L, 655L, 146L,
508L, 757L, 8L, 87L, 22L, 958L, 24L, 117L)), class = "data.frame", row.names = c(NA,
-20L))
enter image description here

Select observations in R based on maximum number listed in a column

I hope I've done this correctly! I have two data frames:
teachers = structure(list(Teacher = c(123L, 123L, 123L, 123L, 124L),
tStudents = c(3L, 3L, 4L, 3L, 4L), Term = c(1801L, 1802L, 1801L, 1803L, 1802L),
Course = structure(c(5L, 6L, 7L, 6L, 8L), .Label = c("ENGG",
"ENGG2", "LITT", "LITT2", "MATH", "MATH2", "PHYS", "SCIE"
), class = "factor")), .Names = c("Teacher", "tStudents", "Term", "Course"), row.names = c(NA, 5L), class = "data.frame")
enrols = structure(list(UniqueStudent = structure(c(3L, 2L, 1L, 5L, 4L),
.Label = c("1801-ENGG-N1-abcd1#abc.edu.au", "1801-MATH-C1-abcd1#abc.edu.au","1801-PHYS-L1-abcd1#abc.edu.au", "1802-MATH2-G1-abcd1#abc.edu.au", "1802-SCIE-K2-abcd1#abc.edu.au"), class = "factor"), Term = c(1801L,1801L, 1801L, 1802L, 1802L), Student.Email.Addresses = structure(c(1L, 1L, 1L, 1L, 1L), .Label = "abcd1#abc.edu.au", class = "factor"), ID = structure(c(1L, 1L, 1L, 1L, 1L), .Label = "s12344", class = "factor"),
Gender.Description = structure(c(1L, 1L, 1L, 1L, 1L), .Label = "M", class = "factor"),
Age = c(12L, 12L, 12L, 12L, 12L), Program.Short.Description = structure(c(1L,
1L, 1L, 1L, 1L), .Label = "LSC1", class = "factor"), Term.CC.CN = structure(c(3L,
2L, 1L, 5L, 4L), .Label = c("1801-ENGG-N1", "1801-MATH-C1",
"1801-PHYS-L1", "1802-MATH2-G1", "1802-SCIE-K2"), class = "factor"),
Course.Code = structure(c(4L, 2L, 1L, 5L, 3L), .Label = c("ENGG",
"MATH", "MATH2", "PHYS", "SCIE"), class = "factor"), Class.Number = structure(c(4L,
1L, 5L, 3L, 2L), .Label = c("C1", "G1", "K2", "L1", "N1"), class = "factor"),
Teacher = c(123L, 123L, 125L, 124L, 123L)), .Names = c("UniqueStudent", "Term", "Student.Email.Addresses", "ID", "Gender.Description", "Age", "Program.Short.Description", "Term.CC.CN", "Course.Code", "Class.Number", "Teacher"), row.names = c(NA, 5L), class = "data.frame")
teachers$tStudents lists the maximum number of students allowed to be allocated to a teacher per Term and Course. I've also pre-merged the Course enrolments in the "enrols" data to list the Teachers for each course.
So, what I need to do is create class lists from the enrols data using the teachers data by c("teacher", "Term", "Course") but my class lists can only select a maximum value of students based on the number listed in teachers$tStudents. Ideally, I'd also like to select a representative distribution of students so that the new class lists have both genders, different ages and are from different Program.Short.Description.
I've tried merging in different ways in dplyr and can create full lists with all students but haven't been able to use the teachers$tStudents column to limit the number of observations to select. Is this possible?

1 bar in wrong order in stacked bar chart (R ggplot2)

I made a stacked bar chart, however the segments of the 3rd bar are in a different order; the segment that should be on the bottom is now on the top, see image:
When I change the order of the x axis, it does exactly the same with the 3rd bar. How can I solve this? Something wrong with the code?
library('ggplot2')
bar <- ggplot(data.location, aes(Location, value, fill=variable))
bar + stat_summary(fun.y=mean, geom="bar", position="stack")+labs(x="Location", y="value", fill="variable")
Added: output from dput(data.location)
dput(data.location[data.location$Location %in% c('BRM', 'CG', 'DDO'),])
structure(list(Location = structure(c(2L, 3L, 5L, 2L, 3L, 5L,
2L, 3L, 5L, 2L, 3L, 5L), .Label = c("BA", "BRM", "CG", "CH",
"DDO", "DR", "FB", "GG", "GI", "GQS", "HC", "HS", "LL1", "LL2",
"MOW", "PP", "TP", "TR", "TRD", "WB"), class = "factor"), Zone = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), variable = structure(c(1L,
1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L), .Label = c("A", "B", "C", "D"
), class = "factor"), value = c(425.810194245673, 815.265455416096,
735.274721619422, 997.041922511793, 2147.03610300279, 1210.08829970945,
0, 177.692085889937, 173.266014794846, 406.498315338813, 1293.35105648741,
234.022025228589)), .Names = c("Location", "Zone", "variable",
"value"), row.names = c(2L, 3L, 5L, 22L, 23L, 25L, 42L, 43L,
45L, 62L, 63L, 65L), class = "data.frame")​
Using the following code, I get consistent order of the bars.
ggplot(data.location,aes(Location,value,fill=variable,order=variable))+
geom_bar(stat="identity")

How to find the mean of a continuous variable for each categorical variable

I'd like to plot continuous BMI on the y-axis and the categorical variable for Family Income on the x-axis and I'd like the graph to plot the mean BMI for each category. However, I am not sure how to find the mean BMI for each factor of Family Income.
Dataset nh (5994 total IDs with Observations) (Parts of the 2009-2010 NHANES Dataset)
> dput(head(nh))
structure(list(SeqN = c(51624L, 51628L, 51629L, 51630L, 51633L, 51635L),
Gender = c(1L, 2L, 1L, 2L, 1L, 1L), Age = c(34L, 60L, 26L, 49L, 80L, 80L),
Ethnicity = c(3L, 4L, 1L, 3L, 3L, 3L), FamSize = c(4L, 2L, 5L, 3L, 2L, 1L),
RatioIncomePoverty = c(1.36, 0.69, 1.01, 1.91, 1.27, 1.69),
MECWgt2 = c(81528.77201, 21000.33872, 22633.58187, 74112.48684, 12381.11532, 22502.50666),
BMI = c(32.22, 42.39, 32.61, 30.57, 26.04, 27.62),
LengthUS = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,NA_integer_),
Education = c(3L, 3L, 2L, 4L, 4L, 2L), LocationBorn = c(1L, 1L, 1L, 1L, 1L, 1L),
FamIncome = c(6L, 3L, 6L, 7L, 4L, 4L)), .Names = c("SeqN",
"Gender", "Age", "Ethnicity", "FamSize", "RatioIncomePoverty",
"MECWgt2", "BMI", "LengthUS", "Education", "LocationBorn", "FamIncome"),
row.names = c(NA, 6L), class = "data.frame")
faminc <- as.character(nhanes$FamIncome)
faminc
Any suggestions as to how to model the data to achieve this goal would be appreciated.
Here's a base solution using aggregate:
# Mean BMI for each FamIncome category; the result has one row per category
# with columns FamIncome and BMI.
a <- aggregate(BMI ~ FamIncome, data=nh, FUN=mean)
# One bar per category, labelled with the FamIncome code.
barplot(a$BMI, names.arg=a$FamIncome)
This may work:
library(plyr)
# Column names are case-sensitive: the data frame has `FamIncome` and `BMI`,
# not `famIncome` / `bmi`. na.rm = TRUE keeps a missing BMI value from turning
# a whole category mean into NA.
nhh <- ddply(nh, .(FamIncome), summarise,
  mean.bmi = mean(BMI, na.rm = TRUE)) # find mean bmi
with(nhh, plot(FamIncome, mean.bmi)) # simple plot

Resources