Why are there NA values produced when using pivot_wider? - r

I'm trying to use pivot_wider to create multiple columns/variables containing values, but I get NAs in columns where I shouldn't.
Here is a representative sample of the data:
df <- structure(list(Condition = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Control", "Retraction1",
"Retraction2"), class = "factor"), First = structure(c(2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Journalist",
"Police", "Reviewer", "Spokesperson"), class = "factor"), Second = structure(c(3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("Journalist",
"Police", "Reviewer", "Spokesperson"), class = "factor"), Third = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Journalist",
"Police", "Reviewer", "Spokesperson"), class = "factor"), Fourth = structure(c(4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L), .Label = c("Journalist",
"Police", "Reviewer", "Spokesperson"), class = "factor"), ID = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24",
"25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35",
"36", "37", "38", "39", "40", "41", "42", "43", "44", "45", "46",
"47", "48", "49", "50", "51", "52", "53", "54", "55", "56", "57",
"58", "59", "60", "61", "62", "63", "64", "65", "66", "67", "68",
"69", "70", "71", "72", "73", "74", "75", "76", "77", "78", "79",
"80", "81", "82", "83", "84", "85", "86", "87", "88", "89", "90",
"91", "92", "93", "94", "95", "96", "97", "98", "99", "100",
"101"), class = "factor"), Scenario = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 1L, 2L, 3L, 4L), .Label = c("J", "P", "R",
"S"), class = "factor"), Estimate = structure(c(4L, 8L, 7L, 11L,
9L, 12L, 10L, 2L, 5L, 6L, 4L, 7L, 11L, 9L, 12L, 10L, 2L, 3L,
5L, 6L, 4L, 8L, 7L, 11L, 9L, 12L, 10L, 2L, 5L, 6L, 4L, 8L, 7L,
11L, 9L, 12L, 10L, 2L, 5L, 6L, 1L, 1L, 1L, 1L), .Label = c("CompMean",
"P.H.Reps.", "P.H.Reps..1", "P.Rel.", "P.Rel1.Reps.", "P.Rel2.Reps.",
"P.Rep1.nH.nRel.", "P.Rep1.nH.Rel.", "P.Rep2.nH.nRel.nRep1.",
"P.Rep2.nH.nRel.Rep1.", "P.Rep2.nH.Rel.nRep1.", "P.Rep2.nH.Rel.Rep1."
), class = "factor"), value = c(90L, 8L, 82L, 11L, 82L, 11L,
82L, 100L, 99L, NA, 62L, 11L, 91L, 12L, 91L, 5L, 82L, 91L, 80L,
NA, 92L, 12L, 61L, 18L, 90L, 21L, 81L, 96L, 92L, NA, 91L, 10L,
72L, 22L, 62L, 21L, 73L, 99L, 98L, NA, 7L, 7L, 7L, 7L)), row.names = c(NA,
-44L), class = c("tbl_df", "tbl", "data.frame"))
head(df)
This is data from one subject. There should only be NAs in the P.Rel2.Reps. column and in no other column.
However, there are NAs in some of the other columns when I use pivot wider like so:
pivot_wider(df, names_from = Estimate, values_from = value)
Here is an example of how the data look after pivoting wider.
df2 <- structure(list(Condition = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = c("Control", "Retraction1", "Retraction2"
), class = "factor"), First = structure(c(2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L), .Label = c("Journalist", "Police", "Reviewer",
"Spokesperson"), class = "factor"), Second = structure(c(3L,
3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L), .Label = c("Journalist",
"Police", "Reviewer", "Spokesperson"), class = "factor"), Third = structure(c(1L,
1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("Journalist",
"Police", "Reviewer", "Spokesperson"), class = "factor"), Fourth = structure(c(4L,
4L, 4L, 4L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Journalist",
"Police", "Reviewer", "Spokesperson"), class = "factor"), ID = structure(c(1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L), .Label = c("1", "2", "3",
"4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15",
"16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26",
"27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37",
"38", "39", "40", "41", "42", "43", "44", "45", "46", "47", "48",
"49", "50", "51", "52", "53", "54", "55", "56", "57", "58", "59",
"60", "61", "62", "63", "64", "65", "66", "67", "68", "69", "70",
"71", "72", "73", "74", "75", "76", "77", "78", "79", "80", "81",
"82", "83", "84", "85", "86", "87", "88", "89", "90", "91", "92",
"93", "94", "95", "96", "97", "98", "99", "100", "101"), class = "factor"),
Scenario = structure(c(1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L,
2L), .Label = c("J", "P", "R", "S"), class = "factor"), P.Rel. = c(90L,
62L, 92L, 91L, 57L, 81L, 71L, 80L, 40L, 75L), P.Rep1.nH.Rel. = c(8L,
NA, 12L, 10L, 31L, NA, 19L, 17L, 25L, NA), P.Rep1.nH.nRel. = c(82L,
11L, 61L, 72L, 89L, 15L, 79L, 84L, 76L, 25L), P.Rep2.nH.Rel.nRep1. = c(11L,
91L, 18L, 22L, 35L, 64L, 30L, 22L, 25L, 50L), P.Rep2.nH.nRel.nRep1. = c(82L,
12L, 90L, 62L, 62L, 13L, 45L, 53L, 25L, 50L), P.Rep2.nH.Rel.Rep1. = c(11L,
91L, 21L, 21L, 15L, 52L, 9L, 10L, 100L, 50L), P.Rep2.nH.nRel.Rep1. = c(82L,
5L, 81L, 73L, 67L, 22L, 60L, 61L, 100L, 25L), P.H.Reps. = c(100L,
82L, 96L, 99L, 81L, 40L, 71L, 76L, 75L, 90L), P.Rel1.Reps. = c(99L,
80L, 92L, 98L, 81L, 80L, 89L, 79L, 75L, 76L), P.Rel2.Reps. = c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_), P.H.Reps..1 = c(NA,
91L, NA, NA, NA, 80L, NA, NA, NA, 100L), CompMean = c(7L,
7L, 7L, 7L, 7L, 7L, 7L, 6L, 4L, 7L)), row.names = c(NA, -10L
), class = c("tbl_df", "tbl", "data.frame"))
head(df2)
I have seen there is a similar post on this topic but it doesn't answer why NAs are being produced in my situation.
Do I need to add some other argument?

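Looking at the data, it looks like one entry is mislabeled: the single row with Estimate "P.H.Reps..1" (in Scenario "P") appears to be the "P.Rep1.nH.Rel." value that is otherwise missing for that scenario. You can correct it with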
df$Estimate <- replace(df$Estimate, df$Estimate == "P.H.Reps..1", "P.Rep1.nH.Rel.")
and then use pivot_wider, which will give you NAs in only one column, i.e. P.Rel2.Reps.
tidyr::pivot_wider(df, names_from = Estimate, values_from = value)

NA values will result for any combination of categories for the new pivoted columns that aren't present in the original long data frame. For example, let's look at the rows of the long data frame with Estimate=="P.Rep1.nH.Rel.":
df %>% filter(Estimate=="P.Rep1.nH.Rel.")
Condition First Second Third Fourth ID Scenario Estimate value
1 Control Police Reviewer Journalist Spokesperson 1 J P.Rep1.nH.Rel. 8
2 Control Police Reviewer Journalist Spokesperson 1 R P.Rep1.nH.Rel. 12
3 Control Police Reviewer Journalist Spokesperson 1 S P.Rep1.nH.Rel. 10
Now look at the results of pivot_wider (I've kept only the relevant columns for brevity). Note in the output below that there's a missing value in the P.Rep1.nH.Rel. column. The missing value occurs when Scenario=="P" because the long data frame doesn't have a row for P.Rep1.nH.Rel. with Scenario=="P", resulting in a missing value in the wide data frame. Missing values are occurring in the P.H.Reps..1 column for a similar reason, as there's only one row with Estimate=="P.H.Reps..1" in the long data frame and it has Scenario=="P". Thus, the values are missing for the other three scenarios.
pivot_wider(df, names_from = Estimate, values_from = value) %>%
select(Condition:Scenario, P.Rep1.nH.Rel., P.H.Reps..1)
Condition First Second Third Fourth ID Scenario P.Rep1.nH.Rel. P.H.Reps..1
1 Control Police Reviewer Journalist Spokesperson 1 J 8 NA
2 Control Police Reviewer Journalist Spokesperson 1 P NA 91
3 Control Police Reviewer Journalist Spokesperson 1 R 12 NA
4 Control Police Reviewer Journalist Spokesperson 1 S 10 NA
This may be a data error, as suggested by @RonakShah, but if the data are correct then the NA values will naturally result when pivoting to wide format. You can fill the missing values with some other value by adding the argument values_fill=list(value=0) to pivot_wider (you can of course use any fill value you wish; I've just used 0 for illustration). Note that even if you use the values_fill argument, explicit missing values in the original long data will still be preserved in the wide data frame. Only missing values that result from the pivoting operation will be filled with a different value.

Related

Why are these means different when computed by dplyr mutate vs summarize in group_by?

My dataframe contains:
a column deceased on which I compute aggregated means later on (mortality ratios, by gender)
a weighting variable n.group
a categorical sex (1: female, 2: male)
I don't understand why the means and weighted-means m.mortf, w.mortf are wrong when calculated below in one single mutate/summarize expression.
Dataframe:
red11 <- structure(list(hosptg = structure(c(3L, 3L, 1L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 1L, 3L, 3L, 3L, 3L, 2L, 3L, 3L, 3L, 3L, 1L, 1L, 3L,
3L, 3L, 3L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 3L, 3L,
3L, 3L, 2L, 3L, 1L, 3L, 1L, 3L, 3L, 1L, 3L, 3L, 3L, 3L, 3L, 1L,
3L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("1",
"2", "3"), class = "factor"), quarter.adm = structure(c(4L, 11L,
3L, 12L, 7L, 8L, 12L, 9L, 1L, 11L, 7L, 1L, 2L, 2L, 10L, 10L,
8L, 11L, 6L, 1L, 4L, 6L, 10L, 10L, 6L, 11L, 11L, 7L, 3L, 6L,
10L, 12L, 7L, 6L, 6L, 3L, 6L, 12L, 4L, 4L, 12L, 1L, 6L, 5L, 11L,
9L, 4L, 4L, 3L, 10L, 4L, 8L, 10L, 3L, 7L, 1L, 12L, 5L, 4L, 6L,
6L, 3L, 9L, 7L, 8L, 3L, 7L, 8L, 7L, 6L, 5L, 11L, 9L, 11L, 1L,
4L, 6L, 5L, 5L, 6L, 5L, 5L, 11L, 3L, 4L, 12L, 12L, 1L, 9L, 9L,
6L, 9L, 1L, 4L, 8L, 1L, 5L, 2L, 9L, 11L), .Label = c("2011Q1",
"2011Q2", "2011Q3", "2011Q4", "2012Q1", "2012Q2", "2012Q3", "2012Q4",
"2013Q1", "2013Q2", "2013Q3", "2013Q4"), class = "factor"), g.mdc = c("08",
"05", "09", "08", "14", "15", "15", "11", "09", "01", "08", "11",
"16", "14", "08", "06", "08", "06", "06", "08", "15", "14", "14",
"08", "11", "09", "08", "08", "06", "06", "06", "08", "03", "05",
"05", "15", "02", "05", "08", "04", "04", "10", "06", "01", "08",
"05", "03", "06", "01", "01", "06", "08", "08", "04", "12", "05",
"01", "15", "08", "01", "08", "01", "05", "15", "15", "01", "06",
"15", "01", "08", "01", "05", "08", "02", "15", "03", "06", "05",
"05", "03", "09", "08", "11", "12", "06", "04", "08", "01", "06",
"01", "08", "06", "15", "05", "08", "07", "08", "13", "08", "08"
), sex = structure(c(2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L,
2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L,
2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L,
2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L,
1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L), .Label = c("0", "1"), class = "factor"),
age = c(23L, 83L, 51L, 54L, 37L, 0L, 0L, 82L, 45L, 88L, 84L,
58L, 41L, 33L, 71L, 79L, 67L, 42L, 73L, 66L, 0L, 26L, 38L,
65L, 31L, 87L, 38L, 38L, 77L, 44L, 54L, 74L, 38L, 70L, 44L,
0L, 78L, 65L, 56L, 85L, 70L, 83L, 89L, 46L, 39L, 34L, 5L,
85L, 18L, 5L, 41L, 73L, 18L, 41L, 75L, 77L, 36L, 0L, 84L,
83L, 58L, 93L, 83L, 0L, 0L, 2L, 49L, 0L, 55L, 46L, 40L, 81L,
60L, 51L, 0L, 22L, 78L, 69L, 75L, 65L, 31L, 15L, 79L, 87L,
72L, 78L, 48L, 16L, 81L, 63L, 84L, 17L, 0L, 60L, 60L, 74L,
44L, 44L, 53L, 71L), deceased = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L), .Label = c("0", "1"), class = "factor"),
n.group = c(3L, 2L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 1L, 2L, 1L,
1L, 1L, 3L, 2L, 3L, 1L, 3L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L,
2L, 1L, 3L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 3L, 1L, 2L, 1L,
1L, 2L, 2L, 2L, 1L, 3L, 3L, 1L, 3L, 3L, 3L, 1L, 1L, 1L, 1L,
1L, 3L, 1L, 3L, 3L, 2L, 1L, 3L, 3L, 1L, 3L, 1L, 3L, 2L, 2L,
2L, 1L, 2L, 1L, 3L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 3L, 2L, 1L,
1L, 1L, 1L, 3L, 1L, 2L, 1L, 3L, 1L, 2L, 1L, 2L, 2L)), class = c("tbl_df",
"tbl", "data.frame"), .Names = c("hosptg", "quarter.adm", "g.mdc",
"sex", "age", "deceased", "n.group"), row.names = c(NA, -100L
))
Grouping - using mutate:
# Group-level statistics attached to every original row: mutate() keeps all
# rows of red11 and repeats each group's value on each row of that group.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
red111 <- red11 %>%
  group_by(hosptg, quarter.adm, g.mdc) %>%
  mutate(
    n = n(),                                        # rows in the current group
    female = mean(sex == '1', na.rm = TRUE),        # share of rows with sex == '1'
    age = mean(age, na.rm = TRUE),                  # overwrites age with the group mean
    m.mortf = mean(deceased == '1', na.rm = TRUE),  # unweighted share with deceased == '1'
    w.mortf = weighted.mean(deceased == '1', n.group, na.rm = TRUE)  # same share, weighted by n.group
  )
Grouping - using summarize (i.e. aggregation):
# Aggregated version: summarize() returns one row per
# (hosptg, quarter.adm, g.mdc) combination instead of one row per input row.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
red211 <- red11 %>%
  group_by(hosptg, quarter.adm, g.mdc) %>%
  summarize(
    n = n(),                                        # rows in the current group
    female = mean(sex == '1', na.rm = TRUE),        # share of rows with sex == '1'
    age = mean(age, na.rm = TRUE),                  # group mean age
    m.mortf = mean(deceased == '1', na.rm = TRUE),  # unweighted share with deceased == '1'
    w.mortf = weighted.mean(deceased == '1', n.group, na.rm = TRUE)  # same share, weighted by n.group
  )
I would have expected the ratio to be the same and, foremost, the initial mean to be kept. I understand what the aggregation does (this is also illustrated by sum(redxx$n)), but I struggle to comprehend the full background.
Initial data frame mean:
mean(red11$deceased == 1, na.rm=T) [1] 0.02
Mutate mean and sum:
sum(red111$n) [1] 170
> mean(red111$female) [1] 0.52
> mean(red111$w.mortf) [1] 0.02
> mean(red111$m.mortf) [1] 0.02
Summarized mean and sum:
sum(red211$n) [1] 100
mean(red211$female) [1] 0.4977169
mean(red211$w.mortf) [1] 0.02739726
mean(red211$m.mortf) [1] 0.02739726
What I would like to have is an aggregated data frame (i.e. reduced number of lines) maintaining the initial mean throughout. And, why does the weighting variable not compensate for it?
EDIT:
My basic intention is that I am using a big data file where I have single entries where a case may be deceased. Then I calculate mortality ratios. But this can logically only be done at an aggregated level. That is why I create a data frame like red211. Thereafter I base my regression models on it. But then again, the means are based on that second data frame and not on the original values. Thus my results are distorted in size. That is why I am "desperately" looking for a solution that will get me closer to my original mean values. I hope this helps.
The model I use is a straight forward difference in difference:
lm(w.mortf ~ treatment * year, data = red)
where: treatment is the treatment group / year the intervention year / red the aggregated data frame
===========================================================
w.mortf m.mortf
-----------------------------------------------------------
(Intercept) 0.037 (0.001) *** 0.037 (0.001) ***
year 0.003 (0.001) * 0.003 (0.001) *
tg1 -0.003 (0.001) * -0.003 (0.001) *
year:tg1 -0.001 (0.002) -0.001 (0.002)
-----------------------------------------------------------
Adj. R^2 0.000 0.000
Num. obs. 126031 126031
RMSE 0.172 0.179
===========================================================
The original data frame mean is approx. 0.018 - thus I think the estimates are too far away from it to be interpretable - or where am I being misled?
The figure below illustrates the issue. Where 2012Q1 should be the reference value findable based on the above regression.
You have to apply the weighting after aggregation:
# Aggregate to one row per (hosptg, quarter.adm, g.mdc) and keep the group
# size n, so that the group means can be weighted by n after aggregation.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
red311 <- red11 %>%
  group_by(hosptg, quarter.adm, g.mdc) %>%
  summarize(
    n = n(),                                       # rows in the current group
    female = mean(sex == '1', na.rm = TRUE),       # share of rows with sex == '1'
    age = mean(age, na.rm = TRUE),                 # group mean age
    m.mortf = mean(deceased == '1', na.rm = TRUE)  # share of rows with deceased == '1'
  )
weighted.mean(red311$female, red311$n)
#> [1] 0.52
weighted.mean(red311$m.mortf, red311$n)
#> [1] 0.02
Edit: If the (unweighted) averages in red311 were to correspond to the averages in red11, then the values in red311 would be pretty meaningless. One can see this by going through the math or from a simple example:
suppressPackageStartupMessages(library(dplyr))
df <- data.frame(key = c('a', 'b', 'b', 'b'), value = 1:4, stringsAsFactors = FALSE)
df
#> key value
#> 1 a 1
#> 2 b 2
#> 3 b 3
#> 4 b 4
mean(df$value)
#> [1] 2.5
# Collapse df to one row per key (n = group size, value = group mean), then
# add a re-weighted column whose plain mean equals the mean of the original
# values (2.5 in this example).
df1 <- df %>%
group_by(key) %>%
summarize(n = n(), value = mean(value)) %>%
ungroup() %>%
# After ungroup(), n() is the number of rows of the summarised table (the
# number of groups) and sum(n) is the total of the group sizes, so each group
# mean is scaled by group size * number of groups / total of group sizes.
mutate(weighted = value * n * n() / sum(n))
df1
#> # A tibble: 2 x 4
#> key n value weighted
#> <chr> <int> <dbl> <dbl>
#> 1 a 1 1.00 0.500
#> 2 b 3 3.00 4.50
mean(df1$value)
#> [1] 2
mean(df1$weighted)
#> [1] 2.5
weighted.mean(df1$value, df1$n)
#> [1] 2.5
So while it is possible to introduce the weighted column with average equal to the original average, the values in there are pretty meaningless from my point of view.
Edit 2: The re-weighting schema used above is general and can also be applied to the original data:
# Aggregate to one row per (hosptg, quarter.adm, g.mdc), then re-weight the
# group mortality share so that the plain mean of w.mortf is the n-weighted
# mean of m.mortf.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
red411 <- red11 %>%
  group_by(hosptg, quarter.adm, g.mdc) %>%
  summarize(
    n = n(),                                       # rows in the current group
    female = mean(sex == '1', na.rm = TRUE),       # share of rows with sex == '1'
    age = mean(age, na.rm = TRUE),                 # group mean age
    m.mortf = mean(deceased == '1', na.rm = TRUE)  # share of rows with deceased == '1'
  ) %>%
  ungroup() %>%
  # After ungroup(), n() is the number of rows of the aggregated table and
  # sum(n) is the total of the group sizes.
  mutate(w.mortf = m.mortf * n * n() / sum(n))
mean(red411$w.mortf)
#> [1] 0.02
However, I am unsure how to interpret w.mortf.

Separating ggplot using rectangles in the background

Here is an image of my plot so far. At the end of the post I provide the code to reproduce it.
For the time being I use horizontal lines to separate the four groups of lines (defined by the variable de in the dataframe). But I would like to use colored rectangles in the background of each group. See the following image to get an idea.
I tried geom_rect and geom_tile with no success. Could anybody help me?
mdfr<-structure(list(name = structure(c(13L, 13L, 13L, 14L, 14L, 14L,
1L, 1L, 1L, 10L, 10L, 10L, 7L, 7L, 7L, 2L, 2L, 2L, 15L, 15L,
15L, 8L, 8L, 8L, 11L, 11L, 11L, 16L, 16L, 16L, 4L, 4L, 4L, 12L,
12L, 12L, 9L, 9L, 9L, 17L, 17L, 17L, 5L, 5L, 5L, 6L, 6L, 6L,
3L, 3L, 3L, 13L, 13L, 13L, 14L, 14L, 14L, 1L, 1L, 1L, 10L, 10L,
10L, 7L, 7L, 7L, 2L, 2L, 2L, 15L, 15L, 15L, 8L, 8L, 8L, 11L,
11L, 11L, 16L, 16L, 16L, 4L, 4L, 4L, 12L, 12L, 12L, 9L, 9L, 9L,
17L, 17L, 17L, 5L, 5L, 5L, 6L, 6L, 6L, 3L, 3L, 3L, 13L, 13L,
14L, 14L, 1L, 1L, 10L, 10L, 7L, 7L, 2L, 2L, 15L, 15L, 8L, 8L,
11L, 11L, 16L, 16L, 4L, 4L, 12L, 12L, 9L, 9L, 17L, 17L, 5L, 5L,
6L, 6L, 3L, 3L), .Label = c("10012/06", "541/13", "700-1/15",
"700/13", "737/13", "751/15", "512/12", "579/13", "715/14", "458/07",
"635/13", "705/13, \n705-1/15", "10004/07", "10005/07", "563/09",
"698/16", "717/14"), class = "factor"), Contr.finish = structure(c(1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L,
3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L,
1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L,
3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L,
1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L,
2L, 3L, 1L, 2L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L), .Label = c("Initial", "Current",
"Forecast", "Cost"), class = "factor"), variable = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("start_date", "end_date"
), class = "factor"), value = c("2007-05-30", "2009-03-30", "2016-06-29",
"2007-09-05", "2010-03-05", "2017-09-30", "2006-09-26", "2008-09-26",
"2015-08-31", "2007-11-20", "2011-11-20", "2014-03-20", "2012-01-31",
"2014-07-31", "2016-03-20", "2013-06-21", "2016-06-21", "2016-06-21",
"2009-04-15", "2011-04-15", "2017-12-31", "2013-06-21", "2016-06-21",
"2016-06-21", "2013-12-18", "2016-08-18", "2017-08-18", "2016-04-14",
"2018-02-14", "2018-02-14", "2013-06-03", "2014-10-03", "2016-05-10",
"2013-08-07", "2015-02-07", "2016-06-30", "2014-09-11", "2016-09-11",
"2016-09-11", "2014-09-26", "2016-09-26", "2016-09-26", "2013-03-20",
"2016-03-20", "2016-03-20", "2015-10-09", "2016-08-09", "2016-08-09",
"2015-11-10", "2016-05-10", "2016-05-10", "2009-03-30", "2016-06-29",
"2016-06-29", "2010-03-05", "2017-09-30", "2017-09-30", "2008-09-26",
"2015-08-31", "2016-08-31", "2011-11-20", "2014-03-20", "2015-12-31",
"2014-07-31", "2016-03-20", "2016-12-20", "2016-06-21", "2016-06-21",
"2016-12-30", "2011-04-15", "2017-12-31", "2017-12-31", "2016-06-21",
"2016-06-21", "2018-03-31", "2016-08-18", "2017-08-18", "2018-02-28",
"2018-02-14", "2018-02-14", "2018-02-14", "2014-10-03", "2016-05-10",
"2016-05-10", "2015-02-07", "2016-06-30", "2016-06-30", "2016-09-11",
"2016-09-11", "2017-07-28", "2016-09-26", "2016-09-26", "2016-09-26",
"2016-03-20", "2016-03-20", "2018-10-19", "2016-08-09", "2016-08-09",
"2016-08-09", "2016-05-10", "2016-05-10", "2016-05-10", "2007-05-30",
"2013-09-24", "2007-09-05", "2010-10-21", "2006-09-26", "2016-08-02",
"2007-11-20", "2015-10-19", "2012-01-31", "2015-11-23", "2013-06-21",
"2015-06-09", "2009-04-15", "2014-05-06", "2013-06-21", "2015-03-28",
"2013-12-18", "2015-05-24", "2016-04-14", "2016-04-14", "2013-06-03",
"2016-01-07", "2013-08-07", "2015-12-08", "2014-09-11", "2015-07-24",
"2014-09-26", "2015-06-18", "2013-03-20", "2017-02-22", "2015-10-09",
"2015-10-09", "2015-11-10", "2016-01-06"), bar = c(5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2.5, 2.5, 2.5, 2.5,
2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5,
2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5,
2.5, 2.5, 2.5, 2.5), de = structure(c(4L, 4L, 4L, 4L, 4L, 4L,
1L, 1L, 1L, 3L, 3L, 3L, 2L, 2L, 2L, 1L, 1L, 1L, 4L, 4L, 4L, 2L,
2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 1L, 1L, 1L, 3L, 3L, 3L, 2L, 2L,
2L, 4L, 4L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 4L, 4L,
4L, 4L, 4L, 1L, 1L, 1L, 3L, 3L, 3L, 2L, 2L, 2L, 1L, 1L, 1L, 4L,
4L, 4L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 1L, 1L, 1L, 3L, 3L,
3L, 2L, 2L, 2L, 4L, 4L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
4L, 4L, 4L, 4L, 1L, 1L, 3L, 3L, 2L, 2L, 1L, 1L, 4L, 4L, 2L, 2L,
3L, 3L, 4L, 4L, 1L, 1L, 3L, 3L, 2L, 2L, 4L, 4L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = c("de1", "de2", "de3", "de4"), class = "factor")), row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24",
"25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35",
"36", "37", "38", "39", "40", "41", "42", "43", "44", "45", "46",
"47", "48", "49", "50", "51", "52", "53", "54", "55", "56", "57",
"58", "59", "60", "61", "62", "63", "64", "65", "66", "67", "68",
"69", "70", "71", "72", "73", "74", "75", "76", "77", "78", "79",
"80", "81", "82", "83", "84", "85", "86", "87", "88", "89", "90",
"91", "92", "93", "94", "95", "96", "97", "98", "99", "100",
"101", "102", "110", "410", "710", "103", "131", "161", "191",
"221", "251", "281", "311", "341", "371", "401", "431", "461",
"491", "521", "551", "581", "611", "641", "671", "701", "731",
"761", "791", "821", "851", "881", "911", "941", "971", "1001"
), .Names = c("name", "Contr.finish", "variable", "value", "bar",
"de"), class = "data.frame")
dfr<-structure(list(name = structure(c(2L, 4L, 3L, 1L), .Label = c("10004/07",
"10012/06", "458/07", "512/12"), class = "factor"), text = c("Region 1",
"Region 2", "Region 3", "Region 4"), name0 = c(0, 6.5, 9.5, 12.5
)), .Names = c("name", "text", "name0"), row.names = c(NA, -4L
), class = "data.frame")
library(ggplot2)
library(scales)
library(ggthemes)
# Timeline plot of mdfr: x is `value` (dates stored as "%Y-%m-%d" strings,
# converted to POSIXct for the datetime scale), y is `name`, and lines are
# coloured by Contr.finish.
ggplot(mdfr, aes(as.POSIXct(as.Date(value, "%Y-%m-%d")), name, colour = Contr.finish)) +
# First line layer: size is mapped to the `bar` column.
geom_line(aes(size=bar)) +
# Draw the colour legend keys at size 5 and hide the size and fill legends.
guides(colour = guide_legend(override.aes = list(size=5)), size="none", fill="none") +
# Second line layer, drawn on top of the first at a fixed size of 2.
geom_line(size=2.0) +
# Empty axis titles.
xlab("") + ylab("") +
theme_stata() +
# Horizontal separator lines at the y positions stored in dfr$name0.
geom_hline(data=dfr, aes(yintercept = name0), color = "#4d4d4d", size=0.8) + #
scale_fill_brewer(palette="Dark2") +
# One x-axis break per year.
scale_x_datetime(breaks = date_breaks("1 year"),labels = abbreviate) +
# Fixed colour for each Contr.finish level.
scale_colour_manual(values=c("Initial" = "#67bf5c", "Current" = "#1f77b4",
"Forecast" = "#ff9e4a", "Cost" = "#c10534")) +
# Legend below the plot; y-axis labels kept horizontal.
theme(legend.position = "bottom",
axis.text.y=element_text(angle=0)
)
You can use geom_rect() and set xmin= and xmax= to the minimal and maximal values of your dates, or to some other values outside the limits. For ymin= and ymax=, use the name values converted to numeric (they have to be factors in your dataframe) and then subtract and add 0.5 (each discrete value occupies a band of width 1 around it). I also added expand=c(0,0) to scale_x_datetime() to remove the white areas.
+ geom_rect(aes(xmin=min(as.POSIXct(as.Date(value, "%Y-%m-%d"))),
xmax=max(as.POSIXct(as.Date(value, "%Y-%m-%d"))),
ymin=as.numeric(name)-0.5,ymax=as.numeric(name)+0.5,
fill=de),alpha=0.05,linetype=0)

R - Convert List of Lists into single dataframe

So, I have created a list (and a single column matrix) that contains 256 nested lists. What I would like to do is to convert each of the 256 lists into a single dataframe of 16 columns and then write.table it. Although each list contains the same number of columns (16), the number of rows for each list varies. I have tried to use unlist unsuccessfully because of the changing row counts. I can subset each list individually, so I know there's an easier way to do the whole list.
I'm pretty new to R, so I apologize for asking what may be a naive novice question. I searched through a lot of topics the last couple days and didn't see anything that seemed to match my problem. for loop seems like it might be unnecessary and I wasn't sure if lapply was the correct route, either.
UPDATE: dput of first list:
list(structure(list(structure(c(2L, 11L, 15L, 8L, 7L, 3L, 6L, 10L,
1L, 1L, 18L, 13L, 14L, 19L, 16L, 17L, 4L, 5L, 9L, 12L), .Label = c("",
"Aaron Rodgers", "Andrew Quarless", "Derrick Coleman", "Doug Baldwin",
"DuJuan Harris", "Eddie Lacy", "James Starks", "Jermaine Kearse",
"John Kuhn", "Jordy Nelson", "Luke Willson", "Marshawn Lynch", "Percy
Harvin", "Randall Cobb", "Ricardo Lockette", "Robert Turbin",
"Russell Wilson", "Zach Miller"), class = "factor"), Tm =
structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 4L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L), .Label = c("GNB", "Passing", "SEA", "Tm"),
class = "factor"), Cmp = structure(c(3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 5L, 4L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", "19",
"23", "Cmp", "Rushing"), class = "factor"), Att = structure(c(3L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 5L, 4L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = c("", "28", "33", "Att", "Receiving"
), class = "factor"), Yds = structure(c(2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, NA, 4L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), .Label = c("", "189", "191", "Yds"), class = "factor"),
TD = structure(c(2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, NA, 4L,
3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", "1",
"2", "TD"), class = "factor"), Int = structure(c(3L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, NA, 4L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("", "0", "1", "Int"), class = "factor"),
Lng = structure(c(2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, NA, 4L,
3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", "23",
"33", "Lng"), class = "factor"), Att = structure(c(1L, 1L,
1L, 7L, 3L, 1L, 2L, 2L, NA, 8L, 7L, 4L, 5L, 1L, 1L, 6L, 1L,
1L, 1L, 1L), .Label = c("", "1", "12", "20", "4", "6", "7",
"Att"), class = "factor"), Yds = structure(c(1L, 1L, 1L,
7L, 6L, 1L, 9L, 3L, NA, 10L, 5L, 2L, 8L, 1L, 1L, 4L, 1L,
1L, 1L, 1L), .Label = c("", "110", "2", "27", "29", "34",
"37", "41", "7", "Yds"), class = "factor"), TD = structure(c(1L,
1L, 1L, 2L, 2L, 1L, 2L, 3L, NA, 5L, 2L, 4L, 2L, 1L, 1L, 2L,
1L, 1L, 1L, 1L), .Label = c("", "0", "1", "2", "TD"), class = "factor"),
Lng = structure(c(1L, 1L, 1L, 2L, 4L, 1L, 8L, 6L, NA, 9L,
3L, 7L, 5L, 1L, 1L, 8L, 1L, 1L, 1L, 1L), .Label = c("", "12",
"13", "15", "16", "2", "21", "7", "Lng"), class = "factor"),
Rec = structure(c(1L, 7L, 5L, 3L, 4L, 4L, 1L, 1L, NA, 8L,
1L, 2L, 6L, 4L, 3L, 1L, 2L, 4L, 2L, 2L), .Label = c("", "1",
"2", "3", "6", "7", "9", "Rec"), class = "factor"), Yds = structure(c(1L,
12L, 9L, 3L, 3L, 6L, 1L, 1L, NA, 13L, 1L, 4L, 10L, 8L, 7L,
1L, 5L, 4L, 11L, 2L), .Label = c("", "1", "11", "14", "15",
"26", "38", "42", "58", "59", "8", "83", "Yds"), class = "factor"),
TD = structure(c(1L, 2L, 3L, 2L, 2L, 2L, 1L, 1L, NA, 4L,
1L, 2L, 2L, 2L, 3L, 1L, 3L, 2L, 2L, 2L), .Label = c("", "0",
"1", "TD"), class = "factor"), Lng = structure(c(1L, 7L,
9L, 3L, 4L, 8L, 1L, 1L, NA, 14L, 1L, 5L, 11L, 10L, 11L, 1L,
6L, 12L, 13L, 2L), .Label = c("", "1", "11", "12", "14",
"15", "16", "18", "23", "24", "33", "6", "8", "Lng"), class = "factor")), .Names = c("", "Tm", "Cmp", "Att", "Yds", "TD", "Int",
"Lng", "Att", "Yds", "TD", "Lng", "Rec", "Yds", "TD", "Lng"),
row.names = c(NA, -20L ), class = "data.frame"))
So, each observation in my list is like this above and I want to convert all of the lists into their 16 column(Now that I think about it, it's 17 columns, one is just unnamed) dataframe layout and stack all the rows together in one place that I can then write.table
Let's call your list l where l[[1]] is what you have dput above.
Two easy ways from base R and from data.table
# Base R: hand every element of l to a single rbind() call
do.call(rbind, l)
# data.table: stack the same list with rbindlist()
data.table::rbindlist(l)
This assumes that the columns match in each list element. Your example doesn't confirm this, although you state it.

R statistics ggplot How to order x-axis of stacked bars from high to low based on a factor level of the stack

I am trying to arrange the order of stacked bars using the following data frame:
DF <- structure(list(Group = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L,
1L, 2L, 1L),
.Label = c("1", "2"), class = "factor"),
Response = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("1", "2", "3"
), class = "factor"), Count = c(531L, 472L, 374L, 326L, 207L,
168L, 76L, 60L, 51L, 43L, 37L, 18L, 6L, 0L, 247L, 149L, 86L,
48L, 45L, 36L, 29L, 14L, 10L, 4L, 3L, 3L, 0L, 0L, 531L, 230L,
173L, 93L, 87L, 76L, 30L, 29L, 29L, 22L, 13L, 3L, 2L, 0L),
Percent = c(85.23, 75.76, 60.03, 52.33, 33.23, 26.97, 12.2, 9.63, 8.19, 6.9, 5.94,
2.89, 0.96, 0, 39.65, 23.92, 13.8, 7.7, 7.22, 5.78, 4.65, 2.25,
1.61, 0.64, 0.48, 0.48, 0, 0, 85.23, 36.92, 27.77, 14.93, 13.96,
12.2, 4.82, 4.65, 4.65, 3.53, 2.09, 0.48, 0.32, 0),
Items = structure(c(4L, 2L, 3L, 5L, 7L, 1L, 4L, 5L, 2L, 3L, 7L, 1L, 6L, 6L, 7L, 1L, 3L,
5L, 2L, 7L, 1L, 4L, 6L, 3L, 2L, 5L, 4L, 6L, 6L, 1L, 5L, 7L, 3L,
6L, 2L, 1L, 3L, 2L, 5L, 7L, 4L, 4L),
.Label = c("A", "B", "C", "D", "E", "F", "G"), class = "factor")),
.Names = c("Group", "Response", "Count", "Percent", "Items"),
row.names = c("18",
"36", "2", "8", "24", "42", "17", "7", "35", "1", "23", "41",
"54", "53", "26", "44", "4", "10", "38", "25", "43", "20", "56",
"3", "37", "9", "19", "55", "58", "46", "12", "28", "6", "57",
"40", "45", "5", "39", "11", "27", "22", "21"), class = "data.frame")
library(ggplot2)

# Fill colours, listed in the same order as the Response breaks "1", "2", "3"
cPalette <- c("#F8766D", "#619CFF", "#00BA38")

# Bar chart of Percent per item, filled by Response
ggplot(DF, aes(x = Items, y = Percent, fill = Response, order = Response)) +
  geom_bar(stat = "identity") +
  labs(title = "Acceptance", x = "Items", y = "Percent") +
  scale_fill_manual(
    values = cPalette,
    breaks = c("1", "2", "3"),
    labels = c("Negative", "Neutral", "Positive")
  ) +
  theme(
    text = element_text(size = 15, color = "blue", face = "bold"),
    axis.text.x = element_text(color = "black", face = "bold"),
    axis.text.y = element_text(color = "black", face = "bold")
  )
I would like to have the bars on the x-axis going from left to right:
D,B,C,E,G,A,F instead of alphabetically A,B,C,D,E,F,G
This should not be ordered manually because it depends on %age contribution
of the factor within the stack.
Extensive search of SO gave me this link which comes closest to what I wish to achieve.
How does one get the bars to display left to right in decreasing order of negative Responses? I have been stuck on this problem for the last couple of days.
You can order the levels of Items by the percentage, then plot
## Order the Items by their summed Percent in the Response == "1" rows,
## largest first, and make that order the factor level order of Items.
## ggplot2 lays out a discrete x axis in factor-level order.
agg <- aggregate(Percent ~ Items, data = DF[DF$Response == "1", ], FUN = sum)
DF$Items <- factor(
  DF$Items,
  # TRUE rather than T: T is an ordinary variable and can be reassigned
  levels = as.character(agg$Items[order(agg$Percent, decreasing = TRUE)])
)

## The plot as you had it
## NOTE(review): the `order` aesthetic appears to be unsupported in current
## ggplot2 releases -- confirm whether it can be dropped.
cPalette <- c("#F8766D", "#619CFF", "#00BA38")
ggplot(DF, aes(x = Items, y = Percent, fill = Response, order = Response)) +
  geom_bar(stat = "identity") +
  labs(title = "Acceptance", x = "Items", y = "Percent") +
  scale_fill_manual(
    values = cPalette,
    breaks = c("1", "2", "3"),
    labels = c("Negative", "Neutral", "Positive")
  ) +
  theme(
    text = element_text(size = 15, color = "blue", face = "bold"),
    axis.text.x = element_text(color = "black", face = "bold"),
    axis.text.y = element_text(color = "black", face = "bold")
  )

Conditionally subsetting a list of dataframes in R

I have a list of dataframes called myList (see sample below) and all I want is to subset that list of dataframes by the condition that only rows with a "pointNum" > 100 are included in the new list. Should be easy but I just can't get it to work. So the output should look like this for the first item on the list:
[[1]]
study Identi locDate locNumb meanLat meanLon pointNum
5 study 1 SDU101 2011-07-13 49 32.8837771221667 -117.24038866075 120
9 study 1 SDU101 2011-07-13 60 32.8838778530086 -117.240522195673 349
11 study 1 SDU101 2011-07-13 321 32.8027296698536 -117.210527201581 683
I've been trying to get this to work, and other similar subsetting options. It currently runs but doesn't do anything:
newList = lapply(myList, function(x) { subset(x, "pointNum" > 2)} )
I know that similar questions have been posted, but I couldn't get any of those solutions to work for my particular problem. Any help would be greatly appreciated.
myList <- list(structure(list(study = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "Study 1", class = "factor"),
Identi = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L), .Label = "SDU101", class = "factor"),
locDate = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L), .Label = "2011-07-13", class = "factor"),
locNumb = structure(c(12L, 15L, 1L, 2L, 8L, 9L, 10L, 11L,
13L, 14L, 3L, 4L, 5L, 6L, 7L), .Label = c("10", "11", "321",
"323", "324", "326", "329", "49", "56", "57", "59", "6",
"60", "61", "7"), class = "factor"), meanLat = structure(c(11L,
10L, 4L, 9L, 6L, 8L, 3L, 5L, 7L, 12L, 1L, 15L, 13L, 14L,
2L), .Label = c("32.8027296698536", "32.802755201875", "32.883244695",
"32.8835599674286", "32.8837003266667", "32.8837771221667",
"32.8838778530086", "32.88411147", "32.88419565", "32.8841969254545",
"32.884720435", "32.8853723146154", "32.8853777533333", "32.8854051",
"32.9164754136842"), class = "factor"), meanLon = structure(c(13L,
10L, 12L, 15L, 9L, 8L, 7L, 4L, 11L, 6L, 2L, 3L, 14L, 5L,
1L), .Label = c("-117.210382870833", "-117.210527201581",
"-117.236141991053", "-117.239834913333", "-117.23989078",
"-117.240133633077", "-117.240140015", "-117.24022087", "-117.24038866075",
"-117.240416713636", "-117.240522195673", "-117.240532619714",
"-117.24062533", "-117.24063566", "-117.24070002"), class = "factor"),
pointNum = structure(c(6L, 2L, 9L, 1L, 3L, 1L, 6L, 7L, 8L,
4L, 11L, 5L, 7L, 1L, 10L), .Label = c("1", "11", "120", "13",
"19", "2", "3", "349", "35", "48", "683"), class = "factor")), .Names = c("study",
"Identi", "locDate", "locNumb", "meanLat", "meanLon", "pointNum"
), row.names = c(NA, -15L), class = "data.frame"), structure(list(
study = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = "Study 1", class = "factor"),
Identi = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = "SDU111", class = "factor"),
locDate = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = "2011-07-12", class = "factor"),
locNumb = structure(c(14L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L,
9L, 10L, 11L, 12L, 13L), .Label = c("354", "418", "419",
"420", "421", "422", "425", "426", "427", "428", "429", "430",
"432", "67"), class = "factor"), meanLat = structure(c(2L,
3L, 9L, 1L, 5L, 8L, 13L, 11L, 6L, 7L, 10L, 12L, 14L, 4L), .Label = c("32.8651107616667",
"32.86543857", "32.867004565", "32.868283279", "32.868857725",
"32.869014345", "32.8692111971429", "32.8693627126536", "32.8694241808955",
"32.8694814566667", "32.86955278", "32.8696187847619", "32.8696329253571",
"32.8698972233333"), class = "factor"), meanLon = structure(c(13L,
12L, 8L, 14L, 11L, 2L, 7L, 5L, 4L, 1L, 3L, 9L, 6L, 10L), .Label = c("-117.235456126857",
"-117.235585179972", "-117.235959423333", "-117.25006813",
"-117.25014399", "-117.250450876667", "-117.250467514464",
"-117.25050148", "-117.250773722857", "-117.2512085715",
"-117.25133879", "-117.25283091", "-117.254194355", "-117.254406255417"
), class = "factor"), pointNum = structure(c(2L, 2L, 11L,
5L, 2L, 8L, 9L, 1L, 2L, 7L, 6L, 4L, 10L, 3L), .Label = c("1",
"2", "20", "21", "24", "3", "35", "358", "56", "6", "67"), class = "factor")), .Names = c("study",
"Identi", "locDate", "locNumb", "meanLat", "meanLon", "pointNum"
), row.names = c(NA, -14L), class = "data.frame"))
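You have two issues: the quotes around pointNum make subset() compare a character string rather than the column, and pointNum in your dput is a factor, so it has to be converted to a number first. Do this: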
# For every data frame in myList, keep only the rows whose pointNum exceeds
# 100 (the threshold stated in the question).
# pointNum is a factor here, so convert through as.character() first:
# as.integer() on the factor itself would return the level codes, not the
# numbers shown in the labels.
lapply(myList, function(x) {
  subset(x, as.integer(as.character(pointNum)) > 100)
})

Resources