How to remove observations from multiple dataframes and keep as multiple dataframes

How to remove observations from multiple dataframes and keep as multiple dataframes - r

I have many data frames - Here is a simplified version of two of them.
flows <- structure(list(Student = c("Adam", "Char", "Fred", "Greg", "Ed", "Mick", "Dave", "Nick", "Tim", "George", "Tom"),
Class = c(1L, 1L, 1L, 1L, 2L, 2L, 3L, 3L, 3L, 3L, 3L), Jan_18_score = c(NA, 5L, -7L, 2L, 1L, NA, 5L, 8L, -2L, 5L, NA),
Feb_18_score = c(2L, 0, 8L, NA, 2L, 6L, NA, 8L, 7L, 3L, 8L), Jan_18_Weight = c(150L, 30L, NA, 80L, 60L, 80L, 40L, 12L, 23L, 65L, 78L),
Feb_18_Weight = c(153L, 60L, 80L, 40L, 80L, 30L, 25L, 45L, 40L, NA, 50L)), class = "data.frame", row.names = c(NA, -11L))
returns <- structure(list(Student = c("Adam", "Char", "Fred", "Greg", "Ed", "Mick", "Dave", "Nick", "Tim", "George", "Tom"),
Class = c(1L, 1L, 1L, 1L, 2L, 2L, 3L, 3L, 3L, 3L, 3L), Jan_20_score = c(NA, 5L, -7L, 2L, 1L, NA, 5L, 8L, -2L, 5L, NA),
Feb_20_score = c(2L, 0, 8L, NA, 2L, 6L, NA, 8L, 7L, 3L, 8L), Jan_20_Weight = c(150L, 30L, NA, 80L, 60L, 80L, 40L, 12L, 23L, 65L, 78L),
Feb_20_Weight = c(153L, 60L, 80L, 40L, 80L, 30L, 25L, 45L, 40L, NA, 50L)), class = "data.frame", row.names = c(NA, -11L))
I am using lapply to remove some observations, I would like to do this across all my dataframes and keep the output as dataframes, basically update the existing dataframes and remove the observations I select.
Here is my current code.
df.list <- list(flows, returns)
lapply(df.list, function(df) df[!grepl("1", df$Class),])
However, when I do this the output is not updating the original dataframes and is outputting as a list in the global environment. Any help is appreciated.

Another solution:
flows <- structure(list(Student = c("Adam", "Char", "Fred", "Greg", "Ed", "Mick", "Dave", "Nick", "Tim", "George", "Tom"),
Class = c(1L, 1L, 1L, 1L, 2L, 2L, 3L, 3L, 3L, 3L, 3L), Jan_18_score = c(NA, 5L, -7L, 2L, 1L, NA, 5L, 8L, -2L, 5L, NA),
Feb_18_score = c(2L, 0, 8L, NA, 2L, 6L, NA, 8L, 7L, 3L, 8L), Jan_18_Weight = c(150L, 30L, NA, 80L, 60L, 80L, 40L, 12L, 23L, 65L, 78L),
Feb_18_Weight = c(153L, 60L, 80L, 40L, 80L, 30L, 25L, 45L, 40L, NA, 50L)), class = "data.frame", row.names = c(NA, -11L))
returns <- structure(list(Student = c("Adam", "Char", "Fred", "Greg", "Ed", "Mick", "Dave", "Nick", "Tim", "George", "Tom"),
Class = c(1L, 1L, 1L, 1L, 2L, 2L, 3L, 3L, 3L, 3L, 3L), Jan_20_score = c(NA, 5L, -7L, 2L, 1L, NA, 5L, 8L, -2L, 5L, NA),
Feb_20_score = c(2L, 0, 8L, NA, 2L, 6L, NA, 8L, 7L, 3L, 8L), Jan_20_Weight = c(150L, 30L, NA, 80L, 60L, 80L, 40L, 12L, 23L, 65L, 78L),
Feb_20_Weight = c(153L, 60L, 80L, 40L, 80L, 30L, 25L, 45L, 40L, NA, 50L)), class = "data.frame", row.names = c(NA, -11L))
df.list <- list(flows, returns)
Now, we need to assign lapply to some value and name it:
a <- lapply(df.list, function(df) df[!grepl("1", df$Class),])
names(a) <- c("flows","returns")
After this, we call list2env function:
list2env(a, envir = .GlobalEnv)
Output:
> flows
Student Class Jan_18_score Feb_18_score Jan_18_Weight Feb_18_Weight
5 Ed 2 1 2 60 80
6 Mick 2 NA 6 80 30
7 Dave 3 5 NA 40 25
8 Nick 3 8 8 12 45
9 Tim 3 -2 7 23 40
10 George 3 5 3 65 NA
11 Tom 3 NA 8 78 50
> returns
Student Class Jan_20_score Feb_20_score Jan_20_Weight Feb_20_Weight
5 Ed 2 1 2 60 80
6 Mick 2 NA 6 80 30
7 Dave 3 5 NA 40 25
8 Nick 3 8 8 12 45
9 Tim 3 -2 7 23 40
10 George 3 5 3 65 NA
11 Tom 3 NA 8 78 50
Checking classes of the outputs:
> class(returns)
[1] "data.frame"
> class(flows)
[1] "data.frame"

I'm not sure about using lapply but you can work with lists of variables by name using get and assign.
flows <- structure(list(Student = c("Adam", "Char", "Fred", "Greg", "Ed", "Mick", "Dave", "Nick", "Tim", "George", "Tom"),
Class = c(1L, 1L, 1L, 1L, 2L, 2L, 3L, 3L, 3L, 3L, 3L), Jan_18_score = c(NA, 5L, -7L, 2L, 1L, NA, 5L, 8L, -2L, 5L, NA),
Feb_18_score = c(2L, 0, 8L, NA, 2L, 6L, NA, 8L, 7L, 3L, 8L), Jan_18_Weight = c(150L, 30L, NA, 80L, 60L, 80L, 40L, 12L, 23L, 65L, 78L),
Feb_18_Weight = c(153L, 60L, 80L, 40L, 80L, 30L, 25L, 45L, 40L, NA, 50L)), class = "data.frame", row.names = c(NA, -11L))
returns <- structure(list(Student = c("Adam", "Char", "Fred", "Greg", "Ed", "Mick", "Dave", "Nick", "Tim", "George", "Tom"),
Class = c(1L, 1L, 1L, 1L, 2L, 2L, 3L, 3L, 3L, 3L, 3L), Jan_20_score = c(NA, 5L, -7L, 2L, 1L, NA, 5L, 8L, -2L, 5L, NA),
Feb_20_score = c(2L, 0, 8L, NA, 2L, 6L, NA, 8L, 7L, 3L, 8L), Jan_20_Weight = c(150L, 30L, NA, 80L, 60L, 80L, 40L, 12L, 23L, 65L, 78L),
Feb_20_Weight = c(153L, 60L, 80L, 40L, 80L, 30L, 25L, 45L, 40L, NA, 50L)), class = "data.frame", row.names = c(NA, -11L))
df.list <- list("flows", "returns")
for (df.name in df.list){
temp <- get(df.name)
temp <- temp[!grepl("1", temp$Class), ]
assign(paste0(df.name, "_new"), temp)
}
Remove "_new" to overwrite the original variables.

Related

How to re order datas in ggplot2

I'm trying to re order my datas, I already found the code to use but it doesn't seem to work...
Can you help me please ?
This is my code :
#code for my boxplot
dat.m2 <- melt(H1,id.vars='fusion', measure.vars=c('FF','FM'))
dat.m2 <- melt(H1,id.vars='fusion', measure.vars=c('FF','FM'))
ggplot(dat.m2)+ geom_boxplot(aes(x=fusion, y=value, colour=variable))+ facet_wrap(~H1$Genotype)+
xlab(" ")+ ylab("Days after sowing")
#code tried to re order
levels(dat.m2$fusion)
dat.m2$fusion<-factor(dat.m2$fusion, levels=c("Control", "CK20", "CK100", "CK500", "GA20", "GA100", "GA500"))
I tried to run again the first code after re ordering but it didn't work...
You can also find attached the image of the boxplot that I'm trying to modify
Thanks
EDIT :
> dput(head(H1, 20))
structure(list(Genotype = structure(c(2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L), .Label = c("F1045",
"FF", "M1585", "M1610"), class = "factor"), X = structure(c(1L,
105L, 116L, 127L, 138L, 149L, 160L, 171L, 182L, 2L, 13L, 24L,
35L, 46L, 57L, 68L, 79L, 90L, 101L, 106L), .Label = c("H1", "H10",
"H100", "H101", "H102", "H103", "H104", "H105", "H106", "H107",
"H108", "H109", "H11", "H110", "H111", "H112", "H113", "H114",
"H115", "H116", "H117", "H118", "H119", "H12", "H120", "H121",
"H122", "H123", "H124", "H125", "H126", "H127", "H128", "H129",
"H13", "H130", "H131", "H132", "H133", "H134", "H135", "H136",
"H137", "H138", "H139", "H14", "H140", "H141", "H142", "H143",
"H144", "H145", "H146", "H147", "H148", "H149", "H15", "H150",
"H151", "H152", "H153", "H154", "H155", "H156", "H157", "H158",
"H159", "H16", "H160", "H161", "H162", "H163", "H164", "H165",
"H166", "H167", "H168", "H169", "H17", "H170", "H171", "H172",
"H173", "H174", "H175", "H176", "H177", "H178", "H179", "H18",
"H180", "H181", "H182", "H183", "H184", "H185", "H186", "H187",
"H188", "H189", "H19", "H190", "H191", "H192", "H2", "H20", "H21",
"H22", "H23", "H24", "H25", "H26", "H27", "H28", "H29", "H3",
"H30", "H31", "H32", "H33", "H34", "H35", "H36", "H37", "H38",
"H39", "H4", "H40", "H41", "H42", "H43", "H44", "H45", "H46",
"H47", "H48", "H49", "H5", "H50", "H51", "H52", "H53", "H54",
"H55", "H56", "H57", "H58", "H59", "H6", "H60", "H61", "H62",
"H63", "H64", "H65", "H66", "H67", "H68", "H69", "H7", "H70",
"H71", "H72", "H73", "H74", "H75", "H76", "H77", "H78", "H79",
"H8", "H80", "H81", "H82", "H83", "H84", "H85", "H86", "H87",
"H88", "H89", "H9", "H90", "H91", "H92", "H93", "H94", "H95",
"H96", "H97", "H98", "H99"), class = "factor"), Hormone = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L), .Label = c("CK", "Control", "GA"), class = "factor"),
Hormone.quantity = structure(c(4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L), .Label = c("100",
"20", "500", "Control"), class = "factor"), fusion = structure(c(4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L), .Label = c("CK 100", "CK 20", "CK 500",
"Control", "GA 100", "GA 20", "GA 500"), class = "factor"),
DL = c(16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L,
16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L), LI = c(100L,
100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L,
100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L), Temperature = c(21L,
21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L,
21L, 21L, 21L, 21L, 21L, 21L, 21L), Sowing.date = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = "25-mrt", class = "factor"), BTD10 = structure(c(6L,
7L, 6L, 6L, 6L, 6L, 6L, 6L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
6L, 4L, 4L, 4L), .Label = c("16-apr", "17-apr", "18-apr",
"19-apr", "21-mei", "23-apr", "26-apr", "30-apr"), class = "factor"),
FFLDT = structure(c(13L, 18L, 4L, 9L, 18L, 3L, 2L, 13L, 8L,
10L, 18L, 10L, 8L, 8L, 8L, 10L, 11L, 11L, 1L, 11L), .Label = c("",
"10-mei", "14-apr", "14-mei", "17-mei", "18-jun", "21-mei",
"23-apr", "24-mei", "26-apr", "28-mei", "3-apr", "3-mei",
"30-apr", "31-mei", "4-jun", "7-jun", "7-mei"), class = "factor"),
FH = structure(c(42L, 62L, 67L, 18L, 59L, 7L, 5L, 52L, 53L,
62L, 65L, 58L, 53L, 42L, 52L, 58L, 24L, 55L, 1L, 54L), .Label = c("",
"10", "10,5", "11", "11,5", "11,7", "12", "12,3", "12,5",
"13", "13,5", "14", "14,3", "14,5", "15", "15,3", "15,5",
"16", "16-jan", "17", "18", "18,5", "19", "20", "20,5", "21",
"21,5", "22", "22,5", "23", "23,5", "24,5", "25", "25,5",
"26", "26,5", "27", "27,5", "29", "29-mei", "3", "3,5", "30",
"30,5", "31,5", "32", "32,5", "33", "35", "36", "37", "4",
"4,5", "40", "42", "43", "47", "5", "5,5", "53", "55", "6",
"6,5", "7", "8", "8,5", "9", "9,5"), class = "factor"), SRDT = structure(c(3L,
8L, 1L, 1L, 8L, 1L, 8L, NA, NA, NA, 4L, NA, 15L, 12L, 14L,
14L, 15L, 15L, 1L, 15L), .Label = c("", "10-mei", "11-jun",
"13-jun", "13-mei", "14-mei", "17-mei", "18-jun", "21-jun",
"21-mei", "24-mei", "28-mei", "3-mei", "31-mei", "4-jun",
"7-jun", "7-mei"), class = "factor"), MH = c(26, 50, NA,
NA, 46, NA, 61, NA, NA, NA, 40, NA, 68, 48, 47, 42, 26, 50,
NA, 48), SEEDT = structure(c(2L, 4L, 1L, 1L, 4L, 1L, 4L,
NA, NA, NA, 4L, NA, 9L, 8L, 8L, 8L, 4L, 3L, 1L, 4L), .Label = c("",
"11-jun", "13-jun", "18-jun", "20-mei", "21-jun", "28-mei",
"31-mei", "4-jun", "6-apr", "7-jun"), class = "factor"),
FERMK = c(7L, 8L, NA, NA, 8L, NA, 8L, NA, NA, NA, 5L, NA,
7L, 6L, 7L, 6L, NA, NA, NA, 4L), PLRMK = c(1L, 2L, NA, NA,
1L, NA, 1L, NA, NA, NA, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, NA,
1L), BT = structure(c(5L, 6L, 5L, 5L, 5L, 5L, 5L, 5L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 5L, 4L, 4L, 4L), .Label = c("",
"22", "23", "25", "29", "32", "bino"), class = "factor"),
FF = c(39L, 43L, 50L, 60L, 43L, 20L, 46L, 39L, 29L, 32L,
43L, 32L, 29L, 29L, 29L, 32L, 64L, 64L, NA, 64L), FM = c(78L,
85L, NA, NA, 85L, NA, 85L, NA, NA, NA, 80L, NA, 71L, 64L,
67L, 67L, 71L, 71L, NA, 71L), SEED = c(78L, 85L, NA, NA,
85L, NA, 85L, NA, NA, NA, 85L, NA, 71L, 67L, 67L, 67L, 85L,
80L, NA, 85L)), row.names = c(NA, 20L), class = "data.frame")

scale_colour_hue issue getting Error: Discrete value supplied to continuous scale

I'm attempting to plot coefficient estimates of insect Orders using small_multiple which integrates with ggplo however when I use scale_colour_hue to color code each Order I get Error: Continuous value supplied to discrete scale.
Any help would be appreciated.
Data:
m123456_df:
term estimate std.error statistic group by_2sd model Order
insecticidearea -1.87 1.84 -1.01 fixed TRUE
AcariInsecticideFor Acari
insecticidearea 3.02 1.66 1.80 fixed TRUE
AraneaeInsecticideFor Araneae
insecticidearea 28.18 5.76 4.89 fixed TRUE
ColeopteraInsecticideFor Coleoptera
insecticidearea -2.60 3.52 -0.73 fixed TRUE
DipteraInsecticideFor Diptera
insecticidearea -6.97 7.85 -0.88 fixed TRUE
HemipteraInsecticideFor Hemiptera
insecticidearea 5.47 2.96 1.84 fixed TRUE
HomopteraInsecticideFor Homoptera
insecticidearea -3.98 4.13 -0.96 fixed TRUE
HymenopteraInsecticideFor Hymenoptera
insecticidearea -0.07 0.68 -0.11 fixed TRUE
LepidopteraInsecticideFor Lepidoptera
insecticidearea -9.98 3.28 -3.03 fixed TRUE
OdonataInsecticideFor Odonata
insecticidearea -0.60 0.83 -0.72 fixed TRUE
OrthopteraInsecticideFor Orthoptera
insecticidearea -1.97 1.70 -1.15 fixed TRUE
ThysanopInsecticideFor Thysanoptera
To better see the structure of the data,
dput(m123456_df):
"structure(list(X = c(49L, 50L, 51L, 52L, 53L, 169L, 170L, 171L,
172L, 173L, 1L, 2L, 3L, 4L, 109L, 110L, 111L, 112L, 113L, 54L,
55L, 56L, 57L, 58L, 174L, 175L, 176L, 177L, 178L, 5L, 6L, 7L,
8L, 114L, 115L, 116L, 117L, 118L, 59L, 60L, 61L, 62L, 63L, 179L,
180L, 181L, 182L, 183L, 9L, 10L, 11L, 12L, 119L, 120L, 121L,
122L, 123L, 69L, 70L, 71L, 72L, 73L, 189L, 190L, 191L, 192L,
193L, 17L, 18L, 19L, 20L, 129L, 130L, 131L, 132L, 133L, 74L,
75L, 76L, 77L, 78L, 194L, 195L, 196L, 197L, 198L, 21L, 22L, 23L,
24L, 134L, 135L, 136L, 137L, 138L, 79L, 80L, 81L, 82L, 83L, 199L,
200L, 201L, 202L, 203L, 25L, 26L, 27L, 28L, 139L, 140L, 141L,
142L, 143L, 84L, 85L, 86L, 87L, 88L, 204L, 205L, 206L, 207L,
208L, 29L, 30L, 31L, 32L, 144L, 145L, 146L, 147L, 148L, 89L,
90L, 91L, 92L, 93L, 209L, 210L, 211L, 212L, 213L, 33L, 34L, 35L,
36L, 149L, 150L, 151L, 152L, 153L, 94L, 95L, 96L, 97L, 98L, 214L,
215L, 216L, 217L, 218L, 37L, 38L, 39L, 40L, 154L, 155L, 156L,
157L, 158L, 99L, 100L, 101L, 102L, 103L, 219L, 220L, 221L, 222L,
223L, 41L, 42L, 43L, 44L, 159L, 160L, 161L, 162L, 163L, 104L,
105L, 106L, 107L, 108L, 224L, 225L, 226L, 227L, 228L, 45L, 46L,
47L, 48L, 164L, 165L, 166L, 167L, 168L), term = structure(c(1L,
3L, 2L, 6L, 7L, 1L, 4L, 2L, 6L, 7L, 1L, 2L, 6L, 7L, 1L, 5L, 2L,
6L, 7L, 1L, 3L, 2L, 6L, 7L, 1L, 4L, 2L, 6L, 7L, 1L, 2L, 6L, 7L,
1L, 5L, 2L, 6L, 7L, 1L, 3L, 2L, 6L, 7L, 1L, 4L, 2L, 6L, 7L, 1L,
2L, 6L, 7L, 1L, 5L, 2L, 6L, 7L, 1L, 3L, 2L, 6L, 7L, 1L, 4L, 2L,
6L, 7L, 1L, 2L, 6L, 7L, 1L, 5L, 2L, 6L, 7L, 1L, 3L, 2L, 6L, 7L,
1L, 4L, 2L, 6L, 7L, 1L, 2L, 6L, 7L, 1L, 5L, 2L, 6L, 7L, 1L, 3L,
2L, 6L, 7L, 1L, 4L, 2L, 6L, 7L, 1L, 2L, 6L, 7L, 1L, 5L, 2L, 6L,
7L, 1L, 3L, 2L, 6L, 7L, 1L, 4L, 2L, 6L, 7L, 1L, 2L, 6L, 7L, 1L,
5L, 2L, 6L, 7L, 1L, 3L, 2L, 6L, 7L, 1L, 4L, 2L, 6L, 7L, 1L, 2L,
6L, 7L, 1L, 5L, 2L, 6L, 7L, 1L, 3L, 2L, 6L, 7L, 1L, 4L, 2L, 6L,
7L, 1L, 2L, 6L, 7L, 1L, 5L, 2L, 6L, 7L, 1L, 3L, 2L, 6L, 7L, 1L,
4L, 2L, 6L, 7L, 1L, 2L, 6L, 7L, 1L, 5L, 2L, 6L, 7L, 1L, 3L, 2L,
6L, 7L, 1L, 4L, 2L, 6L, 7L, 1L, 2L, 6L, 7L, 1L, 5L, 2L, 6L, 7L
), .Label = c("(Intercept)", "doy", "insecticidearea", "neonicarea",
"pesticidearea", "sd_(Intercept).SiteID.x", "sd_Observation.Residual"
), class = "factor"), estimate = c(5.833565955, -1.872580966,
0.227436188, 9.992583603, 6.852396625, 5.142632969, -0.678674828,
0.254534918, 9.864923466, 6.97100003, 4.477798595, 0.039781365,
9.850994785, 6.987034948, 4.3283009, 0.123013649, 0.022392237,
9.838260007, 6.994754179, 17.88900765, 3.029762821, 10.65695216,
2.226434694, 16.74910855, 23.73870445, -6.329202795, 11.20683527,
3.142802325, 16.38385835, 19.72583775, 10.76829079, 1.846216149,
16.86087792, 20.66463849, -0.908216649, 10.86819106, 2.09135404,
16.82721692, 23.13229672, 28.18575111, 7.732269676, 18.27238254,
56.95696509, 46.09461569, -8.045705619, 10.81339723, 19.1102249,
58.22254846, 40.85473609, 10.14359574, 18.64465552, 58.48404924,
36.83632929, 4.404336962, 9.931425672, 18.16739664, 58.57271159,
39.6406962, -2.605713718, 9.307524378, 15.82145387, 34.13815459,
36.98723437, 1.498641219, 8.947675916, 15.38990904, 34.28639819,
37.99163279, 9.083495066, 15.55045251, 34.24464848, 38.25136135,
-0.276226574, 9.096730123, 15.56422095, 34.24013774, 26.01030204,
-6.971380003, 9.152834977, 0, 76.54684844, 18.81324754, 4.584413346,
8.566929469, 0, 76.59232376, 21.71528903, 8.750358994, 0, 76.62612401,
21.52279232, 0.204300155, 8.743035747, 0, 76.62605563, 23.1671264,
5.474675587, 10.61361745, 7.81542411, 27.03810868, 34.70000476,
-11.75757077, 11.81009181, 9.412199453, 26.11205322, 27.01646276,
11.12642064, 7.617789448, 27.21673834, 30.33676716, -3.179909206,
11.50125637, 8.155250611, 27.04250033, 49.27043746, -3.98632448,
5.038285369, 19.08368393, 36.3099002, 44.65665992, 3.305098051,
4.660102066, 19.15206617, 36.29619993, 46.86770473, 4.875937397,
19.07213112, 36.36348295, 43.97229941, 3.034326999, 4.717080506,
19.16011396, 36.30805912, 5.398084837, -0.079315577, 2.151502251,
0, 5.373302675, 5.618080493, -0.492859394, 2.150411138, 0, 5.367888491,
5.34786557, 2.145072919, 0, 5.373448857, 6.696394307, -1.214487209,
2.183364895, 0, 5.338975281, 7.142053223, -9.981494663, -6.62191384,
6.870891062, 0.081012541, 7.347328261, -0.955827518, 4.837078368,
6.183147924, 0.245901403, 6.703342056, 4.82069418, 6.322801182,
0.729648159, 12.34478985, -4.615010887, 4.343005964, 4.884578148,
0.209518655, 5.332438336, -0.604879396, 0.877454525, 8.290188235,
4.675812692, 5.70857201, -1.066255113, 1.034587249, 8.238516889,
4.671352748, 4.962001457, 0.863341599, 8.327281966, 4.669175273,
5.976663938, -1.049824638, 0.915523839, 8.275547511, 4.677431349,
5.945020749, -1.977904212, -1.231110798, 2.537802029, 10.77392762,
5.579913084, -1.248700513, -1.384542346, 2.633756398, 10.77936693,
4.732988096, -1.412934304, 2.432763407, 10.84148556, 5.62838417,
-1.108552993, -1.46904816, 2.412972399, 10.83202785), std.error = c(2.345020833,
1.840768317, 1.750080908, NA, NA, 2.554842927, 1.701707856, 1.833883917,
NA, NA, 1.937814035, 1.755048913, NA, NA, 3.153724651, 2.050476456,
1.779085063, NA, NA, 1.814657198, 1.663420324, 1.66101237, NA,
NA, 1.837869979, 1.636172422, 1.643826419, NA, NA, 1.517843621,
1.665313905, NA, NA, 2.32233174, 1.679925176, 1.67454475, NA,
NA, 6.545154384, 5.761076413, 5.699062437, NA, NA, 6.737162579,
5.737241184, 5.834839768, NA, NA, 5.593994125, 5.82677481, NA,
NA, 7.906973554, 6.089892019, 5.829353741, NA, NA, 3.99576339,
3.52936854, 3.444993938, NA, NA, 3.983759621, 3.318339418, 3.446513164,
NA, NA, 3.310686622, 3.436527267, NA, NA, 4.845766085, 3.764235629,
3.440400053, NA, NA, 8.628004354, 7.855856109, 7.876962746, NA,
NA, 8.728305686, 7.915196489, 7.874937593, NA, NA, 7.150052148,
7.872039214, NA, NA, 10.28405417, 7.845041369, 7.87705336, NA,
NA, 3.377133084, 2.967940792, 2.915310064, NA, NA, 3.233316178,
2.815957854, 2.853379324, NA, NA, 2.666723896, 2.914377219, NA,
NA, 4.171797935, 3.048878466, 2.928824718, NA, NA, 4.479237651,
4.131956214, 4.007895041, NA, NA, 4.519776015, 3.863296489, 4.011682605,
NA, NA, 3.722679065, 4.008923785, NA, NA, 5.663222384, 4.498583965,
4.011828348, NA, NA, 0.7259094, 0.686957844, 0.690230788, NA,
NA, 0.693440783, 0.691612154, 0.687328516, NA, NA, 0.581187607,
0.687999626, NA, NA, 0.952040753, 0.681680334, 0.683923538, NA,
NA, 3.810333838, 3.28639311, 1.520388256, NA, NA, 3.276429106,
0.336977305, 3.998119731, NA, NA, 3.278016524, 4.193948156, NA,
NA, 3.162247377, 1.072403509, 2.824905614, NA, NA, 1.125687092,
0.833751484, 0.810504208, NA, NA, 1.120076008, 0.722188325, 0.815860562,
NA, NA, 1.00910672, 0.809981199, NA, NA, 1.508858442, 1.172058637,
0.81176584, NA, NA, 1.839933156, 1.709077899, 1.713608126, NA,
NA, 1.909498614, 1.709505119, 1.711082298, NA, NA, 1.51769132,
1.713651686, NA, NA, 2.06553118, 1.737291954, 1.713781907, NA,
NA), statistic = c(2.487639287, -1.017282267, 0.129957528, NA,
NA, 2.012895946, -0.398819824, 0.138795545, NA, NA, 2.310747324,
0.022666813, NA, NA, 1.372440964, 0.059992715, 0.012586378, NA,
NA, 9.85806447, 1.821405437, 6.41593787, NA, NA, 12.91642212,
-3.868298175, 6.817529602, NA, NA, 12.99596182, 6.466222833,
NA, NA, 8.898228507, -0.540629227, 6.49023626, NA, NA, 3.534262962,
4.89244528, 1.356761706, NA, NA, 6.841844047, -1.402364893, 1.853246646,
NA, NA, 7.303321237, 1.740859407, NA, NA, 4.658714113, 0.723220863,
1.703692401, NA, NA, 9.920681566, -0.738294595, 2.701753485,
NA, NA, 9.28450456, 0.451623849, 2.596153123, NA, NA, 11.47545423,
2.643219261, NA, NA, 7.89376967, -0.07338185, 2.644090798, NA,
NA, 3.014637102, -0.887411875, 1.161975151, NA, NA, 2.155429497,
0.579191351, 1.087872681, NA, NA, 3.037081209, 1.11157462, NA,
NA, 2.092831481, 0.026041947, 1.109937352, NA, NA, 6.85999806,
1.844604044, 3.640647895, NA, NA, 10.73201718, -4.175336201,
4.138984155, NA, NA, 10.13095612, 3.817769562, NA, NA, 7.271868781,
-1.042976702, 3.926918637, NA, NA, 10.9997373, -0.964754773,
1.257090148, NA, NA, 9.880281628, 0.8555124, 1.161632792, NA,
NA, 12.58977847, 1.216270914, NA, NA, 7.764536942, 0.674507139,
1.175793204, NA, NA, 7.436306566, -0.115459162, 3.11707662, NA,
NA, 8.101745142, -0.712623963, 3.12865113, NA, NA, 9.201616665,
3.117840238, NA, NA, 7.033726533, -1.781608106, 3.192410809,
NA, NA, 1.874390415, -3.037218716, -4.355409753, NA, NA, 2.242480463,
-2.836474455, 1.209838297, NA, NA, 2.044938458, 1.149440575,
NA, NA, 3.903802699, -4.303427628, 1.53739861, NA, NA, 4.737052041,
-0.725491237, 1.082603294, NA, NA, 5.096593419, -1.476422528,
1.268093222, NA, NA, 4.917221697, 1.065878567, NA, NA, 3.961050137,
-0.895709997, 1.127817647, NA, NA, 3.231106918, -1.157293189,
-0.718431933, NA, NA, 2.922187554, -0.730445612, -0.80916175,
NA, NA, 3.118544617, -0.824516625, NA, NA, 2.72490884, -0.638092515,
-0.857196679, NA, NA), group = structure(c(1L, 1L, 1L, 3L, 2L,
1L, 1L, 1L, 3L, 2L, 1L, 1L, 3L, 2L, 1L, 1L, 1L, 3L, 2L, 1L, 1L,
1L, 3L, 2L, 1L, 1L, 1L, 3L, 2L, 1L, 1L, 3L, 2L, 1L, 1L, 1L, 3L,
2L, 1L, 1L, 1L, 3L, 2L, 1L, 1L, 1L, 3L, 2L, 1L, 1L, 3L, 2L, 1L,
1L, 1L, 3L, 2L, 1L, 1L, 1L, 3L, 2L, 1L, 1L, 1L, 3L, 2L, 1L, 1L,
3L, 2L, 1L, 1L, 1L, 3L, 2L, 1L, 1L, 1L, 3L, 2L, 1L, 1L, 1L, 3L,
2L, 1L, 1L, 3L, 2L, 1L, 1L, 1L, 3L, 2L, 1L, 1L, 1L, 3L, 2L, 1L,
1L, 1L, 3L, 2L, 1L, 1L, 3L, 2L, 1L, 1L, 1L, 3L, 2L, 1L, 1L, 1L,
3L, 2L, 1L, 1L, 1L, 3L, 2L, 1L, 1L, 3L, 2L, 1L, 1L, 1L, 3L, 2L,
1L, 1L, 1L, 3L, 2L, 1L, 1L, 1L, 3L, 2L, 1L, 1L, 3L, 2L, 1L, 1L,
1L, 3L, 2L, 1L, 1L, 1L, 3L, 2L, 1L, 1L, 1L, 3L, 2L, 1L, 1L, 3L,
2L, 1L, 1L, 1L, 3L, 2L, 1L, 1L, 1L, 3L, 2L, 1L, 1L, 1L, 3L, 2L,
1L, 1L, 3L, 2L, 1L, 1L, 1L, 3L, 2L, 1L, 1L, 1L, 3L, 2L, 1L, 1L,
1L, 3L, 2L, 1L, 1L, 3L, 2L, 1L, 1L, 1L, 3L, 2L), .Label = c("fixed",
"Residual", "SiteID.x"), class = "factor"), by_2sd = c(TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE),
model = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 6L,
6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 8L, 9L, 9L,
9L, 9L, 9L, 10L, 10L, 10L, 10L, 10L, 11L, 11L, 11L, 11L,
12L, 12L, 12L, 12L, 12L, 13L, 13L, 13L, 13L, 13L, 14L, 14L,
14L, 14L, 14L, 15L, 15L, 15L, 15L, 16L, 16L, 16L, 16L, 16L,
17L, 17L, 17L, 17L, 17L, 18L, 18L, 18L, 18L, 18L, 19L, 19L,
19L, 19L, 20L, 20L, 20L, 20L, 20L, 21L, 21L, 21L, 21L, 21L,
22L, 22L, 22L, 22L, 22L, 23L, 23L, 23L, 23L, 24L, 24L, 24L,
24L, 24L, 25L, 25L, 25L, 25L, 25L, 26L, 26L, 26L, 26L, 26L,
27L, 27L, 27L, 27L, 28L, 28L, 28L, 28L, 28L, 29L, 29L, 29L,
29L, 29L, 30L, 30L, 30L, 30L, 30L, 31L, 31L, 31L, 31L, 32L,
32L, 32L, 32L, 32L, 33L, 33L, 33L, 33L, 33L, 34L, 34L, 34L,
34L, 34L, 35L, 35L, 35L, 35L, 36L, 36L, 36L, 36L, 36L, 37L,
37L, 37L, 37L, 37L, 38L, 38L, 38L, 38L, 38L, 39L, 39L, 39L,
39L, 40L, 40L, 40L, 40L, 40L, 41L, 41L, 41L, 41L, 41L, 42L,
42L, 42L, 42L, 42L, 43L, 43L, 43L, 43L, 44L, 44L, 44L, 44L,
44L), .Label = c("AcariInsecticideFor", "AcariNeonicFor",
"AcariNullFor", "AcariPesticideFor", "AraneaeInsecticideFor",
"AraneaeNeonicFor", "AraneaeNullFor", "AraneaePesticideFor",
"ColeopteraInsecticideFor", "ColeopteraNeonicFor", "ColeopteraNullFor",
"ColeopteraPesticideFor", "DipteraInsecticideFor", "DipteraNeonicFor",
"DipteraNullFor", "DipteraPesticideFor", "HemipteraInsecticideFor",
"HemipteraNeonicFor", "HemipteraNullFor", "HemipteraPesticideFor",
"HomopteraInsecticideFor", "HomopteraNeonicFor", "HomopteraNullFor",
"HomopteraPesticideFor", "HymenopteraInsecticideFor",
"HymenopteraNeonicFor",
"HymenopteraNullFor", "HymenopteraPesticideFor",
"LepidopteraInsecticideFor",
"LepidopteraNeonicFor", "LepidopteraNullFor", "LepidopteraPesticideFor",
"OdonataInsecticideFor", "OdonataNeonicFor", "OdonataNullFor",
"OdonataPesticideFor", "OrthopteraInsecticideFor", "OrthopteraNeonicFor",
"OrthopteraNullFor", "OrthopteraPesticideFor",
"ThysanopteraInsecticideFor",
"ThysanopteraNeonicFor", "ThysanopteraNullFor", "ThysanopteraPesticideFor"
), class = "factor"), Order = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L,
9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L,
9L, 9L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L,
10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 11L, 11L, 11L,
11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L,
11L, 11L, 11L, 11L), .Label = c("Acari", "Araneae", "Coleoptera",
"Diptera", "Hemiptera", "Homoptera", "Hymenoptera", "Lepidoptera",
"Odonata", "Orthoptera", "Thysanoptera"), class = "factor")), row.names =
c(NA,
-209L), class = "data.frame")
Plot code:
#required packages
library(dotwhisker)
library(broom)
library(lme4)
m123456_df<-read.csv("C:/Users/breiley/Desktop/m123456_df.csv")
m123456_df$Order=as.factor(m123456_df$Order)
# Relabel predictors (they will appear as facet labels)
m123456_df <- m123456_df %>%
relabel_predictors(c("(Intercept)" = "Intercept",
neonicarea = "Neonictinoid",
insecticidearea= "All insecticide",
pesticidearea = "All pesticide" ))
m123456_df$Order=as.factor(m123456_df$Order)
# Generate a 'small multiple' plot
small_multiple(m123456_df) +
theme_bw() + theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank())+ylab("Coefficient estimate") +
geom_hline(yintercept = 0, colour = "grey60", linetype = 2) +
scale_colour_hue(name = "Order",
breaks = c(0,1,2,3,4,5,6,7,8,9,10),
labels = c("Acari",
"Araneae","Coleoptera","Diptera","Hemiptera","Homoptera",
"Hymenoptera","Lepidoptera","Odonata","Orthoptera","Thysanoptera"))+
theme(legend.position = c(0.02, 0.008),
legend.justification=c(0, 0),legend.title = element_text(size=8),
legend.background = element_rect(color="gray90"),
legend.spacing = unit(-4, "pt"),
legend.key.size = unit(10, "pt"))
ggtitle("Arthropod temporal trends") +
theme(plot.title = element_text(face =
"bold"))+scale_colour_discrete(na.translate = F)
ggsave("C:/Users/breiley/Desktop/ForestPesticide.png",width=10,
height=10,dpi=300)

Speeding up a loop (extracting specific values from a data frame)

My task is to extract all values in a column "2" after sorting by factor level in another column "3" (for the interested, i am sorting fasta sequences by organism). I am using this very simple code to get what i need.
df <- read.table("outfile.txt", fill=T) # the original output file includes many empty cells
# df is availabe at the bottom of this post
# splitting by factors
list1 <- split(df, df$V3)
# extract all values in column 2
list2 <- lapply(list1, function(x) as.data.frame(x$V2))
# writing results to file
for (x in names(list2))
write.table(list2[[x]], file=paste(x,".txt"), quote=F, row.names = F, col.names=F)
The works well on a small df. However, the output file contains several gigabytes of data. I tried a subset (500,000 rows on my local machine with 8GB RAM), but the second command is extremely slow (or R just hangs).
So i wondered and am asking the community, if there is a better way to solve this. Thank you.
Here is df:
dput(df)
structure(list(V1 = structure(c(1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L), .Label = c("C", "U"), class = "factor"),
V2 = structure(c(10L, 2L, 27L, 29L, 25L, 32L, 28L, 39L, 40L,
22L, 8L, 7L, 19L, 38L, 15L, 3L, 16L, 26L, 34L, 13L, 17L,
18L, 14L, 41L, 44L, 12L, 45L, 46L, 5L, 1L, 31L, 4L, 37L,
11L, 43L, 20L, 21L, 30L, 23L, 35L, 24L, 42L, 9L, 33L, 36L,
6L), .Label = c("M02978:20:000000000-B8C4P:1:1101:11008:4137",
"M02978:20:000000000-B8C4P:1:1101:14389:3444", "M02978:20:000000000-B8C4P:1:1101:14986:3769",
"M02978:20:000000000-B8C4P:1:1101:15333:4161", "M02978:20:000000000-B8C4P:1:1101:15438:4092",
"M02978:20:000000000-B8C4P:1:1101:15516:4514", "M02978:20:000000000-B8C4P:1:1101:16313:3660",
"M02978:20:000000000-B8C4P:1:1101:16433:3650", "M02978:20:000000000-B8C4P:1:1101:16663:4462",
"M02978:20:000000000-B8C4P:1:1101:17179:3407", "M02978:20:000000000-B8C4P:1:1101:17779:4225",
"M02978:20:000000000-B8C4P:1:1101:18008:3981", "M02978:20:000000000-B8C4P:1:1101:18047:3851",
"M02978:20:000000000-B8C4P:1:1101:18920:3936", "M02978:20:000000000-B8C4P:1:1101:19086:3737",
"M02978:20:000000000-B8C4P:1:1101:19203:3783", "M02978:20:000000000-B8C4P:1:1101:19335:3908",
"M02978:20:000000000-B8C4P:1:1101:19520:3921", "M02978:20:000000000-B8C4P:1:1101:19612:3701",
"M02978:20:000000000-B8C4P:1:1101:19655:4289", "M02978:20:000000000-B8C4P:1:1101:19918:4313",
"M02978:20:000000000-B8C4P:1:1101:20321:3602", "M02978:20:000000000-B8C4P:1:1101:21089:4350",
"M02978:20:000000000-B8C4P:1:1101:22293:4406", "M02978:20:000000000-B8C4P:1:1101:22453:3490",
"M02978:20:000000000-B8C4P:1:1101:23026:3811", "M02978:20:000000000-B8C4P:1:1101:23065:3472",
"M02978:20:000000000-B8C4P:1:1101:23770:3507", "M02978:20:000000000-B8C4P:1:1101:23991:3472",
"M02978:20:000000000-B8C4P:1:1101:24290:4332", "M02978:20:000000000-B8C4P:1:1101:24415:4142",
"M02978:20:000000000-B8C4P:1:1101:25066:3498", "M02978:20:000000000-B8C4P:1:1101:25678:4466",
"M02978:20:000000000-B8C4P:1:1101:25992:3830", "M02978:20:000000000-B8C4P:1:1101:26431:4388",
"M02978:20:000000000-B8C4P:1:1101:26573:4479", "M02978:20:000000000-B8C4P:1:1101:5567:4179",
"M02978:20:000000000-B8C4P:1:1101:6323:3723", "M02978:20:000000000-B8C4P:1:1101:6675:3536",
"M02978:20:000000000-B8C4P:1:1101:6868:3559", "M02978:20:000000000-B8C4P:1:1101:7078:3965",
"M02978:20:000000000-B8C4P:1:1101:8145:4431", "M02978:20:000000000-B8C4P:1:1101:8449:4257",
"M02978:20:000000000-B8C4P:1:1101:8592:3966", "M02978:20:000000000-B8C4P:1:1101:9468:4026",
"M02978:20:000000000-B8C4P:1:1101:9970:4051"), class = "factor"),
V3 = c(926550L, 0L, 1121396L, 406818L, 1265505L, 1167006L,
1121399L, 0L, 177437L, 0L, 1536652L, 0L, 1196029L, 0L, 1178540L,
138119L, 0L, 1536652L, 186802L, 0L, 1322246L, 1232437L, 1196029L,
1121396L, 452637L, 0L, 0L, 0L, 1541959L, 1121403L, 96561L,
1167006L, 767528L, 0L, 0L, 653733L, 1423815L, 857293L, 0L,
0L, 0L, 468059L, 1167006L, 1232437L, 880073L, 761193L), V4 = c(171L,
NA, 264L, 88L, 356L, 257L, 128L, NA, 97L, NA, 243L, NA, 96L,
NA, 80L, 93L, NA, 138L, 155L, NA, 243L, 262L, 77L, 470L,
135L, NA, NA, NA, 124L, 161L, 211L, 202L, 91L, NA, NA, 146L,
98L, 93L, NA, NA, NA, 107L, 382L, 247L, 130L, 157L), V5 = structure(c(25L,
1L, 2L, 17L, 9L, 5L, 3L, 1L, 16L, 1L, 14L, 1L, 7L, 1L, 6L,
11L, 1L, 14L, 24L, 1L, 10L, 8L, 7L, 2L, 18L, 1L, 1L, 1L,
15L, 4L, 26L, 5L, 13L, 1L, 1L, 20L, 12L, 22L, 1L, 1L, 1L,
19L, 5L, 8L, 23L, 21L), .Label = c("", "1121396,", "1121399,",
"1121403,", "1167006,", "1178540,", "1196029,", "1232437,",
"1265505,", "1322246,", "138119,", "1423815,", "1460634,1460635,",
"1536652,", "1541959,", "177437,", "406818,", "452637,",
"468059,", "653733,", "761193,", "857293,", "880073,", "883109,888727,1161902,1230734,1392487,",
"926550,", "96561,"), class = "factor")), .Names = c("V1",
"V2", "V3", "V4", "V5"), class = "data.frame", row.names = c(NA,
-46L))

using data.table package combined with write.table.
order by V3 and then write the V2 columns separately for each group in V3.
library('data.table')
setDT(df)[ order(V3), write.table(V2, file = paste0( V3, ".txt")), by = V3]

This worked for me but I cannot speak for how fast it would be on your machine.
lapply(unique(df$V3), function(x) write.table(df[which(df$V3 == x),]$V2, file = paste(x, ".txt", sep = ""), quote = FALSE, row.names = FALSE, col.names = FALSE))

Conditionally remove a row based on another id code

In a dataset which contains many ids, I am only trying to manipulate rows which have id 7 or 9, and leave everything else untouched.
I am trying to conditionally remove a row from 7 or 9 in all instances where there isn't a variable that corresponds to it. So, if in the case of the dput example below, I want to remove the ninth row from id=9 because id=7 does not have an itemcode=2. Vice versa for id=7, I am trying to remove its itemcode=9 because id=9 does not have it.
id client item itemcode unit X2001 X2002 X2003 X2004 X2005 X2006 X2007
...
7 7 Bob eighth 8 100 13 18 15 NA NA NA NA
8 7 Bob ninth 9 100 11 21 10 NA NA NA NA
9 9 Bob_new first 1 100 NA NA NA 23 18 25 18
Code:
structure(list(id = c(7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 9L, 9L,
9L, 9L, 9L, 9L, 9L, 9L, 10L), client = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L), .Label = c("Bob",
"Bob_new", "Mark"), class = "factor"), item = structure(c(3L,
9L, 4L, 2L, 8L, 7L, 1L, 5L, 3L, 6L, 9L, 4L, 2L, 8L, 7L, 1L, 3L
), .Label = c("eighth", "fifth", "first", "fourth", "ninth",
"second", "seventh", "sixth", "third"), class = "factor"), itemcode = c(1L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 1L
), unit = c(100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L,
100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L), X2001 = structure(c(5L,
6L, 1L, 4L, 2L, 5L, 3L, 1L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L
), .Label = c("11", "12", "13", "22", "24", "25", "NA"), class = "factor"),
X2002 = structure(c(4L, 8L, 1L, 3L, 7L, 2L, 5L, 6L, 9L, 9L,
9L, 9L, 9L, 9L, 9L, 9L, 9L), .Label = c("13", "14", "15",
"17", "18", "21", "22", "24", "NA"), class = "factor"), X2003 = structure(c(5L,
1L, 4L, 2L, 6L, 1L, 3L, 1L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L), .Label = c("10", "11", "15", "19", "23", "24", "NA"), class = "factor"),
X2004 = structure(c(7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 5L, 4L,
2L, 6L, 1L, 3L, 4L, 3L, 4L), .Label = c("11", "14", "15",
"20", "23", "25", "NA"), class = "factor"), X2005 = structure(c(6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 3L, 2L, 4L, 3L, 5L, 3L, 1L, 4L,
3L), .Label = c("11", "13", "18", "19", "25", "NA"), class = "factor"),
X2006 = structure(c(9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 8L, 6L,
1L, 2L, 5L, 3L, 7L, 8L, 4L), .Label = c("10", "15", "18",
"19", "20", "22", "23", "25", "NA"), class = "factor"), X2007 = structure(c(8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 4L, 7L, 6L, 2L, 4L, 1L, 5L, 5L,
3L), .Label = c("12", "13", "16", "18", "19", "21", "24",
"NA"), class = "factor")), .Names = c("id", "client", "item",
"itemcode", "unit", "X2001", "X2002", "X2003", "X2004", "X2005",
"X2006", "X2007"), class = "data.frame", row.names = c(NA, -17L
))
————————————————————————————————————————
ANOTHER SCENARIO:
before:
structure(list(id = c(7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 10L), client = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 3L), .Label = c("Bob", "Bob_new", "Mark"), class = "factor"),
item = structure(c(3L, 9L, 10L, 9L, 4L, 2L, 8L, 7L, 7L, 1L,
5L, 3L, 6L, 9L, 4L, 2L, 8L, 7L, 1L, 3L), .Label = c("eighth",
"fifth", "first", "fourth", "ninth", "second", "seventh",
"sixth", "third", "third "), class = "factor"), itemcode = c(1L,
3L, 3L, 3L, 4L, 5L, 6L, 7L, 7L, 8L, 9L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 1L), type = structure(c(1L, 1L, 2L, 3L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("A",
"B", "C"), class = "factor"), unit = c(100L, 100L, 100L,
100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L,
100L, 100L, 100L, 100L, 100L, 100L, 100L), X2001 = c(24L,
25L, 30L, 26L, 11L, 22L, 12L, 25L, 24L, 13L, 11L, NA, NA,
NA, NA, NA, NA, NA, NA, NA), X2002 = c(17L, 24L, 12L, 96L,
13L, 15L, 22L, 21L, 14L, 18L, 21L, NA, NA, NA, NA, NA, NA,
NA, NA, NA), X2003 = c(23L, 10L, 46L, 94L, 19L, 11L, 24L,
19L, 10L, 15L, 10L, NA, NA, NA, NA, NA, NA, NA, NA, NA),
X2004 = c(NA, NA, 43L, 83L, NA, NA, NA, 6L, NA, NA, NA, 23L,
20L, 14L, 25L, 11L, 15L, 20L, 15L, 20L), X2005 = c(NA, NA,
97L, 86L, NA, NA, NA, 17L, NA, NA, NA, 18L, 13L, 19L, 18L,
25L, 18L, 11L, 19L, 18L), X2006 = c(NA, NA, 11L, 91L, NA,
NA, NA, 11L, NA, NA, NA, 25L, 22L, 10L, 15L, 20L, 18L, 23L,
25L, 19L), X2007 = c(NA, NA, 19L, 27L, NA, NA, NA, 15L, NA,
NA, NA, 18L, 24L, 21L, 13L, 18L, 12L, 19L, 19L, 16L)), .Names = c("id",
"client", "item", "itemcode", "type", "unit", "X2001", "X2002",
"X2003", "X2004", "X2005", "X2006", "X2007"), class = "data.frame", row.names = c(NA,
-20L))
after:
structure(list(id = c(7L, 7L, 7L, 7L, 7L, 7L, 9L, 9L, 9L, 9L,
9L, 9L, 10L), client = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 3L), .Label = c("Bob", "Bob_new", "Mark"), class = "factor"),
item = structure(c(2L, 7L, 3L, 1L, 5L, 4L, 2L, 6L, 3L, 1L,
5L, 4L, 2L), .Label = c("fifth", "first", "fourth", "seventh",
"sixth", "third", "third "), class = "factor"), itemcode = c(1L,
3L, 4L, 5L, 6L, 7L, 1L, 3L, 4L, 5L, 6L, 7L, 1L), type = structure(c(1L,
2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L), .Label = c("A",
"B"), class = "factor"), unit = c(100L, 100L, 100L, 100L,
100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L), X2001 = c(24L,
10L, 11L, 22L, 12L, 17L, NA, NA, NA, NA, NA, NA, NA), X2002 = c(17L,
87L, 13L, 15L, 22L, 19L, NA, NA, NA, NA, NA, NA, NA), X2003 = c(23L,
47L, 19L, 11L, 24L, 17L, NA, NA, NA, NA, NA, NA, NA), X2004 = c(NA,
28L, NA, NA, NA, 28L, 23L, 14L, 25L, 11L, 15L, 20L, 20L),
X2005 = c(NA, 43L, NA, NA, NA, 16L, 18L, 19L, 18L, 25L, 18L,
11L, 18L), X2006 = c(NA, 69L, NA, NA, NA, 5L, 25L, 10L, 15L,
20L, 18L, 23L, 19L), X2007 = c(NA, 72L, NA, NA, NA, 20L,
18L, 21L, 13L, 18L, 12L, 19L, 16L)), .Names = c("id", "client",
"item", "itemcode", "type", "unit", "X2001", "X2002", "X2003",
"X2004", "X2005", "X2006", "X2007"), class = "data.frame", row.names = c(NA,
-13L))
I could implement the said filter code to remove items which do not exist in its corresponding place (id 7 and 9).
But if there are sub levels for items, like type of item. I am also trying to remove items if they don't have a type similar in the corresponding field.

You could use filter from dplyr
library(dplyr)
filter(df_all, itemcode %in% intersect(itemcode[id==7],
itemcode[id==9])|!id %in% c(7,9) )
# id client item itemcode unit X2001 X2002 X2003 X2004 X2005 X2006 X2007
#1 7 Bob first 1 100 24 17 23 NA NA NA NA
#2 7 Bob third 3 100 25 24 10 NA NA NA NA
#3 7 Bob fourth 4 100 11 13 19 NA NA NA NA
#4 7 Bob fifth 5 100 22 15 11 NA NA NA NA
#5 7 Bob sixth 6 100 12 22 24 NA NA NA NA
#6 7 Bob seventh 7 100 24 14 10 NA NA NA NA
#7 7 Bob eighth 8 100 13 18 15 NA NA NA NA
#8 9 Bob_new first 1 100 NA NA NA 23 18 25 18
#9 9 Bob_new third 3 100 NA NA NA 14 19 10 21
#10 9 Bob_new fourth 4 100 NA NA NA 25 18 15 13
#11 9 Bob_new fifth 5 100 NA NA NA 11 25 20 18
#12 9 Bob_new sixth 6 100 NA NA NA 15 18 18 12
#13 9 Bob_new seventh 7 100 NA NA NA 20 11 23 19
#14 9 Bob_new eighth 8 100 NA NA NA 15 19 25 19
#15 10 Mark first 1 100 NA NA NA 20 18 19 16
Update
Based on the new dataset, perhaps this helps
library(dplyr)
library(tidyr)
dfnew %>%
unite(itemtype, itemcode,type) %>%
filter(itemtype %in% intersect(itemtype[id==7],
itemtype[id==9])|!id %in% c(7,9)) %>%
separate(itemtype, c('itemcode', 'type'))
# id client item itemcode type unit X2001 X2002 X2003 X2004 X2005 X2006
# 1 7 Bob first 1 A 100 24 17 23 NA NA NA
# 2 7 Bob third 3 B 100 30 12 46 43 97 11
# 3 7 Bob fourth 4 A 100 11 13 19 NA NA NA
# 4 7 Bob fifth 5 A 100 22 15 11 NA NA NA
# 5 7 Bob sixth 6 A 100 12 22 24 NA NA NA
# 6 7 Bob seventh 7 A 100 25 21 19 6 17 11
# 7 9 Bob_new first 1 A 100 NA NA NA 23 18 25
# 8 9 Bob_new third 3 B 100 NA NA NA 14 19 10
# 9 9 Bob_new fourth 4 A 100 NA NA NA 25 18 15
# 10 9 Bob_new fifth 5 A 100 NA NA NA 11 25 20
# 11 9 Bob_new sixth 6 A 100 NA NA NA 15 18 18
# 12 9 Bob_new seventh 7 A 100 NA NA NA 20 11 23
# 13 10 Mark first 1 A 100 NA NA NA 20 18 19
# X2007
#1 NA
#2 19
#3 NA
#4 NA
#5 NA
#6 15
#7 18
#8 21
#9 13
#10 18
#11 12
#12 19
#13 16

If I understand the problem: every itemcode in id=9 subset must have identical itemcode in id=7 subset (and reverse). If it is not the case then we filter the row with the non-pair itemcode out, but leave everything with id not in 7 or 9. Here is one way of doing it:
First get common item codes:
items_9 <- df_all$itemcode[ df_all$id==9 ]
items_7 <- df_all$itemcode[ df_all$id==7 ]
items_common <- items_9[ items_9 %in% items_7 ]
select everything with common itemcodes for 7 and 9 and the rest:
df_new <- df_all[
which(
( df_all$id %in% c(7, 9) &
df_all$itemcode %in% items_common
) |
!df_all$id %in% c(7,9)
)
,]

library(dplyr)
df$remove <- paste(df$itemcode, df$type)
df<-invisible(filter(df,
remove %in% intersect(remove[type==7],
remove[type==9])|!type %in% c(7,9) ))
#Remove the additional column after filter
df$remove <- NULL

You could do something like this, which runs setdiff in both directions. The cl() function wasn't really necessary, but I really don't like writing the same expression over and over again.
f <- function(x, y) setdiff(union(x, y), x)
cl <- function(var) substitute(df$itemcode[df$id == x], list(x = var))
So now you can call f() on c(id7, id9) and then reverse it and get the c(id9, id7) result.
do.call(f, x <- list(cl(7), cl(9)))
# [1] 2
do.call(f, rev(x))
# [1] 9

R - Lattice xyplot - How do you add error bars to groups and summary lines?

I'm posting this question because the very similar question here has not been answered until now.
I have been asked to plot the mean +/- SEM of my whole cohort of patients over the xyplot() that depicts the values of all patients. The data used represents intraoperative cardiovascular findings from patients undergoing surgery.
This is my data.frame called df
dput(df)
structure(list(Name = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L,
5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 4L, 4L, 4L, 4L, 4L, 4L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L), .Label = c("DE", "JS", "KG", "MK", "TG", "WT"), class = "factor"),
Time = structure(c(1L, 2L, 3L, 4L, 5L, 7L, 8L, 1L, 2L, 3L,
4L, 7L, 8L, 1L, 2L, 3L, 4L, 5L, 7L, 8L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 2L, 3L, 4L, 5L, 7L, 8L, 1L, 2L, 3L, 4L, 5L, 6L,
7L, 8L), .Label = c("T1", "T2", "T3", "T4", "T5", "T6", "T7",
"T8"), class = "factor"), Dobut = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L), .Label = c("No", "Yes"
), class = "factor"), DobutDose = c(NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
4L, 6L, 8L, 8L, 8L, 8L, 8L, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, 5L, 5L, NA), CI = c(1.4, 2.3, 1.3, 1.8, 2.1,
2, 2.1, 2.1, 2.3, 1.9, 1.6, 2, 2.4, 2.7, 2.6, 2.7, 2.6, 2.3,
2.4, 2.6, 0.9, 2.5, 2.1, 1.6, 1.5, 1.8, 2, 2, 1.9, 2.1, 2.3,
2, 2.4, 2.3, 2.6, 2.4, 2, 2.2, 1.6, 2.1, 2.5, 2.8), SvO2 = c(57L,
65L, 47L, 45L, 51L, 60L, 56L, 70L, 85L, 75L, 79L, 82L, 73L,
77L, 78L, 73L, 71L, 73L, 80L, 74L, 41L, 66L, 51L, 51L, 49L,
54L, 68L, 48L, 80L, 70L, 71L, 69L, 74L, 79L, 77L, 77L, 75L,
74L, 70L, 79L, 80L, 79L), SVRI = c(4000L, 1983L, 4000L, 2444L,
1981L, 2120L, 2514L, 2971L, 2157L, 3747L, 4300L, 3200L, 2867L,
1778L, 1169L, 1215L, 1262L, 1461L, 1600L, 1692L, 4978L, 1760L,
2019L, 2650L, 2827L, 2356L, 1800L, 2840L, 2063L, 2248L, 1948L,
2160L, 1733L, 2296L, 2677L, 2100L, 2640L, 2655L, 3950L, 2210L,
2848L, 2543L), MAP = c(80L, 65L, 86L, 74L, 67L, 65L, 74L,
90L, 70L, 90L, 96L, 94L, 100L, 82L, 60L, 61L, 62L, 62L, 69L,
71L, 70L, 71L, 77L, 73L, 75L, 77L, 61L, 85L, 65L, 74L, 70L,
67L, 69L, 74L, 92L, 71L, 88L, 93L, 89L, 79L, 97L, 97L), CVP = c(10L,
8L, 21L, 19L, 15L, 12L, 8L, 12L, 8L, 11L, 10L, 14L, 14L,
22L, 22L, 20L, 21L, 20L, 21L, 16L, 14L, 16L, 24L, 20L, 22L,
24L, 16L, 14L, 16L, 15L, 14L, 13L, 17L, 8L, 5L, 8L, 22L,
20L, 20L, 21L, 8L, 8L), PAP = c(23L, 22L, 36L, 36L, 34L,
32L, 22L, 33L, 28L, 36L, 36L, 40L, 37L, 37L, 40L, 35L, 35L,
34L, 38L, 36L, 45L, 43L, 55L, 49L, 52L, 54L, 43L, 47L, 27L,
25L, 23L, 22L, 28L, 21L, 20L, 25L, 33L, 33L, 38L, 35L, 33L,
29L), PCWP = c(15L, 11L, 28L, 26L, 23L, 21L, 11L, 26L, NA,
NA, 25L, 25L, NA, 27L, NA, NA, NA, NA, NA, NA, 30L, NA, NA,
NA, NA, NA, NA, NA, 19L, NA, NA, NA, NA, NA, 16L, NA, NA,
NA, NA, NA, NA, NA)), .Names = c("Name", "Time", "Dobut",
"DobutDose", "CI", "SvO2", "SVRI", "MAP", "CVP", "PAP", "PCWP"
), class = "data.frame", row.names = c(NA, -42L))
Now the first xyplot I made for the variable CI looks like this
require(lattice)
xyplot(CI~Time, groups=Name, data=df, ty=c("l", "p"),
+ ,xlab="Measurement Time Point",
ylab=expression("CI"~(l/min/m^"2")), main="Cardiac Index")
Now I was able to add the mean (black line) of the whole cohort, by doing the following
xyplot(CI~Time, groups=Name, data=df, ty=c("l", "p"),
panel = function(x, y, ...) {
panel.xyplot(x, y, ...)
panel.linejoin(x, y, horizontal = FALSE,..., col="black", lty=1, lwd=4)
}
,xlab="Measurement Time Point",
ylab=expression("CI"~(l/min/m^"2")), main="Cardiac Index")
Now I'd like to add +/- SE to the mean as a line above/below the mean, but nowhere can I find how to do this.
What I can do is using the latticeExtra package is add the loess line +/- SE, as below, but that's not the correct mathematical function I'm looking for. I've left the mean line in there to illustrate the difference between the two.
require(latticeExtra)
xyplot(CI~Time, groups=Name, data=df, ty=c("l", "p"),
+ panel = function(x, y, ...) {
+ panel.xyplot(x, y, ...)
+ panel.linejoin(x, y, horizontal = FALSE,..., col="black", lty=1, lwd=4)
+ panel.smoother(x,y,se=TRUE, col.se="grey")
+ }
+ ,xlab="Measurement Time Point",
ylab=expression("CI"~(l/min/m^"2")), main="Cardiac Index")
I have performed an extensive search through SO and the internet, but I haven't been able to find the right function to do this.
Help is very much appreciated! Thanks.

You could create your own panel function to plot a +/- SD region. For example
#new panel function
panel.se <- function(x, y, col.se=plot.line$col, alpha.se=.25, ...) {
plot.line <- trellis.par.get("plot.line")
xs <- if(is.factor(x)) {
factor(c(levels(x) , rev(levels(x))), levels=levels(x))
} else {
xx <- sort(unique(x))
c(xx, rev(xx))
}
means <- tapply(y,x, mean, na.rm=T)
vars <- tapply(y,x, var, na.rm=T)
Ns <- tapply(!is.na(y),x, sum)
ses <- sqrt(vars/Ns)
panel.polygon(xs, c(means+ses, rev(means-ses)), col=col.se, alpha=alpha.se)
}
and then you can use it like
#include new panel function
xyplot(CI~Time, groups=Name, data=df, ty=c("l", "p"),
panel = function(x, y, ...) {
panel.se(x,y, col.se="grey")
panel.xyplot(x, y, ...)
panel.linejoin(x, y, horizontal = FALSE,..., col="black", lty=1, lwd=4)
}
,xlab="Measurement Time Point",
ylab=expression("CI"~(l/min/m^"2")), main="Cardiac Index")
which results in

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

How to remove observations from multiple dataframes and keep as multiple dataframes - r

Related

How to re order datas in ggplot2

scale_colour_hue issue getting Error: Discrete value supplied to continuous scale

Speeding up a loop (extracting specific values from a data frame)

Conditionally remove a row based on another id code

R - Lattice xyplot - How do you add error bars to groups and summary lines?

Categories

Resources