Conditionally subsetting a list of dataframes in R - r

I have a list of dataframes called myList (see sample below) and all I want is to subset that list of dataframes by the condition that only rows with a "pointNum" > 100 are included in the new list. Should be easy but I just can't get it to work. So the output should look like this for the first item on the list:
[[1]]
study Identi locDate locNumb meanLat meanLon pointNum
5 study 1 SDU101 2011-07-13 49 32.8837771221667 -117.24038866075 120
9 study 1 SDU101 2011-07-13 60 32.8838778530086 -117.240522195673 349
11 study 1 SDU101 2011-07-13 321 32.8027296698536 -117.210527201581 683
I've been trying to get this to work, and other similar subsetting options. It currently runs but doesn't do anything:
newList = lapply(myList, function(x) { subset(x, "pointNum" > 2)} )
I know that similar questions have been posted, but I couldn't get any of those solutions to to work for my particular problem. Any help would be greatly appreciated.
myList <- list(structure(list(study = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "Study 1", class = "factor"),
Identi = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L), .Label = "SDU101", class = "factor"),
locDate = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L), .Label = "2011-07-13", class = "factor"),
locNumb = structure(c(12L, 15L, 1L, 2L, 8L, 9L, 10L, 11L,
13L, 14L, 3L, 4L, 5L, 6L, 7L), .Label = c("10", "11", "321",
"323", "324", "326", "329", "49", "56", "57", "59", "6",
"60", "61", "7"), class = "factor"), meanLat = structure(c(11L,
10L, 4L, 9L, 6L, 8L, 3L, 5L, 7L, 12L, 1L, 15L, 13L, 14L,
2L), .Label = c("32.8027296698536", "32.802755201875", "32.883244695",
"32.8835599674286", "32.8837003266667", "32.8837771221667",
"32.8838778530086", "32.88411147", "32.88419565", "32.8841969254545",
"32.884720435", "32.8853723146154", "32.8853777533333", "32.8854051",
"32.9164754136842"), class = "factor"), meanLon = structure(c(13L,
10L, 12L, 15L, 9L, 8L, 7L, 4L, 11L, 6L, 2L, 3L, 14L, 5L,
1L), .Label = c("-117.210382870833", "-117.210527201581",
"-117.236141991053", "-117.239834913333", "-117.23989078",
"-117.240133633077", "-117.240140015", "-117.24022087", "-117.24038866075",
"-117.240416713636", "-117.240522195673", "-117.240532619714",
"-117.24062533", "-117.24063566", "-117.24070002"), class = "factor"),
pointNum = structure(c(6L, 2L, 9L, 1L, 3L, 1L, 6L, 7L, 8L,
4L, 11L, 5L, 7L, 1L, 10L), .Label = c("1", "11", "120", "13",
"19", "2", "3", "349", "35", "48", "683"), class = "factor")), .Names = c("study",
"Identi", "locDate", "locNumb", "meanLat", "meanLon", "pointNum"
), row.names = c(NA, -15L), class = "data.frame"), structure(list(
study = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = "Study 1", class = "factor"),
Identi = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = "SDU111", class = "factor"),
locDate = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = "2011-07-12", class = "factor"),
locNumb = structure(c(14L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L,
9L, 10L, 11L, 12L, 13L), .Label = c("354", "418", "419",
"420", "421", "422", "425", "426", "427", "428", "429", "430",
"432", "67"), class = "factor"), meanLat = structure(c(2L,
3L, 9L, 1L, 5L, 8L, 13L, 11L, 6L, 7L, 10L, 12L, 14L, 4L), .Label = c("32.8651107616667",
"32.86543857", "32.867004565", "32.868283279", "32.868857725",
"32.869014345", "32.8692111971429", "32.8693627126536", "32.8694241808955",
"32.8694814566667", "32.86955278", "32.8696187847619", "32.8696329253571",
"32.8698972233333"), class = "factor"), meanLon = structure(c(13L,
12L, 8L, 14L, 11L, 2L, 7L, 5L, 4L, 1L, 3L, 9L, 6L, 10L), .Label = c("-117.235456126857",
"-117.235585179972", "-117.235959423333", "-117.25006813",
"-117.25014399", "-117.250450876667", "-117.250467514464",
"-117.25050148", "-117.250773722857", "-117.2512085715",
"-117.25133879", "-117.25283091", "-117.254194355", "-117.254406255417"
), class = "factor"), pointNum = structure(c(2L, 2L, 11L,
5L, 2L, 8L, 9L, 1L, 2L, 7L, 6L, 4L, 10L, 3L), .Label = c("1",
"2", "20", "21", "24", "3", "35", "358", "56", "6", "67"), class = "factor")), .Names = c("study",
"Identi", "locDate", "locNumb", "meanLat", "meanLon", "pointNum"
), row.names = c(NA, -14L), class = "data.frame"))

You have two issues - extra quotes and your pointNum from your dput is a factor, so do this:
lapply(myList, function(x) { subset(x, as.integer(as.character(pointNum)) > 2)} )

Related

Read CSV file up to line with unique marker

I have many data sets that have extra information beyond a certain line. The files are all csv. I would be able to loop through them and read.csv with "skip" argument to clean the top of the data, but the length of the data frames are all different. The only commonality is the "--------------- ---------------- ------ -----" line in the Total column that separates the meaningful data from summaries and extraneous info below it.
Here's how I'm reading in the data without skip = 14 (which is standard across everything).
before<-read.csv("Example.csv", header = FALSE,
col.names = c("CountryID","Name","Type","Symbol","Code","Unit",
"Total", "Measurement", "Value", "Percent", "CO2" ))
However, the ----- marker maybe a different row, but it's the first thing to hit. Here's the data before:
structure(list(CountryID = structure(c(26L, 19L, 21L, 23L, 21L,
7L, 1L, 1L, 1L, 22L, 3L, 1L, 19L, 2L, 8L, 14L, 15L, 13L, 9L,
12L, 18L, 17L, 8L, 13L, 15L, 10L, 8L, 8L, 11L, 16L, 1L, 1L, 1L,
20L, 4L, 6L, 1L, 25L, 5L, 1L, 1L, 1L, 24L, 1L), .Label = c("",
"------------", "-------------", "---------------", "------------------",
" ", "08.15.1997", "10000", "15000", "200", "2000", "2500", "3000",
"45000", "5000", "7000", "8000", "8300", "Country", "Output",
"Production", "Quantity", "Serial Output", "TOTAL SUM", "Unaccounted",
"United Nations Data"), class = "factor"), Name = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 20L, 2L, 1L, 1L, 1L, 21L, 4L,
5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L,
1L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 19L, 1L, 1L, 1L, 1L), .Label = c("",
"--------------------", " ", "Bahrain", "Bangladesh", "Barbados",
"Belarus", "Belgium", "Belize", "Benin", "Bhutan", "Bolivia",
"Bosnia and Herzegovina", "Botswana", "Brazil", "Brunei", "Bulgaria",
"Burkina Faso", "Chad", "Name", "The Bahamas"), class = "factor"),
Type = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 5L, 4L,
2L, 1L, 1L, 1L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 1L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = c("", "----", " ", "Code", "Type",
"Unit"), class = "factor"), Symbol = structure(c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 20L, 22L, 2L, 1L, 1L, 1L, 4L, 5L,
6L, 7L, 9L, 8L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L,
19L, 1L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 21L, 1L, 1L, 1L,
1L), .Label = c("", "------------", " ", "BAHM", "BAHR",
"BANG", "BARB", "BELGM", "BELS", "BELZ", "BEN", "BHUT", "BOL",
"BOSHER", "BOTS", "BRAZ", "BRUN", "BULG", "BURKF", "Country",
"private", "Symbol"), class = "factor"), Code = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 19L, 2L, 1L, 1L, 1L, 12L,
15L, 11L, 17L, 4L, 13L, 14L, 9L, 18L, 10L, 5L, 16L, 3L, 7L,
8L, 6L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), .Label = c("", "------------", "1504944270", "2287368539",
"2388991307", "2453202442", "2561470743", "3205402223", "3221488867",
"3230369605", "3247578406", "3712013344", "4307638090", "462793263",
"4835205752", "4854959101", "5842098895", "5932776587", "Code"
), class = "factor"), Unit = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 17L, 16L, 2L, 1L, 1L, 1L, 7L, 9L, 10L, 14L,
12L, 15L, 15L, 11L, 13L, 3L, 8L, 13L, 15L, 6L, 5L, 9L, 1L,
1L, 1L, 1L, 1L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("",
"-------------", "100", "1109", "27", "35", "40", "45", "58",
"70", "74", "77", "79", "82", "95", "Output", "Per Unit"), class = "factor"),
Total = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 25L,
24L, 2L, 1L, 1L, 1L, 18L, 5L, 17L, 8L, 23L, 20L, 6L, 9L,
7L, 11L, 12L, 13L, 19L, 15L, 14L, 10L, 3L, 16L, 1L, 1L, 1L,
16L, 1L, 1L, 1L, 21L, 1L, 3L, 22L, 4L), .Label = c("", "---------------",
"--------------- ---------------- ------ -----",
"=============== ================ ====== =====",
"126912", "147431", "170553", "175973", "203728", "230761",
"293789", "304471", "376281", "386526", "399160", "4417002",
"476025", "478030", "502999", "51012", "5610654", "56406056",
"93351", "Output", "Total"), class = "factor"), Measurement = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 12L, 2L, 1L, 1L, 1L, 3L,
9L, 3L, 4L, 10L, 9L, 6L, 4L, 5L, 10L, 7L, 9L, 4L, 8L, 10L,
9L, 1L, 1L, 1L, 1L, 1L, 11L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), .Label = c("", "--------", "20", "23", "24", "26", "27",
"28", "29", "30", "420", "Measurement"), class = "factor"),
Value = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 22L,
23L, 2L, 1L, 1L, 1L, 5L, 19L, 11L, 8L, 3L, 18L, 13L, 6L,
4L, 9L, 14L, 17L, 7L, 10L, 12L, 15L, 1L, 16L, 1L, 1L, 1L,
16L, 1L, 1L, 1L, 20L, 1L, 1L, 21L, 1L), .Label = c("", "----------------",
"15150240", "15891735", "16083459", "16959919", "20350968",
"20909501", "21770264", "25121096", "27726279", "30024743",
"34069742", "34841369", "38498281", "468004111", "49524999",
"50512814", "50568702", "540650", "64506", "Country", "Value"
), class = "factor"), Percent = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 23L, 2L, 1L, 1L, 1L, 11L, 12L, 8L, 3L,
17L, 16L, 5L, 10L, 20L, 9L, 6L, 7L, 4L, 15L, 14L, 22L, 1L,
13L, 1L, 1L, 1L, 21L, 1L, 1L, 1L, 19L, 1L, 1L, 18L, 1L), .Label = c("",
"------", "102", "104", "106", "112", "126", "129", "142",
"15", "160", "177", "1775", "180", "191", "24", "25", "5640645",
"650163", "87", "887.5", "95", "Production Percent"), class = "factor"),
CO2 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 15L, 14L,
2L, 1L, 1L, 1L, 9L, 4L, 9L, 7L, 4L, 5L, 4L, 7L, 4L, 9L, 4L,
11L, 4L, 12L, 10L, 4L, 1L, 6L, 1L, 1L, 1L, 8L, 1L, 1L, 1L,
3L, 1L, 1L, 13L, 1L), .Label = c("", "-----", "?", "0", "0.2",
"0.6", "1", "19.4", "2", "2.2", "4", "5", "564065", "CO2",
"Cur."), class = "factor")), class = "data.frame", row.names = c(NA,
-44L))
And here's how I'm hoping it could look:
structure(list(CountryID = c(10000L, 45000L, 5000L, 3000L, 15000L,
2500L, 8300L, 8000L, 10000L, 3000L, 5000L, 200L, 10000L, 10000L,
2000L, 7000L), Name = structure(c(16L, 1L, 2L, 3L, 4L, 5L, 6L,
7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L), .Label = c("Bahrain",
"Bangladesh", "Barbados", "Belarus", "Belgium", "Belize", "Benin",
"Bhutan", "Bolivia", "Bosnia and Herzegovina", "Botswana", "Brazil",
"Brunei", "Bulgaria", "Burkina Faso", "The Bahamas"), class = "factor"),
Type = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L), .Label = "Unit", class = "factor"),
Symbol = structure(c(1L, 2L, 3L, 4L, 6L, 5L, 7L, 8L, 9L,
10L, 11L, 12L, 13L, 14L, 15L, 16L), .Label = c("BAHM", "BAHR",
"BANG", "BARB", "BELGM", "BELS", "BELZ", "BEN", "BHUT", "BOL",
"BOSHER", "BOTS", "BRAZ", "BRUN", "BULG", "BURKF"), class = "factor"),
Code = c(3712013344, 4835205752, 3247578406, 5842098895,
2287368539, 4307638090, 462793263, 3221488867, 5932776587,
3230369605, 2388991307, 4854959101, 1504944270, 2561470743,
3205402223, 2453202442), Unit = c(40L, 58L, 70L, 82L, 77L,
95L, 95L, 74L, 79L, 100L, 45L, 79L, 95L, 35L, 27L, 58L),
Total = c(478030L, 126912L, 476025L, 175973L, 93351L, 51012L,
147431L, 203728L, 170553L, 293789L, 304471L, 376281L, 502999L,
399160L, 386526L, 230761L), Measurement = c(20L, 29L, 20L,
23L, 30L, 29L, 26L, 23L, 24L, 30L, 27L, 29L, 23L, 28L, 30L,
29L), Value = c(16083459L, 50568702L, 27726279L, 20909501L,
15150240L, 50512814L, 34069742L, 16959919L, 15891735L, 21770264L,
34841369L, 49524999L, 20350968L, 25121096L, 30024743L, 38498281L
), Percent = c(160L, 177L, 129L, 102L, 25L, 24L, 106L, 15L,
87L, 142L, 112L, 126L, 104L, 191L, 180L, 95L), CO2 = c(2,
0, 2, 1, 0, 0.2, 0, 1, 0, 2, 0, 4, 0, 5, 2.2, 0)), class = "data.frame", row.names = c(NA,
-16L))
Can this be integrated into the read.csv argument, or is it easier to clean the bottom of it some other way.
Three thoughts:
Use readLines (as #user2554330 suggested), find/remove the specific row, filter it, then parse the text vector with read.csv, the least of the three.
before[seq_len(min(head(which(!grepl("^[^- ]+$", before$Total)),1)-1L,nrow(before))),]; a bit complicated, granted, but it does what you need (assuming that you've already filtered the first 14 rows with skip=.
Use an external script such as sed -e '1,14d;/^[ -]\+$/{g;q;} in a pipe(...)-type thing.
Read it twice. The first time, use readLines("Example.csv"), and look through the lines for the marker of the end of data. Say it's on line n. Then in the second read, use
read.csv("Example.csv", header = FALSE,
col.names = c("CountryID","Name","Type","Symbol","Code","Unit",
"Total", "Measurement", "Value", "Percent", "CO2" ), nrows = n - 1)
(or maybe nrows will need to be a different value, if you're skipping some).

plot area truncated when using geom_dotplot

consider the following example data:
ex = structure(list(group = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 1L,
2L, 3L, 4L, 6L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 6L, 1L,
2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 6L,
1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L,
5L, 6L, 1L, 2L, 1L, 2L, 3L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 5L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 5L, 1L, 1L, 1L,
4L), .Label = c("A", "B", "C", "D", "E", "F"), class = "factor"),
ID = structure(c(35L, 35L, 35L, 35L, 35L, 35L, 1L, 1L, 1L,
1L, 1L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 9L, 9L,
9L, 9L, 9L, 9L, 10L, 10L, 10L, 10L, 10L, 11L, 11L, 11L, 11L,
11L, 11L, 12L, 12L, 12L, 12L, 12L, 12L, 13L, 13L, 13L, 13L,
13L, 13L, 14L, 14L, 14L, 14L, 14L, 14L, 21L, 21L, 22L, 22L,
22L, 22L, 2L, 3L, 4L, 5L, 8L, 15L, 16L, 17L, 18L, 19L, 19L,
20L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 30L, 31L, 32L,
33L, 34L), .Label = c("10", "107", "108", "109", "124", "17",
"18", "187", "19", "21", "24", "26", "27", "28", "335", "336",
"339", "340", "341", "342", "38", "39", "576", "577", "578",
"579", "580", "581", "582", "583", "584", "585", "586", "592",
"6"), class = "factor"), value = c(1L, 7L, 4L, 4L, 3L, 9L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 5L, 1L, 2L, 1L, 2L, 2L,
2L, 1L, 33L, 27L, 28L, 21L, 28L, 1L, 3L, 1L, 1L, 1L, 1L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 3L, 3L, 2L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L)), class = "data.frame",
row.names = c(NA, -88L), .Names = c("group", "ID", "value")
)
Note that in group A, value = 1 for every ID. I use ggplot2 to create dot plot based on counts of the value variable using geom_dotplot and faceting by group:
ggplot(ex) + aes(x = value) +
geom_dotplot(binwidth = 1, method = "histodot") +
facet_wrap(~ group)
The dot stack in the first facet is cut off, even when exported using ggsave. Changing the y-axis limits has no effect, but changing the aspect ratio so that H >= W seems to fix the issue (usually by adding way more space to the top than necessary). Is this a bug, or am I specifying my dot plot incorrectly?
EDIT
One workaround is to flip my dotplot and bin by the y variable:
ggplot(ex) + aes(x = group, y = value) +
geom_dotplot(binwidth = 1, method = "histodot",
binaxis = "y", stackdir = "centerwhole") +
facet_wrap(~ group, scales = "free_x")
Two other parameters that can help you are stackratio and dotsize. For example:
ggplot(ex) + aes(x = value) +
geom_dotplot(binwidth = 1, method = "histodot", stackratio = 0.9, dotsize = .75) +
facet_wrap(~ group) +
scale_y_continuous(NULL, breaks = NULL)
You would need to tweak the numbers until you got the layout you wanted.
I found an interesting workaround using geom_bar that achieves the same structure as a dot plot but with rectangles:
ggplot(ex) + aes(x = value, group = ID) +
geom_bar(color = "black", fill = "white", width = 1) +
facet_wrap(~ group)
Although it results in rectangles (rather than dots) and you can't control the stack spacing. The rectangles get resized according to the plot window, which would be equivalent to tweaking the dot size in geom_dotplot. Also, it begs the question "why not just use a regular bar plot?"

R - Convert List of Lists into single dataframe

So, I have created a list (and a single column matrix) that contains 256 nested lists. What I would like to do, is to convert each of the 256 lists into a single dataframe of 16 columns and then write.table it. Although each list contains the same number of columns (16), the number of rows for each list varies. I have tried to use unlist unsuccessfully because the changing row counts. I can subset each list individually, so I know there's an easier way to do the whole list.
I'm pretty new to R, so I apologize for asking what may be a naive novice question. I searched through a lot of topics the last couple days and didn't see anything that seemed to match my problem. for loop seems like it might be unnecessary and I wasn't sure if lapply was the correct route, either.
UPDATE: dput of first list:
list(structure(list(structure(c(2L, 11L, 15L, 8L, 7L, 3L, 6L, 10L,
1L, 1L, 18L, 13L, 14L, 19L, 16L, 17L, 4L, 5L, 9L, 12L), .Label = c("",
"Aaron Rodgers", "Andrew Quarless", "Derrick Coleman", "Doug Baldwin",
"DuJuan Harris", "Eddie Lacy", "James Starks", "Jermaine Kearse",
"John Kuhn", "Jordy Nelson", "Luke Willson", "Marshawn Lynch", "Percy
Harvin", "Randall Cobb", "Ricardo Lockette", "Robert Turbin",
"Russell Wilson", "Zach Miller"), class = "factor"), Tm =
structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 4L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L), .Label = c("GNB", "Passing", "SEA", "Tm"),
class = "factor"), Cmp = structure(c(3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 5L, 4L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", "19",
"23", "Cmp", "Rushing"), class = "factor"), Att = structure(c(3L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 5L, 4L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = c("", "28", "33", "Att", "Receiving"
), class = "factor"), Yds = structure(c(2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, NA, 4L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), .Label = c("", "189", "191", "Yds"), class = "factor"),
TD = structure(c(2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, NA, 4L,
3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", "1",
"2", "TD"), class = "factor"), Int = structure(c(3L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, NA, 4L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("", "0", "1", "Int"), class = "factor"),
Lng = structure(c(2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, NA, 4L,
3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", "23",
"33", "Lng"), class = "factor"), Att = structure(c(1L, 1L,
1L, 7L, 3L, 1L, 2L, 2L, NA, 8L, 7L, 4L, 5L, 1L, 1L, 6L, 1L,
1L, 1L, 1L), .Label = c("", "1", "12", "20", "4", "6", "7",
"Att"), class = "factor"), Yds = structure(c(1L, 1L, 1L,
7L, 6L, 1L, 9L, 3L, NA, 10L, 5L, 2L, 8L, 1L, 1L, 4L, 1L,
1L, 1L, 1L), .Label = c("", "110", "2", "27", "29", "34",
"37", "41", "7", "Yds"), class = "factor"), TD = structure(c(1L,
1L, 1L, 2L, 2L, 1L, 2L, 3L, NA, 5L, 2L, 4L, 2L, 1L, 1L, 2L,
1L, 1L, 1L, 1L), .Label = c("", "0", "1", "2", "TD"), class = "factor"),
Lng = structure(c(1L, 1L, 1L, 2L, 4L, 1L, 8L, 6L, NA, 9L,
3L, 7L, 5L, 1L, 1L, 8L, 1L, 1L, 1L, 1L), .Label = c("", "12",
"13", "15", "16", "2", "21", "7", "Lng"), class = "factor"),
Rec = structure(c(1L, 7L, 5L, 3L, 4L, 4L, 1L, 1L, NA, 8L,
1L, 2L, 6L, 4L, 3L, 1L, 2L, 4L, 2L, 2L), .Label = c("", "1",
"2", "3", "6", "7", "9", "Rec"), class = "factor"), Yds = structure(c(1L,
12L, 9L, 3L, 3L, 6L, 1L, 1L, NA, 13L, 1L, 4L, 10L, 8L, 7L,
1L, 5L, 4L, 11L, 2L), .Label = c("", "1", "11", "14", "15",
"26", "38", "42", "58", "59", "8", "83", "Yds"), class = "factor"),
TD = structure(c(1L, 2L, 3L, 2L, 2L, 2L, 1L, 1L, NA, 4L,
1L, 2L, 2L, 2L, 3L, 1L, 3L, 2L, 2L, 2L), .Label = c("", "0",
"1", "TD"), class = "factor"), Lng = structure(c(1L, 7L,
9L, 3L, 4L, 8L, 1L, 1L, NA, 14L, 1L, 5L, 11L, 10L, 11L, 1L,
6L, 12L, 13L, 2L), .Label = c("", "1", "11", "12", "14",
"15", "16", "18", "23", "24", "33", "6", "8", "Lng"), class = "factor")), .Names = c("", "Tm", "Cmp", "Att", "Yds", "TD", "Int",
"Lng", "Att", "Yds", "TD", "Lng", "Rec", "Yds", "TD", "Lng"),
row.names = c(NA, -20L ), class = "data.frame"))
So, each observation in my list is like this above and I want to convert all of the lists into their 16 column(Now that I think about it, it's 17 columns, one is just unnamed) dataframe layout and stack all the rows together in one place that I can then write.table
Let's call your list l where l[[1]] is what you have dput above.
Two easy ways from base R and from data.table
do.call("rbind", l)
data.table::rbindlist(l)
This assumes that the columns match in each list element. Your example doesn't confirm this, although you state it.

Reshape a large matrix with missing values and multiple vars of interest [duplicate]

This question already has answers here:
Convert data from long format to wide format with multiple measure columns
(6 answers)
Closed 4 years ago.
I need to reorganize a large dataset into a specific format for further analysis. Right now the data are in long format, with multiple records through time for each point. I need to reshape the data so that each point has a single record, but it will add many new columns of the time-specific data. I’ve looked at previous similar posts but I need to ultimately convert several of the current variables into columns, and I can’t find an example of such. Is there a way to accomplish this in a single reshape, or will I have to do several and then concatenate the new columns back together? Another wrinkle before I post the example is that not all points were sampled at each time-step, so I need those values to show up as NA. For example, (see data below) SitePoint A1 was not sampled at all in 2012, SitePoint A10 was not sampled during the first round in 2012, but K83 was sampled all nine times.
mydatain <- structure(list(SitePoint = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 6L, 6L), .Label = c("A1", "A10", "K145", "K83", "T15",
"T213"), class = "factor"), Year_Rotation = structure(c(1L, 2L,
3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L, 8L, 9L, 1L, 2L, 4L, 5L,
6L, 7L, 8L, 9L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 9L, 1L, 7L), .Label = c("2010_1", "2010_2",
"2010_3", "2011_1", "2011_2", "2011_3", "2012_1", "2012_2", "2012_3"
), class = "factor"), MR_Fire = structure(c(5L, 6L, 6L, 2L, 9L,
9L, 5L, 6L, 6L, 2L, 9L, 9L, 7L, 8L, 16L, 17L, 21L, 22L, 23L,
25L, 3L, 4L, 10L, 11L, 12L, 13L, 14L, 15L, 18L, 19L, 20L, 1L,
2L, 2L, 5L, 6L, 6L, 11L, 11L, 12L, 7L, 24L), .Label = c("0",
"1", "10", "11", "12", "13", "14", "15", "2", "23", "24", "25",
"35", "36", "37", "39", "40", "47", "48", "49", "51", "52", "53",
"8", "9"), class = "factor"), fire_seas = structure(c(2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L,
1L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 1L, 3L), .Label = c("dry", "fire", "wet"
), class = "factor"), OptTSF = c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L,
1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L,
0L, 1L, 1L)), .Names = c("SitePoint", "Year_Rotation", "MR_Fire",
"fire_seas", "OptTSF"), row.names = c(31L, 32L, 33L, 34L, 35L,
36L, 67L, 68L, 69L, 70L, 71L, 72L, 73L, 74L, 10543L, 10544L,
10545L, 10546L, 10547L, 10548L, 10549L, 10550L, 14988L, 14989L,
14990L, 14991L, 14992L, 14993L, 14994L, 14995L, 14996L, 17370L,
17371L, 17372L, 17373L, 17374L, 17375L, 17376L, 17377L, 17378L,
19353L, 19354L), class = "data.frame")
Ultimately I need something like this:
myfinal <- structure(list(SitePoint = structure(1:6, .Label = c("A1", "A10",
"K145", "K83", "T15", "T213"), class = "factor"), MR_Fire_2010_1 = c(12L,
12L, 39L, 23L, 0L, 14L), MR_Fire_2010_2 = c(13L, 13L, 40L, 24L,
1L, NA), MR_Fire_2010_3 = c(13L, 13L, NA, 25L, 1L, NA), MR_Fire_2011_1 = c(1L,
1L, 51L, 35L, 12L, NA), MR_Fire_2011_2 = c(2L, 2L, 52L, 36L,
13L, NA), MR_Fire_2011_3 = c(2L, 2L, 53L, 37L, 13L, NA), MR_Fire_2012_1 = c(NA,
NA, 9L, 47L, 24L, 8L), MR_Fire_2012_2 = c(NA, 14L, 10L, 48L,
24L, NA), MR_Fire_2012_3 = c(NA, 15L, 11L, 49L, 25L, NA), season_2010_1 = structure(c(2L,
2L, 1L, 2L, 2L, 1L), .Label = c("dry", "fire"), class = "factor"),
season_2010_2 = structure(c(2L, 2L, 1L, 2L, 2L, NA), .Label = c("dry",
"fire"), class = "factor"), season_2010_3 = structure(c(1L,
1L, NA, 1L, 1L, NA), .Label = "fire", class = "factor"),
season_2011_1 = structure(c(2L, 2L, 1L, 2L, 2L, NA), .Label = c("dry",
"fire"), class = "factor"), season_2011_2 = structure(c(2L,
2L, 1L, 2L, 2L, NA), .Label = c("dry", "fire"), class = "factor"),
season_2011_3 = structure(c(2L, 2L, 1L, 2L, 2L, NA), .Label = c("dry",
"fire"), class = "factor"), season_2012_1 = structure(c(NA,
NA, 2L, 1L, 1L, 2L), .Label = c("fire", "wet"), class = "factor"),
season_2012_2 = structure(c(NA, 1L, 2L, 1L, 1L, NA), .Label = c("fire",
"wet"), class = "factor"), season_2012_3 = structure(c(NA,
1L, 2L, 1L, 1L, NA), .Label = c("fire", "wet"), class = "factor"),
OptTSF_2010_1 = c(1L, 1L, 0L, 1L, 1L, 1L), OptTSF_2010_2 = c(1L,
1L, 0L, 1L, 1L, NA), OptTSF_2010_3 = c(1L, 1L, NA, 1L, 1L,
NA), OptTSF_2011_1 = c(1L, 1L, 0L, 0L, 1L, NA), OptTSF_2011_2 = c(1L,
1L, 0L, 0L, 1L, NA), OptTSF_2011_3 = c(1L, 1L, 0L, 0L, 1L,
NA), OptTSF_2012_1 = c(NA, NA, 1L, 0L, 0L, 1L), OptTSF_2012_2 = c(NA,
1L, 1L, 0L, 0L, NA), OptTSF_2012_3 = c(NA, 1L, 1L, 0L, 0L,
NA)), .Names = c("SitePoint", "MR_Fire_2010_1", "MR_Fire_2010_2",
"MR_Fire_2010_3", "MR_Fire_2011_1", "MR_Fire_2011_2", "MR_Fire_2011_3",
"MR_Fire_2012_1", "MR_Fire_2012_2", "MR_Fire_2012_3", "season_2010_1",
"season_2010_2", "season_2010_3", "season_2011_1", "season_2011_2",
"season_2011_3", "season_2012_1", "season_2012_2", "season_2012_3",
"OptTSF_2010_1", "OptTSF_2010_2", "OptTSF_2010_3", "OptTSF_2011_1",
"OptTSF_2011_2", "OptTSF_2011_3", "OptTSF_2012_1", "OptTSF_2012_2",
"OptTSF_2012_3"), class = "data.frame", row.names = c(NA, -6L
))
The actual dataset is about 23656 records X 15 variables, so doing it by hand is likely to cause major headaches and potential for mistakes. Any help or suggestions are appreciated. If this has been answered elsewhere, apologies. I couldn’t find anything directly applicable; everything seemed to related to three columns and only one of those being extracted as new variables. Thanks.
SP
dcast from the devel version of data.table i.e., v1.9.5 can cast multiple columns simultaneously. It can be installed from here.
library(data.table) ## v1.9.5+
dcast(setDT(mydatain), SitePoint~Year_Rotation,
value.var=c('MR_Fire', 'fire_seas', 'OptTSF'))
You can use reshape to change the structure of your dataframe from long to wide using the following code:
reshape(mydatain,timevar="Year_Rotation",idvar="SitePoint",direction="wide")

format color and legend in ggplot geom_tile of p-values

I am currently trying to make a 'heat map' using ggplot2 to display a series of p-values, but can't figure out how to tailor the actual color assignments and legend.
sampledata.m <- melt(sampledata)
sampledata.m$var2 <- as.character(sampledata.m$var2)
sampledata.m$var2 <- factor(sampledata.m$var2, levels=unique(sampledata.m$var2),ordered=TRUE)
sampledata.m$var1 <- as.character(sampledata.m$var1)
sampledata.m$var1 <- factor(sampledata.m$var1, levels=unique(sampledata.m$var1),ordered=TRUE)
This was done so that I could maintain the order of my variables.
p <- ggplot(sampledata.m, aes(var2, var1)) +
geom_tile(aes(fill = value), colour = "transparent") +
scale_fill_gradientn(colours=c("light green","dark green", "black"),
values=rescale(c(0,0.0003,0.05,0.5,1)),limits=c(0,1)))
p + theme_bw(base_size = base_size) + labs(x = "", y = "") +
scale_x_discrete(expand = c(0,0)) +
theme(legend.position = "bottom", axis.ticks = element_blank(),
axis.text.x = element_text(size = base_size * 0.8, angle = 310,
hjust = 0, colour = "black"))
This creates a nice looking plot, however my legend and my color gradient don't represent the rescale that I assigned. Forgive my ignorance if this is a simple fix, but I've only been coding R for about 2 weeks now. Ideally, I would love my plot and legend to mimic the color scheme and legend labeling similar to this paper: http://www.ncbi.nlm.nih.gov/pubmed/22496159
structure(list(var1 = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L
), .Label = c("A", "B", "C",
"D", "E"), class = "factor"), var2 = structure(c(1L,
5L, 23L, 18L, 9L, 8L, 14L, 12L, 20L, 6L, 21L, 11L, 2L, 22L, 10L,
3L, 19L, 16L, 4L, 7L, 15L, 17L, 13L, 24L, 1L, 5L, 23L, 18L, 9L,
8L, 14L, 12L, 20L, 6L, 21L, 11L, 2L, 22L, 10L, 3L, 19L, 16L,
4L, 7L, 15L, 17L, 13L, 24L, 1L, 5L, 23L, 18L, 9L, 8L, 14L, 12L,
20L, 6L, 21L, 11L, 2L, 22L, 10L, 3L, 19L, 16L, 4L, 7L, 15L, 17L,
13L, 24L, 1L, 5L, 23L, 18L, 9L, 8L, 14L, 12L, 20L, 6L, 21L, 11L,
2L, 22L, 10L, 3L, 19L, 16L, 4L, 7L, 15L, 17L, 13L, 24L, 1L, 5L,
23L, 18L, 9L, 8L, 14L, 12L, 20L, 6L, 21L, 11L, 2L, 22L, 10L,
3L, 19L, 16L, 4L, 7L, 15L, 17L, 13L), .Label = c("1", "2",
"3", "4", "5", "6", "7", "8",
"9", "10", "11", "12", "13", "14", "15",
"16", "17", "18", "19", "20", "21",
"22", "23", "24"), class = "factor"), variable = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L), .Label = "pvalue", class = "factor"),
value = c(0.810172671, 0.596026338, 0.076550169, 0.908670635,
0.300418653, 0.051553286, 0.124196482, 0.601568833, 0.058431468,
0.341726981, 0.876674726, 0.002698295, 0.812059425, 0.068199656,
0.758383287, 0.60362134, 0.89265723, 0.246111936, 0.156348035,
0.909574522, 0.020202377, 0.388843992, 0.769441835, 0.102272916,
0.38895717, 0.882296525, 0.792438683, 0.000491393, 0.004233434,
0.202424095, 0.426941568, 0.08520186, 0.763036306, 0.602828564,
0.037278697, 0.121642743, 0.669123606, 0.974328438, 0.834329923,
0.050413697, 0.078476666, 0.387647156, 0.000540422, 0.379576632,
0.361428444, 0.502439758, 0.001326035, 0.027652693, 0.188885638,
0.579244445, 0.471985778, 0.677458228, 0.119307242, 0.364857868,
0.238260538, 0.53472206, 0.204344281, 0.291888993, 0.295809688,
0.00029, 0.005476157, 0.960975822, 0.00029, 0.055915429,
0.618284682, 0.040605253, 0.521649682, 0.421086546, 0.164333061,
0.755528982, 0.306854182, 0.012832628, 0.270393143, 0.946675764,
0.59227376, 0.112658388, 0.429091426, 0.01662083, 0.017342483,
0.065817234, 0.012140224, 0.359828816, 0.031969725, 0.00029,
0.14555102, 0.18865081, 0.00029, 0.064107531, 0.505257768,
0.070224536, 0.017082975, 0.375864198, 0.00029, 0.104103689,
0.898979883, 0.004879605, 0.003597954, 0.036722932, 0.849058218,
0.00029, 0.003739938, 0.00029, 0.00029, 0.00029, 0.008179017,
0.193870353, 0.460181712, 0.389475522, 0.00029, 0.8785017,
0.070414642, 0.584977921, 0.990764677, 0.767253318, 0.002234906,
0.051331823, 0.00446149, 0.234477639, 0.275139791)), .Names = c("var1", "var2", "variable", "value"), row.names = c(NA, -119L), class = "data.frame")
I'm not going to get into all of the theme settings you've got - as I understand it the key of your problem is the scale of the fill gradient. You can set this in scale_fill_gradient() with a log transformation:
p <- ggplot(sampledata.m, aes(var2, var1)) +
geom_tile(aes(fill = value), colour = "transparent") +
scale_fill_gradient(trans = "log", low = "light green", high = "black",
breaks = c(0, 0.001, 0.05, 0.5))
dt <- data.frame(
N=letters[5:11],
a=c(0.01,0.05,0.1,0.5,1,5,10),
b=c(10,20,50,100,200,1000,2000))
dt.mlt <- melt(dt,variable.name="Cls",value.name="Val")
ggplot(dt.mlt,aes(x=N,y=Cls,fill=Val))+
geom_tile()+
scale_fill_gradient2(
low="green",high="red",mid="black",trans="log",breaks=c(0,0.01,0.1,1,10,100,1000))+
geom_text(data=dt.mlt,aes(x=N,y=Cls,label=Val))
But if I add the midpoint=10 to the scale_fill_gradient2, the picture will become:

Resources