Related
I have many data sets that have extra information beyond a certain line. The files are all csv. I would be able to loop through them and call read.csv with the "skip" argument to clean the top of the data, but the lengths of the data frames are all different. The only commonality is the "--------------- ---------------- ------ -----" line in the Total column that separates the meaningful data from summaries and extraneous info below it.
Here's how I'm reading in the data without skip = 14 (which is standard across everything).
before<-read.csv("Example.csv", header = FALSE,
col.names = c("CountryID","Name","Type","Symbol","Code","Unit",
"Total", "Measurement", "Value", "Percent", "CO2" ))
However, the ----- marker may be on a different row in each file, but it's always the first such line encountered. Here's the data before:
structure(list(CountryID = structure(c(26L, 19L, 21L, 23L, 21L,
7L, 1L, 1L, 1L, 22L, 3L, 1L, 19L, 2L, 8L, 14L, 15L, 13L, 9L,
12L, 18L, 17L, 8L, 13L, 15L, 10L, 8L, 8L, 11L, 16L, 1L, 1L, 1L,
20L, 4L, 6L, 1L, 25L, 5L, 1L, 1L, 1L, 24L, 1L), .Label = c("",
"------------", "-------------", "---------------", "------------------",
" ", "08.15.1997", "10000", "15000", "200", "2000", "2500", "3000",
"45000", "5000", "7000", "8000", "8300", "Country", "Output",
"Production", "Quantity", "Serial Output", "TOTAL SUM", "Unaccounted",
"United Nations Data"), class = "factor"), Name = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 20L, 2L, 1L, 1L, 1L, 21L, 4L,
5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L,
1L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 19L, 1L, 1L, 1L, 1L), .Label = c("",
"--------------------", " ", "Bahrain", "Bangladesh", "Barbados",
"Belarus", "Belgium", "Belize", "Benin", "Bhutan", "Bolivia",
"Bosnia and Herzegovina", "Botswana", "Brazil", "Brunei", "Bulgaria",
"Burkina Faso", "Chad", "Name", "The Bahamas"), class = "factor"),
Type = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 5L, 4L,
2L, 1L, 1L, 1L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 1L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = c("", "----", " ", "Code", "Type",
"Unit"), class = "factor"), Symbol = structure(c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 20L, 22L, 2L, 1L, 1L, 1L, 4L, 5L,
6L, 7L, 9L, 8L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L,
19L, 1L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 21L, 1L, 1L, 1L,
1L), .Label = c("", "------------", " ", "BAHM", "BAHR",
"BANG", "BARB", "BELGM", "BELS", "BELZ", "BEN", "BHUT", "BOL",
"BOSHER", "BOTS", "BRAZ", "BRUN", "BULG", "BURKF", "Country",
"private", "Symbol"), class = "factor"), Code = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 19L, 2L, 1L, 1L, 1L, 12L,
15L, 11L, 17L, 4L, 13L, 14L, 9L, 18L, 10L, 5L, 16L, 3L, 7L,
8L, 6L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), .Label = c("", "------------", "1504944270", "2287368539",
"2388991307", "2453202442", "2561470743", "3205402223", "3221488867",
"3230369605", "3247578406", "3712013344", "4307638090", "462793263",
"4835205752", "4854959101", "5842098895", "5932776587", "Code"
), class = "factor"), Unit = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 17L, 16L, 2L, 1L, 1L, 1L, 7L, 9L, 10L, 14L,
12L, 15L, 15L, 11L, 13L, 3L, 8L, 13L, 15L, 6L, 5L, 9L, 1L,
1L, 1L, 1L, 1L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("",
"-------------", "100", "1109", "27", "35", "40", "45", "58",
"70", "74", "77", "79", "82", "95", "Output", "Per Unit"), class = "factor"),
Total = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 25L,
24L, 2L, 1L, 1L, 1L, 18L, 5L, 17L, 8L, 23L, 20L, 6L, 9L,
7L, 11L, 12L, 13L, 19L, 15L, 14L, 10L, 3L, 16L, 1L, 1L, 1L,
16L, 1L, 1L, 1L, 21L, 1L, 3L, 22L, 4L), .Label = c("", "---------------",
"--------------- ---------------- ------ -----",
"=============== ================ ====== =====",
"126912", "147431", "170553", "175973", "203728", "230761",
"293789", "304471", "376281", "386526", "399160", "4417002",
"476025", "478030", "502999", "51012", "5610654", "56406056",
"93351", "Output", "Total"), class = "factor"), Measurement = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 12L, 2L, 1L, 1L, 1L, 3L,
9L, 3L, 4L, 10L, 9L, 6L, 4L, 5L, 10L, 7L, 9L, 4L, 8L, 10L,
9L, 1L, 1L, 1L, 1L, 1L, 11L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), .Label = c("", "--------", "20", "23", "24", "26", "27",
"28", "29", "30", "420", "Measurement"), class = "factor"),
Value = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 22L,
23L, 2L, 1L, 1L, 1L, 5L, 19L, 11L, 8L, 3L, 18L, 13L, 6L,
4L, 9L, 14L, 17L, 7L, 10L, 12L, 15L, 1L, 16L, 1L, 1L, 1L,
16L, 1L, 1L, 1L, 20L, 1L, 1L, 21L, 1L), .Label = c("", "----------------",
"15150240", "15891735", "16083459", "16959919", "20350968",
"20909501", "21770264", "25121096", "27726279", "30024743",
"34069742", "34841369", "38498281", "468004111", "49524999",
"50512814", "50568702", "540650", "64506", "Country", "Value"
), class = "factor"), Percent = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 23L, 2L, 1L, 1L, 1L, 11L, 12L, 8L, 3L,
17L, 16L, 5L, 10L, 20L, 9L, 6L, 7L, 4L, 15L, 14L, 22L, 1L,
13L, 1L, 1L, 1L, 21L, 1L, 1L, 1L, 19L, 1L, 1L, 18L, 1L), .Label = c("",
"------", "102", "104", "106", "112", "126", "129", "142",
"15", "160", "177", "1775", "180", "191", "24", "25", "5640645",
"650163", "87", "887.5", "95", "Production Percent"), class = "factor"),
CO2 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 15L, 14L,
2L, 1L, 1L, 1L, 9L, 4L, 9L, 7L, 4L, 5L, 4L, 7L, 4L, 9L, 4L,
11L, 4L, 12L, 10L, 4L, 1L, 6L, 1L, 1L, 1L, 8L, 1L, 1L, 1L,
3L, 1L, 1L, 13L, 1L), .Label = c("", "-----", "?", "0", "0.2",
"0.6", "1", "19.4", "2", "2.2", "4", "5", "564065", "CO2",
"Cur."), class = "factor")), class = "data.frame", row.names = c(NA,
-44L))
And here's how I'm hoping it could look:
structure(list(CountryID = c(10000L, 45000L, 5000L, 3000L, 15000L,
2500L, 8300L, 8000L, 10000L, 3000L, 5000L, 200L, 10000L, 10000L,
2000L, 7000L), Name = structure(c(16L, 1L, 2L, 3L, 4L, 5L, 6L,
7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L), .Label = c("Bahrain",
"Bangladesh", "Barbados", "Belarus", "Belgium", "Belize", "Benin",
"Bhutan", "Bolivia", "Bosnia and Herzegovina", "Botswana", "Brazil",
"Brunei", "Bulgaria", "Burkina Faso", "The Bahamas"), class = "factor"),
Type = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L), .Label = "Unit", class = "factor"),
Symbol = structure(c(1L, 2L, 3L, 4L, 6L, 5L, 7L, 8L, 9L,
10L, 11L, 12L, 13L, 14L, 15L, 16L), .Label = c("BAHM", "BAHR",
"BANG", "BARB", "BELGM", "BELS", "BELZ", "BEN", "BHUT", "BOL",
"BOSHER", "BOTS", "BRAZ", "BRUN", "BULG", "BURKF"), class = "factor"),
Code = c(3712013344, 4835205752, 3247578406, 5842098895,
2287368539, 4307638090, 462793263, 3221488867, 5932776587,
3230369605, 2388991307, 4854959101, 1504944270, 2561470743,
3205402223, 2453202442), Unit = c(40L, 58L, 70L, 82L, 77L,
95L, 95L, 74L, 79L, 100L, 45L, 79L, 95L, 35L, 27L, 58L),
Total = c(478030L, 126912L, 476025L, 175973L, 93351L, 51012L,
147431L, 203728L, 170553L, 293789L, 304471L, 376281L, 502999L,
399160L, 386526L, 230761L), Measurement = c(20L, 29L, 20L,
23L, 30L, 29L, 26L, 23L, 24L, 30L, 27L, 29L, 23L, 28L, 30L,
29L), Value = c(16083459L, 50568702L, 27726279L, 20909501L,
15150240L, 50512814L, 34069742L, 16959919L, 15891735L, 21770264L,
34841369L, 49524999L, 20350968L, 25121096L, 30024743L, 38498281L
), Percent = c(160L, 177L, 129L, 102L, 25L, 24L, 106L, 15L,
87L, 142L, 112L, 126L, 104L, 191L, 180L, 95L), CO2 = c(2,
0, 2, 1, 0, 0.2, 0, 1, 0, 2, 0, 4, 0, 5, 2.2, 0)), class = "data.frame", row.names = c(NA,
-16L))
Can this be integrated into the read.csv call, or is it easier to clean the bottom of the data some other way?
Three thoughts:
Use readLines (as #user2554330 suggested), find/remove the specific row, filter it, then parse the text vector with read.csv, the least of the three.
before[seq_len(min(head(which(!grepl("^[^- ]+$", before$Total)),1)-1L,nrow(before))),]; a bit complicated, granted, but it does what you need (assuming that you've already filtered the first 14 rows with skip=).
Use an external script such as sed -e '1,14d;/^[ -]\+$/{g;q;}' in a pipe(...)-type thing.
Read it twice. The first time, use readLines("Example.csv"), and look through the lines for the marker of the end of data. Say it's on line n. Then in the second read, use
read.csv("Example.csv", header = FALSE,
col.names = c("CountryID","Name","Type","Symbol","Code","Unit",
"Total", "Measurement", "Value", "Percent", "CO2" ), nrows = n - 1)
(or maybe nrows will need to be a different value, if you're skipping some).
Here is my dataframe:
structure(list(replicate = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L,
7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 10L, 10L, 10L,
10L, 11L, 11L, 11L, 11L, 12L, 12L, 12L, 12L, 13L, 13L, 13L, 13L,
14L, 14L, 14L, 14L, 15L, 15L, 15L, 15L), press_id = c(1L, 2L,
3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L,
3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L,
3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L,
3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L), start_time = c(164429106370979,
164429411618825, 164429837271940, 164430399454285, 164429106370980,
164429411618826, 164429837271941, 164430399454286, 164429106370981,
164429411618827, 164429837271942, 164430399454287, 164429106370982,
164429411618828, 164429837271943, 164430399454288, 164429106370983,
164429411618829, 164429837271944, 164430399454289, 164429106370984,
164429411618830, 164429837271945, 164430399454290, 164429106370985,
164429411618831, 164429837271946, 164430399454291, 164429106370986,
164429411618832, 164429837271947, 164430399454292, 164429106370987,
164429411618833, 164429837271948, 164430399454293, 164429106370988,
164429411618834, 164429837271949, 164430399454294, 164429106370989,
164429411618835, 164429837271950, 164430399454295, 164429106370990,
164429411618836, 164429837271951, 164430399454296, 164429106370991,
164429411618837, 164429837271952, 164430399454297, 164429106370992,
164429411618838, 164429837271953, 164430399454298, 164429106370993,
164429411618839, 164429837271954, 164430399454299), end_time = c(164429182443825,
164429512525748, 164429903243170, 164430465927555, 164429182443826,
164429512525749, 164429903243171, 164430465927556, 164429182443827,
164429512525750, 164429903243172, 164430465927557, 164429182443828,
164429512525751, 164429903243173, 164430465927558, 164429182443829,
164429512525752, 164429903243174, 164430465927559, 164429182443830,
164429512525753, 164429903243175, 164430465927560, 164429182443831,
164429512525754, 164429903243176, 164430465927561, 164429182443832,
164429512525755, 164429903243177, 164430465927562, 164429182443833,
164429512525756, 164429903243178, 164430465927563, 164429182443834,
164429512525757, 164429903243179, 164430465927564, 164429182443835,
164429512525758, 164429903243180, 164430465927565, 164429182443836,
164429512525759, 164429903243181, 164430465927566, 164429182443837,
164429512525760, 164429903243182, 164430465927567, 164429182443838,
164429512525761, 164429903243183, 164430465927568, 164429182443839,
164429512525762, 164429903243184, 164430465927569)), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -60L), vars = c("replicate",
"press_id"), drop = TRUE, indices = list(0L, 1L, 2L, 3L, 4L,
5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L,
18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L,
30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L,
42L, 43L, 44L, 45L, 46L, 47L, 48L, 49L, 50L, 51L, 52L, 53L,
54L, 55L, 56L, 57L, 58L, 59L), group_sizes = c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), biggest_group_size = 1L, labels = structure(list(
replicate = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L,
3L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 7L, 7L,
7L, 7L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 10L, 10L, 10L, 10L,
11L, 11L, 11L, 11L, 12L, 12L, 12L, 12L, 13L, 13L, 13L, 13L,
14L, 14L, 14L, 14L, 15L, 15L, 15L, 15L), press_id = c(1L,
2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L,
1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L,
4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L,
3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L)), class = "data.frame", row.names = c(NA,
-60L), vars = c("replicate", "press_id"), drop = TRUE, indices = list(
0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L,
14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L,
26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L,
38L, 39L, 40L, 41L, 42L, 43L, 44L, 45L, 46L, 47L, 48L, 49L,
50L, 51L, 52L, 53L, 54L, 55L, 56L, 57L, 58L, 59L), group_sizes = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), biggest_group_size = 1L, labels = structure(list(
replicate = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L,
3L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 7L, 7L,
7L, 7L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 10L, 10L, 10L, 10L,
11L, 11L, 11L, 11L, 12L, 12L, 12L, 12L, 13L, 13L, 13L, 13L,
14L, 14L, 14L, 14L, 15L, 15L, 15L, 15L), press_id = c(1L,
2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L,
1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L,
4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L,
3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L)), class = "data.frame", row.names = c(NA,
-60L), vars = c("replicate", "press_id"), drop = TRUE, .Names = c("replicate",
"press_id")), .Names = c("replicate", "press_id")), .Names = c("replicate",
"press_id", "start_time", "end_time"))
I want to get the time difference between consecutive press_id rows within each replicate, for example:
replicate press_id start_time end_time time_diff
1 1 1.644291e+14 1.644292e+14 0 (it's a first row)
1 2 1.644294e+14 1.644295e+14 1.644294e+14 - 1.644292e+14
1 3 1.644298e+14 1.644299e+14 1.644298e+14 - 1.644295e+14
1 4 1.644304e+14 1.644305e+14 .....
2 1 1.644291e+14 1.644292e+14
2 2 1.644294e+14 1.644295e+14
2 3 1.644298e+14 1.644299e+14
2 4 1.644304e+14 1.644305e+14
I am trying to do this using mutate, lag, lead and diff but without any luck. I have grouped, and ungrouped the dataset, nothing helped me.
df %>%
group_by(replicate) %>%
mutate(d = ifelse(row_number() == 1, 0, lead(start_time) - end_time))
df %>%
group_by(replicate) %>%
mutate(d = start_time - lag(end_time))
And if you want zeroes instead of NAs for the first row of each unique value in the replicate column, you could do:
df %>%
group_by(replicate) %>%
mutate(d = start_time - lag(end_time),
d = ifelse(is.na(d), 0, d))
Or just:
df %>%
group_by(replicate) %>%
mutate(d = ifelse(row_number() == 1, 0, start_time - lag(end_time)))
I am using the rms library and the lrm function to do a penalized logistic regression.
Here is a sample of my data:
> dput(cs_data_train[1:50,])
structure(list(DataCRMSanoflore.Year_Sales = structure(c(1L,
2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
2L), .Label = c("2015", "2016", "2017"), class = "factor"), DataCRMSanoflore.HOURS_INSCR = c(14L,
18L, 17L, 16L, 11L, 22L, 23L, 17L, 9L, 21L, 18L, 19L, 12L, 11L,
17L, 16L, 21L, 20L, 14L, 19L, 22L, 17L, 22L, 13L, 19L, 13L, 21L,
16L, 23L, 19L, 11L, 21L, 11L, 22L, 20L, 13L, 11L, 17L, 15L, 12L,
15L, 21L, 17L, 14L, 10L, 17L, 10L, 12L, 18L, 13L), DataCRMSanoflore.Month_Sales = structure(c(9L,
2L, 5L, 9L, 4L, 7L, 3L, 9L, 7L, 12L, 3L, 3L, 12L, 3L, 3L, 6L,
3L, 4L, 5L, 8L, 8L, 1L, 4L, 10L, 9L, 5L, 4L, 9L, 2L, 12L, 9L,
4L, 4L, 3L, 6L, 8L, 6L, 4L, 12L, 5L, 6L, 9L, 7L, 9L, 1L, 9L,
7L, 11L, 11L, 4L), .Label = c("01", "02", "03", "04", "05", "06",
"07", "08", "09", "10", "11", "12"), class = "factor"), DataCRMSanoflore.Date_Sales = structure(c(3L,
10L, 22L, 23L, 26L, 13L, 12L, 2L, 25L, 11L, 10L, 9L, 4L, 10L,
18L, 9L, 9L, 1L, 14L, 24L, 4L, 2L, 2L, 22L, 17L, 4L, 14L, 22L,
2L, 5L, 29L, 13L, 2L, 10L, 25L, 5L, 10L, 1L, 6L, 20L, 7L, 9L,
1L, 3L, 17L, 22L, 3L, 9L, 20L, 13L), .Label = c("01", "02", "03",
"04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "14",
"15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25",
"26", "27", "28", "29", "30", "31"), class = "factor"), DataCRMSanoflore.HOURS_INSCR.1 = c(14L,
18L, 17L, 16L, 11L, 22L, 23L, 17L, 9L, 21L, 18L, 19L, 12L, 11L,
17L, 16L, 21L, 20L, 14L, 19L, 22L, 17L, 22L, 13L, 19L, 13L, 21L,
16L, 23L, 19L, 11L, 21L, 11L, 22L, 20L, 13L, 11L, 17L, 15L, 12L,
15L, 21L, 17L, 14L, 10L, 17L, 10L, 12L, 18L, 13L), DataCRMSanoflore.Year_Creation_Sales = structure(c(1L,
2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L,
2L), .Label = c("2015", "2016", "2017"), class = "factor"), DataCRMSanoflore.Month_Creation_Sales = structure(c(9L,
2L, 10L, 10L, 9L, 7L, 12L, 9L, 7L, 12L, 3L, 4L, 2L, 6L, 3L, 6L,
10L, 4L, 5L, 8L, 3L, 1L, 4L, 11L, 9L, 5L, 4L, 9L, 2L, 12L, 10L,
4L, 4L, 3L, 10L, 8L, 6L, 4L, 12L, 8L, 6L, 2L, 10L, 5L, 1L, 9L,
8L, 11L, 11L, 4L), .Label = c("01", "02", "03", "04", "05", "06",
"07", "08", "09", "10", "11", "12"), class = "factor"), DataCRMSanoflore.Day_Creation_Sales = structure(c(11L,
15L, 2L, 31L, 26L, 23L, 5L, 2L, 25L, 16L, 10L, 13L, 7L, 3L, 18L,
9L, 8L, 27L, 18L, 24L, 6L, 2L, 4L, 16L, 17L, 12L, 15L, 22L, 10L,
5L, 1L, 14L, 2L, 10L, 5L, 5L, 10L, 25L, 6L, 5L, 28L, 8L, 10L,
18L, 17L, 22L, 31L, 9L, 21L, 22L), .Label = c("01", "02", "03",
"04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "14",
"15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25",
"26", "27", "28", "29", "30", "31"), class = "factor"), DataCRMSanoflore.Year_Validation_Sales = structure(c(1L,
2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L,
2L), .Label = c("2015", "2016", "2017"), class = "factor"), DataCRMSanoflore.Month_Validation_Sales = structure(c(9L,
2L, 10L, 11L, 10L, 7L, 12L, 9L, 7L, 12L, 3L, 4L, 2L, 6L, 3L,
6L, 10L, 4L, 5L, 8L, 3L, 1L, 4L, 11L, 9L, 5L, 4L, 9L, 2L, 12L,
10L, 4L, 4L, 3L, 10L, 8L, 6L, 4L, 12L, 8L, 6L, 2L, 10L, 5L, 1L,
9L, 9L, 11L, 11L, 4L), .Label = c("01", "02", "03", "04", "05",
"06", "07", "08", "09", "10", "11", "12"), class = "factor"),
DataCRMSanoflore.Day_Validation_Sales = structure(c(14L,
16L, 3L, 3L, 1L, 27L, 6L, 5L, 27L, 21L, 19L, 27L, 8L, 5L,
21L, 10L, 9L, 30L, 26L, 27L, 7L, 4L, 15L, 17L, 18L, 13L,
20L, 29L, 11L, 7L, 2L, 16L, 3L, 20L, 6L, 6L, 13L, 29L, 8L,
6L, 30L, 9L, 12L, 20L, 18L, 29L, 1L, 10L, 23L, 25L), .Label = c("01",
"02", "03", "04", "05", "06", "07", "08", "09", "10", "11",
"12", "13", "14", "15", "16", "17", "18", "19", "20", "21",
"22", "23", "24", "25", "26", "27", "28", "29", "30", "31"
), class = "factor"), DataCRMSanoflore.AGE_CUSTUMER = c(37L,
23L, 34L, 32L, 45L, 52L, 44L, 55L, 37L, 29L, 33L, 29L, 30L,
37L, 56L, 48L, 44L, 42L, 45L, 33L, 37L, 53L, 55L, 60L, 57L,
33L, 51L, 32L, 35L, 54L, 41L, 47L, 59L, 33L, 45L, 35L, 36L,
28L, 42L, 24L, 32L, 39L, 33L, 36L, 49L, 56L, 45L, 39L, 54L,
55L), DataCRMSanoflore.MEAN_PURCHASE = c(71.75, 50.7142857142857,
18.6666666666667, 0, 0, 54.7, 0.666666666666667, 38, 6.5,
0, 83.3333333333333, 44.3333333333333, 25.7777777777778,
24.1818181818182, 23.3846153846154, 35.5294117647059, 21.6363636363636,
1.125, 6, 8.66666666666667, 18.4, 16.9285714285714, 0, 0,
36.5, 21.5, 18.5714285714286, 28.125, 101.333333333333, 0,
2, 0, 20.9166666666667, 69.1428571428571, 16.6666666666667,
1.5, 87.1666666666667, 48.25, 13.3333333333333, 20.5833333333333,
12, 0, 23, 15.1428571428571, 0, 30.4375, 30.3076923076923,
24.625, 23.4285714285714, 20.0833333333333), DataCRMSanoflore.NUMBER_GIFTS = c(1L,
1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 3L, 4L, 3L,
4L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 3L, 1L, 3L, 2L, 1L, 1L, 1L,
1L, 2L, 2L, 1L, 1L, 1L, 2L, 3L, 1L, 3L, 1L, 4L, 1L, 1L, 1L,
2L, 5L, 2L, 2L), SENSIBILITE = c(4L, 4L, 1L, 3L, 1L, 1L,
2L, 1L, 1L, 1L, 4L, 1L, 3L, 1L, 3L, 3L, 4L, 1L, 1L, 1L, 4L,
1L, 1L, 4L, 1L, 3L, 4L, 4L, 4L, 1L, 1L, 1L, 1L, 4L, 1L, 1L,
1L, 4L, 1L, 3L, 2L, 1L, 3L, 4L, 1L, 1L, 4L, 3L, 1L, 4L),
IMPERFECTIONS = c(4L, 3L, 1L, 2L, 1L, 1L, 4L, 1L, 1L, 1L,
3L, 1L, 2L, 1L, 3L, 2L, 3L, 1L, 1L, 1L, 3L, 1L, 1L, 3L, 1L,
3L, 3L, 3L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 3L, 1L, 2L,
3L, 1L, 2L, 2L, 1L, 1L, 3L, 3L, 1L, 3L), BRILLANCE = c(2L,
2L, 1L, 4L, 1L, 1L, 4L, 1L, 1L, 1L, 4L, 1L, 4L, 1L, 4L, 4L,
4L, 1L, 1L, 1L, 4L, 1L, 1L, 3L, 1L, 4L, 4L, 4L, 4L, 1L, 1L,
1L, 1L, 4L, 1L, 1L, 1L, 4L, 1L, 4L, 4L, 1L, 4L, 4L, 1L, 1L,
4L, 4L, 1L, 4L), GRAIN_PEAU = c(4L, 4L, 1L, 4L, 1L, 1L, 2L,
1L, 1L, 1L, 4L, 1L, 2L, 1L, 2L, 4L, 4L, 1L, 1L, 1L, 3L, 1L,
1L, 2L, 1L, 2L, 4L, 4L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
2L, 1L, 4L, 4L, 1L, 2L, 4L, 1L, 1L, 4L, 3L, 1L, 4L), RIDES_VISAGE = c(2L,
2L, 1L, 4L, 1L, 1L, 4L, 1L, 1L, 1L, 4L, 1L, 2L, 1L, 4L, 2L,
4L, 1L, 1L, 1L, 4L, 1L, 1L, 4L, 1L, 2L, 4L, 2L, 2L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 4L, 1L, 2L, 4L, 1L, 2L, 4L, 1L, 1L,
4L, 4L, 1L, 4L), ALLERGIES = c(2L, 2L, 1L, 2L, 1L, 1L, 2L,
1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L,
1L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 3L, 2L, 1L, 2L), MAINS = c(4L,
4L, 1L, 4L, 1L, 1L, 3L, 1L, 1L, 1L, 3L, 1L, 3L, 1L, 3L, 3L,
3L, 1L, 1L, 1L, 4L, 1L, 1L, 4L, 1L, 3L, 4L, 4L, 3L, 1L, 1L,
1L, 1L, 3L, 1L, 1L, 1L, 3L, 1L, 4L, 3L, 1L, 3L, 4L, 1L, 1L,
3L, 3L, 1L, 4L), PEAU_CORPS = c(3L, 3L, 1L, 2L, 1L, 1L, 2L,
1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 3L, 1L, 1L, 1L, 2L, 1L,
1L, 3L, 1L, 3L, 3L, 2L, 3L, 1L, 1L, 1L, 1L, 4L, 1L, 1L, 1L,
3L, 1L, 3L, 2L, 1L, 2L, 4L, 1L, 1L, 3L, 3L, 1L, 3L), INTERET_ALIM_NATURELLE = c(4L,
4L, 1L, 2L, 1L, 1L, 4L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 4L, 2L,
2L, 1L, 1L, 1L, 2L, 1L, 1L, 4L, 1L, 4L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 4L, 4L, 1L, 4L, 2L, 1L, 1L,
4L, 2L, 1L, 2L), INTERET_ORIGINE_GEO = c(4L, 2L, 1L, 2L,
1L, 1L, 5L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 5L, 2L, 1L, 1L,
1L, 2L, 1L, 1L, 2L, 1L, 2L, 5L, 2L, 2L, 1L, 1L, 1L, 1L, 2L,
1L, 1L, 1L, 2L, 1L, 5L, 5L, 1L, 4L, 2L, 1L, 1L, 2L, 2L, 1L,
2L), INTERET_VACANCES = c(4L, 2L, 1L, 3L, 1L, 1L, 2L, 1L,
1L, 1L, 3L, 1L, 2L, 1L, 3L, 4L, 3L, 1L, 1L, 1L, 2L, 1L, 1L,
3L, 1L, 4L, 3L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L,
1L, 2L, 2L, 1L, 4L, 3L, 1L, 1L, 2L, 2L, 1L, 2L), INTERET_ENVIRONNEMENT = c(5L,
5L, 1L, 5L, 1L, 1L, 5L, 1L, 1L, 1L, 3L, 1L, 3L, 1L, 3L, 3L,
3L, 1L, 1L, 1L, 3L, 1L, 1L, 3L, 1L, 3L, 3L, 3L, 3L, 1L, 1L,
1L, 1L, 3L, 1L, 1L, 1L, 3L, 1L, 3L, 5L, 1L, 5L, 3L, 1L, 1L,
3L, 5L, 1L, 3L), INTERET_COMPOSITION = c(2L, 2L, 1L, 4L,
1L, 1L, 4L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 1L,
1L, 2L, 1L, 1L, 4L, 1L, 4L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L,
1L, 1L, 1L, 4L, 1L, 2L, 4L, 1L, 4L, 2L, 1L, 1L, 2L, 2L, 1L,
2L), DataCRMSanoflore.Nb_achats = c(4, 7, 3, 3, 4, 10, 3,
4, 14, 4, 6, 6, 9, 22, 26, 17, 22, 8, 3, 9, 10, 14, 3, 7,
12, 6, 14, 16, 3, 3, 3, 3, 12, 7, 3, 6, 6, 12, 18, 12, 15,
6, 21, 7, 6, 16, 13, 16, 14, 12), OUTCOME = structure(c(1L,
2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L,
1L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L), .Label = c("0", "1"), class = "factor")), .Names = c("DataCRMSanoflore.Year_Sales",
"DataCRMSanoflore.HOURS_INSCR", "DataCRMSanoflore.Month_Sales",
"DataCRMSanoflore.Date_Sales", "DataCRMSanoflore.HOURS_INSCR.1",
"DataCRMSanoflore.Year_Creation_Sales", "DataCRMSanoflore.Month_Creation_Sales",
"DataCRMSanoflore.Day_Creation_Sales", "DataCRMSanoflore.Year_Validation_Sales",
"DataCRMSanoflore.Month_Validation_Sales", "DataCRMSanoflore.Day_Validation_Sales",
"DataCRMSanoflore.AGE_CUSTUMER", "DataCRMSanoflore.MEAN_PURCHASE",
"DataCRMSanoflore.NUMBER_GIFTS", "SENSIBILITE", "IMPERFECTIONS",
"BRILLANCE", "GRAIN_PEAU", "RIDES_VISAGE", "ALLERGIES", "MAINS",
"PEAU_CORPS", "INTERET_ALIM_NATURELLE", "INTERET_ORIGINE_GEO",
"INTERET_VACANCES", "INTERET_ENVIRONNEMENT", "INTERET_COMPOSITION",
"DataCRMSanoflore.Nb_achats", "OUTCOME"), row.names = c(22L,
33L, 40L, 48L, 54L, 59L, 74L, 78L, 87L, 89L, 104L, 115L, 121L,
141L, 159L, 161L, 163L, 165L, 196L, 202L, 211L, 222L, 272L, 300L,
318L, 325L, 327L, 349L, 374L, 380L, 392L, 393L, 394L, 398L, 427L,
440L, 449L, 456L, 470L, 477L, 479L, 490L, 505L, 508L, 514L, 520L,
528L, 531L, 534L, 543L), class = "data.frame")
Then when I want to fit the model using this code:
fit = lrm(OUTCOME ~ .-1,data = cs_data_train,x=T, y=T)
It gives an error:
singular information matrix in lrm.fit (rank= 148 ). Offending
variable(s): DataCRMSanoflore.HOURS_INSCR.1 Error in lrm(OUTCOME ~ .
- 1, data = cs_data_train, x = T, y = T) : Unable to fit model using “lrm.fit”
I searched but I could not resolve this issue. Thank you for your help!
EDIT:
As said in the comment below, I need to remove one variable from each pair of highly correlated variables. So I wrote this code:
> highlyCorrelated <- findCorrelation(correlationMatrix, cutoff=(0.7),verbose = FALSE)
> print(highlyCorrelated)
[1] 21 20 26 15 18 17 22 16 25 19 23 24 6 9 7 10 28 2
> important_var=colnames(DATA_BASE[,-highlyCorrelated])
> important_var
[1] "DataCRMSanoflore.Year_Sales" "DataCRMSanoflore.Date_Sales" "DataCRMSanoflore.HOURS_INSCR.1"
[4] "DataCRMSanoflore.Day_Creation_Sales" "DataCRMSanoflore.MEAN_PURCHASE" "OUTCOME"
> DATA_BASE<-DATA_BASE[,-highlyCorrelated]
> str(DATA_BASE)
'data.frame': 5775 obs. of 6 variables:
$ DataCRMSanoflore.Year_Sales : num 2 1 2 1 2 1 1 1 1 2 ...
$ DataCRMSanoflore.Date_Sales : num 13 3 10 22 23 26 13 1 12 2 ...
$ DataCRMSanoflore.HOURS_INSCR.1 : num 17 14 18 17 16 11 22 14 23 17 ...
$ DataCRMSanoflore.Day_Creation_Sales: num 13 11 15 2 31 26 23 1 5 2 ...
$ DataCRMSanoflore.MEAN_PURCHASE : num 0 71.8 50.7 18.7 0 ...
$ OUTCOME : Factor w/ 2 levels "0","1": 1 1 2 1 1 1 2 2 1 1 ...
But I then get the same error:
Error in lrm(OUTCOME ~ . - 1, data = train, x = T, y = T) : Unable
to fit model using “lrm.fit”
This is really weird!
How can I resolve this, please?
This question already has answers here:
Convert data from long format to wide format with multiple measure columns
(6 answers)
Closed 4 years ago.
I need to reorganize a large dataset into a specific format for further analysis. Right now the data are in long format, with multiple records through time for each point. I need to reshape the data so that each point has a single record, but it will add many new columns of the time-specific data. I’ve looked at previous similar posts but I need to ultimately convert several of the current variables into columns, and I can’t find an example of such. Is there a way to accomplish this in a single reshape, or will I have to do several and then concatenate the new columns back together? Another wrinkle before I post the example is that not all points were sampled at each time-step, so I need those values to show up as NA. For example, (see data below) SitePoint A1 was not sampled at all in 2012, SitePoint A10 was not sampled during the first round in 2012, but K83 was sampled all nine times.
mydatain <- structure(list(SitePoint = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 6L, 6L), .Label = c("A1", "A10", "K145", "K83", "T15",
"T213"), class = "factor"), Year_Rotation = structure(c(1L, 2L,
3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L, 8L, 9L, 1L, 2L, 4L, 5L,
6L, 7L, 8L, 9L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 9L, 1L, 7L), .Label = c("2010_1", "2010_2",
"2010_3", "2011_1", "2011_2", "2011_3", "2012_1", "2012_2", "2012_3"
), class = "factor"), MR_Fire = structure(c(5L, 6L, 6L, 2L, 9L,
9L, 5L, 6L, 6L, 2L, 9L, 9L, 7L, 8L, 16L, 17L, 21L, 22L, 23L,
25L, 3L, 4L, 10L, 11L, 12L, 13L, 14L, 15L, 18L, 19L, 20L, 1L,
2L, 2L, 5L, 6L, 6L, 11L, 11L, 12L, 7L, 24L), .Label = c("0",
"1", "10", "11", "12", "13", "14", "15", "2", "23", "24", "25",
"35", "36", "37", "39", "40", "47", "48", "49", "51", "52", "53",
"8", "9"), class = "factor"), fire_seas = structure(c(2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L,
1L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 1L, 3L), .Label = c("dry", "fire", "wet"
), class = "factor"), OptTSF = c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L,
1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L,
0L, 1L, 1L)), .Names = c("SitePoint", "Year_Rotation", "MR_Fire",
"fire_seas", "OptTSF"), row.names = c(31L, 32L, 33L, 34L, 35L,
36L, 67L, 68L, 69L, 70L, 71L, 72L, 73L, 74L, 10543L, 10544L,
10545L, 10546L, 10547L, 10548L, 10549L, 10550L, 14988L, 14989L,
14990L, 14991L, 14992L, 14993L, 14994L, 14995L, 14996L, 17370L,
17371L, 17372L, 17373L, 17374L, 17375L, 17376L, 17377L, 17378L,
19353L, 19354L), class = "data.frame")
Ultimately I need something like this:
myfinal <- structure(list(SitePoint = structure(1:6, .Label = c("A1", "A10",
"K145", "K83", "T15", "T213"), class = "factor"), MR_Fire_2010_1 = c(12L,
12L, 39L, 23L, 0L, 14L), MR_Fire_2010_2 = c(13L, 13L, 40L, 24L,
1L, NA), MR_Fire_2010_3 = c(13L, 13L, NA, 25L, 1L, NA), MR_Fire_2011_1 = c(1L,
1L, 51L, 35L, 12L, NA), MR_Fire_2011_2 = c(2L, 2L, 52L, 36L,
13L, NA), MR_Fire_2011_3 = c(2L, 2L, 53L, 37L, 13L, NA), MR_Fire_2012_1 = c(NA,
NA, 9L, 47L, 24L, 8L), MR_Fire_2012_2 = c(NA, 14L, 10L, 48L,
24L, NA), MR_Fire_2012_3 = c(NA, 15L, 11L, 49L, 25L, NA), season_2010_1 = structure(c(2L,
2L, 1L, 2L, 2L, 1L), .Label = c("dry", "fire"), class = "factor"),
season_2010_2 = structure(c(2L, 2L, 1L, 2L, 2L, NA), .Label = c("dry",
"fire"), class = "factor"), season_2010_3 = structure(c(1L,
1L, NA, 1L, 1L, NA), .Label = "fire", class = "factor"),
season_2011_1 = structure(c(2L, 2L, 1L, 2L, 2L, NA), .Label = c("dry",
"fire"), class = "factor"), season_2011_2 = structure(c(2L,
2L, 1L, 2L, 2L, NA), .Label = c("dry", "fire"), class = "factor"),
season_2011_3 = structure(c(2L, 2L, 1L, 2L, 2L, NA), .Label = c("dry",
"fire"), class = "factor"), season_2012_1 = structure(c(NA,
NA, 2L, 1L, 1L, 2L), .Label = c("fire", "wet"), class = "factor"),
season_2012_2 = structure(c(NA, 1L, 2L, 1L, 1L, NA), .Label = c("fire",
"wet"), class = "factor"), season_2012_3 = structure(c(NA,
1L, 2L, 1L, 1L, NA), .Label = c("fire", "wet"), class = "factor"),
OptTSF_2010_1 = c(1L, 1L, 0L, 1L, 1L, 1L), OptTSF_2010_2 = c(1L,
1L, 0L, 1L, 1L, NA), OptTSF_2010_3 = c(1L, 1L, NA, 1L, 1L,
NA), OptTSF_2011_1 = c(1L, 1L, 0L, 0L, 1L, NA), OptTSF_2011_2 = c(1L,
1L, 0L, 0L, 1L, NA), OptTSF_2011_3 = c(1L, 1L, 0L, 0L, 1L,
NA), OptTSF_2012_1 = c(NA, NA, 1L, 0L, 0L, 1L), OptTSF_2012_2 = c(NA,
1L, 1L, 0L, 0L, NA), OptTSF_2012_3 = c(NA, 1L, 1L, 0L, 0L,
NA)), .Names = c("SitePoint", "MR_Fire_2010_1", "MR_Fire_2010_2",
"MR_Fire_2010_3", "MR_Fire_2011_1", "MR_Fire_2011_2", "MR_Fire_2011_3",
"MR_Fire_2012_1", "MR_Fire_2012_2", "MR_Fire_2012_3", "season_2010_1",
"season_2010_2", "season_2010_3", "season_2011_1", "season_2011_2",
"season_2011_3", "season_2012_1", "season_2012_2", "season_2012_3",
"OptTSF_2010_1", "OptTSF_2010_2", "OptTSF_2010_3", "OptTSF_2011_1",
"OptTSF_2011_2", "OptTSF_2011_3", "OptTSF_2012_1", "OptTSF_2012_2",
"OptTSF_2012_3"), class = "data.frame", row.names = c(NA, -6L
))
The actual dataset is about 23656 records X 15 variables, so doing it by hand is likely to cause major headaches and potential for mistakes. Any help or suggestions are appreciated. If this has been answered elsewhere, apologies. I couldn’t find anything directly applicable; everything seemed to relate to three columns, with only one of those being extracted as new variables. Thanks.
SP
dcast from the devel version of data.table i.e., v1.9.5 can cast multiple columns simultaneously. It can be installed from here.
library(data.table) # v1.9.5+
# Cast the three measure columns to wide format in a single call: one row
# per SitePoint, with each measure spread across the Year_Rotation values.
dcast(
  data = setDT(mydatain),
  formula = SitePoint ~ Year_Rotation,
  value.var = c("MR_Fire", "fire_seas", "OptTSF")
)
You can use reshape to change the structure of your dataframe from long to wide using the following code:
# Long -> wide with base R: one row per SitePoint, and one set of columns
# for every distinct Year_Rotation value.
reshape(
  data = mydatain,
  direction = "wide",
  idvar = "SitePoint",
  timevar = "Year_Rotation"
)
I am currently trying to make a 'heat map' using ggplot2 to display a series of p-values, but can't figure out how to tailor the actual color assignments and legend.
# Melt to long format, then turn var2 and var1 into ordered factors whose
# levels follow the order of first appearance in the melted data.
sampledata.m <- melt(sampledata)
sampledata.m[c("var2", "var1")] <- lapply(
  sampledata.m[c("var2", "var1")],
  function(column) {
    labels <- as.character(column)
    factor(labels, levels = unique(labels), ordered = TRUE)
  }
)
This was done so that I could maintain the order of my variables.
# Heat map of the melted data: one tile per (var2, var1) pair, filled by
# `value`, with a light green -> dark green -> black gradient on [0, 1].
# Fix: the original scale_fill_gradientn() call ended with one closing
# parenthesis too many, so the snippet could not be parsed.
# NOTE(review): `colours` lists 3 colours but `values` lists 5 positions;
# ggplot2 documents `values` as one position per colour -- confirm the
# intended colour/position mapping.
p <- ggplot(sampledata.m, aes(var2, var1)) +
  geom_tile(aes(fill = value), colour = "transparent") +
  scale_fill_gradientn(
    colours = c("light green", "dark green", "black"),
    values = rescale(c(0, 0.0003, 0.05, 0.5, 1)),
    limits = c(0, 1)
  )
# Draw the plot with a black-and-white theme, no axis titles or ticks, the
# legend at the bottom and rotated x-axis labels.
# NOTE(review): `base_size` is not defined in this snippet; it must exist
# before this line runs.
p + theme_bw(base_size = base_size) + labs(x = "", y = "") +
  scale_x_discrete(expand = c(0, 0)) +
  theme(
    legend.position = "bottom", axis.ticks = element_blank(),
    axis.text.x = element_text(
      size = base_size * 0.8, angle = 310,
      hjust = 0, colour = "black"
    )
  )
This creates a nice looking plot, however my legend and my color gradient don't represent the rescale that I assigned. Forgive my ignorance if this is a simple fix, but I've only been coding R for about 2 weeks now. Ideally, I would love my plot and legend to mimic the color scheme and legend labeling similar to this paper: http://www.ncbi.nlm.nih.gov/pubmed/22496159
structure(list(var1 = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L
), .Label = c("A", "B", "C",
"D", "E"), class = "factor"), var2 = structure(c(1L,
5L, 23L, 18L, 9L, 8L, 14L, 12L, 20L, 6L, 21L, 11L, 2L, 22L, 10L,
3L, 19L, 16L, 4L, 7L, 15L, 17L, 13L, 24L, 1L, 5L, 23L, 18L, 9L,
8L, 14L, 12L, 20L, 6L, 21L, 11L, 2L, 22L, 10L, 3L, 19L, 16L,
4L, 7L, 15L, 17L, 13L, 24L, 1L, 5L, 23L, 18L, 9L, 8L, 14L, 12L,
20L, 6L, 21L, 11L, 2L, 22L, 10L, 3L, 19L, 16L, 4L, 7L, 15L, 17L,
13L, 24L, 1L, 5L, 23L, 18L, 9L, 8L, 14L, 12L, 20L, 6L, 21L, 11L,
2L, 22L, 10L, 3L, 19L, 16L, 4L, 7L, 15L, 17L, 13L, 24L, 1L, 5L,
23L, 18L, 9L, 8L, 14L, 12L, 20L, 6L, 21L, 11L, 2L, 22L, 10L,
3L, 19L, 16L, 4L, 7L, 15L, 17L, 13L), .Label = c("1", "2",
"3", "4", "5", "6", "7", "8",
"9", "10", "11", "12", "13", "14", "15",
"16", "17", "18", "19", "20", "21",
"22", "23", "24"), class = "factor"), variable = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L), .Label = "pvalue", class = "factor"),
value = c(0.810172671, 0.596026338, 0.076550169, 0.908670635,
0.300418653, 0.051553286, 0.124196482, 0.601568833, 0.058431468,
0.341726981, 0.876674726, 0.002698295, 0.812059425, 0.068199656,
0.758383287, 0.60362134, 0.89265723, 0.246111936, 0.156348035,
0.909574522, 0.020202377, 0.388843992, 0.769441835, 0.102272916,
0.38895717, 0.882296525, 0.792438683, 0.000491393, 0.004233434,
0.202424095, 0.426941568, 0.08520186, 0.763036306, 0.602828564,
0.037278697, 0.121642743, 0.669123606, 0.974328438, 0.834329923,
0.050413697, 0.078476666, 0.387647156, 0.000540422, 0.379576632,
0.361428444, 0.502439758, 0.001326035, 0.027652693, 0.188885638,
0.579244445, 0.471985778, 0.677458228, 0.119307242, 0.364857868,
0.238260538, 0.53472206, 0.204344281, 0.291888993, 0.295809688,
0.00029, 0.005476157, 0.960975822, 0.00029, 0.055915429,
0.618284682, 0.040605253, 0.521649682, 0.421086546, 0.164333061,
0.755528982, 0.306854182, 0.012832628, 0.270393143, 0.946675764,
0.59227376, 0.112658388, 0.429091426, 0.01662083, 0.017342483,
0.065817234, 0.012140224, 0.359828816, 0.031969725, 0.00029,
0.14555102, 0.18865081, 0.00029, 0.064107531, 0.505257768,
0.070224536, 0.017082975, 0.375864198, 0.00029, 0.104103689,
0.898979883, 0.004879605, 0.003597954, 0.036722932, 0.849058218,
0.00029, 0.003739938, 0.00029, 0.00029, 0.00029, 0.008179017,
0.193870353, 0.460181712, 0.389475522, 0.00029, 0.8785017,
0.070414642, 0.584977921, 0.990764677, 0.767253318, 0.002234906,
0.051331823, 0.00446149, 0.234477639, 0.275139791)), .Names = c("var1", "var2", "variable", "value"), row.names = c(NA, -119L), class = "data.frame")
I'm not going to get into all of the theme settings you've got - as I understand it, the key to your problem is the scale of the fill gradient. You can set this in scale_fill_gradient() with a log transformation:
# Same heat map, but with the fill gradient on a log scale running from
# light green (low) to black (high).
# NOTE(review): the break at 0 has no finite position after a log transform
# (log(0) is -Inf) -- confirm it is meant to be listed.
p <- ggplot(data = sampledata.m, mapping = aes(x = var2, y = var1)) +
  geom_tile(aes(fill = value), colour = "transparent") +
  scale_fill_gradient(
    low = "light green",
    high = "black",
    trans = "log",
    breaks = c(0, 0.001, 0.05, 0.5)
  )
# Small example: two numeric series (a, b) keyed by the letters e..k.
dt <- data.frame(
  N = letters[5:11],
  a = c(0.01, 0.05, 0.1, 0.5, 1, 5, 10),
  b = c(10, 20, 50, 100, 200, 1000, 2000)
)
# Long format: the series name goes to `Cls`, its value to `Val`.
dt.mlt <- melt(dt, variable.name = "Cls", value.name = "Val")
# Tile plot with a diverging green -> black -> red fill on a log scale; each
# tile is labelled with its value. The text layer inherits the plot's data
# and x/y aesthetics and only adds the label mapping.
# NOTE(review): the break at 0 has no finite position after a log transform
# (log(0) is -Inf) -- confirm it is meant to be listed.
ggplot(data = dt.mlt, mapping = aes(x = N, y = Cls, fill = Val)) +
  geom_tile() +
  scale_fill_gradient2(
    low = "green",
    mid = "black",
    high = "red",
    trans = "log",
    breaks = c(0, 0.01, 0.1, 1, 10, 100, 1000)
  ) +
  geom_text(aes(label = Val))
But if I add the midpoint=10 to the scale_fill_gradient2, the picture will become: