Need to fill data frame with missing years and value in 2nd column - r

I have data frames with counts from a series of years, 1970-2020, generated by a subset command from a larger data set, i.e. resulting in two columns "Year" and "Count":
Year Count
1987 8
1989 1
1991 1
1992 4
1995 11
1996 3
1997 7
.
.
.
2019 2
2020 5
There are missing years where Count=0, and I need a procedure to fill these df's with the missing years and Count=0. I have this script that I can't get to work:
library(tidyr)
aug <- subset(mainframe, month==8)
complete(aug, year = 1987:2020, fill = list(Count = 0))
Here's a sample dataframe 'aug':
dput(aug)
structure(list(month = structure(c(8L, 8L, 8L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L), .Label = c("1", "2", "3", "4", "5", "6",
"7", "8", "9", "10", "11", "12"), class = "factor"), year = structure(1:28, .Label = c("1987",
"1988", "1989", "1990", "1991", "1992", "1993", "1994", "1995",
"1996", "1998", "2000", "2001", "2002", "2003", "2004", "2005",
"2006", "2007", "2008", "2009", "2010", "2011", "2013", "2015",
"2016", "2018", "2020"), class = "factor"), Count = c(4L, 0L,
3L, 3L, 0L, 0L, 1L, 0L, 1L, 1L, 3L, 0L, 0L, 0L, 0L, 2L, 0L, 0L,
0L, 2L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 1L)), row.names = c(8L, 20L,
32L, 44L, 56L, 68L, 80L, 92L, 104L, 116L, 128L, 140L, 152L, 164L,
176L, 188L, 200L, 212L, 224L, 236L, 248L, 260L, 272L, 284L, 296L,
308L, 320L, 332L), class = "data.frame")

If I get your question correctly, you want to have a complete dataframe containing year 1987 to 2020, but there are some missing years in your aug dataframe, and you want to fill in the missing years with month = "8" and Count = 0.
Here's a tidyverse approach (in your original aug dataframe, your year is factor, so at the end of my solution I also transformed it into factor):
Your dataset
month year Count
8 8 1987 4
20 8 1988 0
32 8 1989 3
44 8 1990 3
56 8 1991 0
68 8 1992 0
80 8 1993 1
92 8 1994 0
104 8 1995 1
116 8 1996 1
128 8 1998 3
140 8 2000 0
152 8 2001 0
164 8 2002 0
176 8 2003 0
188 8 2004 2
200 8 2005 0
212 8 2006 0
224 8 2007 0
236 8 2008 2
248 8 2009 0
260 8 2010 1
272 8 2011 1
284 8 2013 0
296 8 2015 0
308 8 2016 1
320 8 2018 0
332 8 2020 1
Solution
library(tidyverse)
aug %>% mutate(year = as.numeric(as.character(year))) %>%
complete(year = first(year):max(year), # or year = 1987:2020
fill = list(month = "8", Count = 0)) %>%
mutate(year = as.factor(year))
Output
year month Count
1987 8 4
1988 8 0
1989 8 3
1990 8 3
1991 8 0
1992 8 0
1993 8 1
1994 8 0
1995 8 1
1996 8 1
1997 8 0
1998 8 3
1999 8 0
2000 8 0
2001 8 0
2002 8 0
2003 8 0
2004 8 2
2005 8 0
2006 8 0
2007 8 0
2008 8 2
2009 8 0
2010 8 1
2011 8 1
2012 8 0
2013 8 0
2014 8 0
2015 8 0
2016 8 1
2017 8 0
2018 8 0
2019 8 0
2020 8 1

Related

use pivot_longer to created multiple value columns

Can you specify multiple value columns in pivot_longer()?
My original data (posted below) I had to transpose to be in a wider format. Then I want to take this new transposed data and return it to the original format (lets assume I did some transformations/ and can't use the original data). To re-transpose back into a longer format I have to use both pivot_longer() then pivot_wider() because there are multiple values I want to be their own columns.
I would like to avoid the pivot_wider() and just use pivot_longer() when re-transposing the data back if possible.
As a side note the unique identifier for each row is the combination of id and report.
Code
dfa <- dfx %>%
pivot_wider(
id_cols = id,
names_from = report,
values_from = c(pts,
p1, p2, p3,p4,p5,
d1,d2,d3,d4,d5)
)
df_return <- dfa %>%
pivot_longer(cols = !id,
names_to = c('vars','report'),
names_pattern = "([a-z0-9]+)_(.*)",
values_drop_na = TRUE) %>%
pivot_wider(id_cols = c(id, report),
names_from = vars,
values_from = value)
Data
structure(list(pts = c(431L, 167L, 167L, 760L, 348L, 768L, 619L,
169L, 416L, 155L, 47L, 37L, 6L, 17L, 22L, 1L, 149L, 3L, 284L,
7L), d1 = c(129L, 48L, 52L, 166L, 90L, 178L, 184L, 20L, 158L,
42L, 3L, 15L, 2L, 7L, 9L, 0L, 54L, 1L, 69L, 6L), d2 = c(172L,
67L, 64L, 257L, 132L, 255L, 261L, 30L, 201L, 61L, 9L, 20L, 2L,
9L, 12L, 0L, 69L, 1L, 123L, 6L), d3 = c(205L, 77L, 73L, 312L,
153L, 307L, 310L, 39L, 235L, 70L, 12L, 21L, 2L, 10L, 12L, 0L,
77L, 2L, 139L, 6L), d4 = c(227L, 81L, 82L, 363L, 177L, 350L,
342L, 52L, 257L, 75L, 15L, 24L, 2L, 12L, 13L, 0L, 86L, 2L, 151L,
6L), d5 = c(248L, 88L, 92L, 414L, 192L, 387L, 374L, 66L, 279L,
86L, 16L, 26L, 2L, 12L, 15L, 0L, 90L, 3L, 164L, 7L), report = c("2006",
"2006", "2006", "2006", "2006", "2006", "2006", "2006", "2006",
"2006", "2006", "2006", "2006", "2006", "2006", "2006", "2006",
"2006", "2006", "2006"), p1 = c(1.0360364394094, 1.22979866735429,
1.21423740998677, 0.87891144382145, 0.810310827130179, 0.965901663505148,
1.02621739486337, 0.69319116444678, 1.18938130906092, 1.04220816515009,
0.683545688193799, 1.05179228560845, 1.51468104603873, 1.15200888955888,
0.948041330809858, 0, 1.23227405154205, 3.11155226007598, 0.908056299174703,
1.57712371536702), p2 = c(0.986884800185635, 1.23066225499351,
1.07336930339221, 0.966734485786667, 0.87421381769247, 0.974775549615439,
1.06274655160121, 0.705150638862953, 1.12934487417415, 1.10234720984265,
1.11084642794988, 1.06558505521222, 1.0197697665798, 1.15605466288868,
1.01469386643771, 0, 1.17689541437029, 1.42783711234222, 1.16124019281912,
1.27756288696848), p3 = c(0.993575954694177, 1.17968893104311,
1.02608313159672, 0.965200422661265, 0.862910478266102, 0.976436243011877,
1.06679768502287, 0.722966824498357, 1.12591016481614, 1.05867627021151,
1.11227024088529, 0.98275117259764, 0.803738347803303, 1.09341228936369,
0.878291424560146, 0, 1.10500006213832, 1.93128861370172, 1.0949534752299,
1.14755029569502), p4 = c(0.986244633210798, 1.08520792731261,
1.01128789684232, 0.977245321880205, 0.89785754450165, 0.981536130349165,
1.04454959427709, 0.807825580390444, 1.1035817255901, 1.00192975678877,
1.14371311954082, 1.02812279984398, 0.66742040677939, 1.15526702119886,
0.878479047328667, 0, 1.10559111180852, 1.4717526513624, 1.05479137550321,
1.07005088091939), p5 = c(0.992583778223324, 1.06016737802091,
1.02253158347207, 1.00026491073882, 0.896290873874826, 0.985549150023704,
1.04187931404895, 0.886647217836043, 1.09837506943384, 1.0323002052873,
1.05833769015682, 1.05042831618603, 0.592515872759586, 1.05106420250504,
0.961672664191663, 0, 1.05868657273466, 1.81304485775152, 1.04168095802127,
1.19437925124365), id = c("ID 1", "ID 2", "ID 3", "ID 4", "ID 5",
"ID 6", "ID 7", "ID 8", "ID 9", "ID 10", "ID 11", "ID 12", "ID 13",
"ID 14", "ID 15", "ID 16", "ID 17", "ID 18", "ID 19", "ID 20"
)), row.names = c(NA, 20L), class = "data.frame")
We may need the .value in the names_to, which selects the prefix part of the column name before the _ as the column value and the 'report' will return the suffix column name
library(tidyr)
pivot_longer(dfa, cols = -id, names_to = c(".value", "report"),
names_sep = "_")
-output
# A tibble: 20 × 13
id report pts p1 p2 p3 p4 p5 d1 d2 d3 d4 d5
<chr> <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <int> <int> <int> <int>
1 ID 1 2006 431 1.04 0.987 0.994 0.986 0.993 129 172 205 227 248
2 ID 2 2006 167 1.23 1.23 1.18 1.09 1.06 48 67 77 81 88
3 ID 3 2006 167 1.21 1.07 1.03 1.01 1.02 52 64 73 82 92
4 ID 4 2006 760 0.879 0.967 0.965 0.977 1.00 166 257 312 363 414
5 ID 5 2006 348 0.810 0.874 0.863 0.898 0.896 90 132 153 177 192
6 ID 6 2006 768 0.966 0.975 0.976 0.982 0.986 178 255 307 350 387
7 ID 7 2006 619 1.03 1.06 1.07 1.04 1.04 184 261 310 342 374
8 ID 8 2006 169 0.693 0.705 0.723 0.808 0.887 20 30 39 52 66
9 ID 9 2006 416 1.19 1.13 1.13 1.10 1.10 158 201 235 257 279
10 ID 10 2006 155 1.04 1.10 1.06 1.00 1.03 42 61 70 75 86
11 ID 11 2006 47 0.684 1.11 1.11 1.14 1.06 3 9 12 15 16
12 ID 12 2006 37 1.05 1.07 0.983 1.03 1.05 15 20 21 24 26
13 ID 13 2006 6 1.51 1.02 0.804 0.667 0.593 2 2 2 2 2
14 ID 14 2006 17 1.15 1.16 1.09 1.16 1.05 7 9 10 12 12
15 ID 15 2006 22 0.948 1.01 0.878 0.878 0.962 9 12 12 13 15
16 ID 16 2006 1 0 0 0 0 0 0 0 0 0 0
17 ID 17 2006 149 1.23 1.18 1.11 1.11 1.06 54 69 77 86 90
18 ID 18 2006 3 3.11 1.43 1.93 1.47 1.81 1 1 2 2 3
19 ID 19 2006 284 0.908 1.16 1.09 1.05 1.04 69 123 139 151 164
20 ID 20 2006 7 1.58 1.28 1.15 1.07 1.19 6 6 6 6 7

Dividing non-equal dataframes based on a group condition

I have two dataframes, with a similar strucure:
df_I <- structure(list(year = c("2006", "2006", "2006", "2006", "2006",
"2006", "2006", "2006", "2006"), code = c(0, 1110,
1120, 1130, 1220, 1230, 1310, 1320, 1330), `1` = c(1L,
8L, 2L, 2L, 0L, 2L, 0L, 1L, 0L), `2` = c(0L, 10L, 0L, 0L,
0L, 2L, 1L, 3L, 1L), `3` = c(4L, 2L, 1L, 2L, 0L, 4L,
0L, 0L, 3L), `4` = c(4L, 6L, 0L, 3L, 1L, 3L, 0L, 0L, 3L),
totaal = c(11, 26, 3, 7, 1, 9, 7, 7, 6)), row.names = c(NA,
-9L), class = c("tbl_df", "tbl", "data.frame"))
# A tibble: 9 × 7
year code `1` `2` `3` `4` totaal
<chr> <dbl> <int> <int> <int> <int> <dbl>
1 2006 0 1 0 4 4 11
2 2006 1110 8 10 2 6 26
3 2006 1120 2 0 1 0 3
4 2006 1130 2 0 2 3 7
5 2006 1220 0 0 0 1 1
6 2006 1230 2 2 4 3 9
7 2006 1310 0 1 0 0 7
8 2006 1320 1 3 0 0 7
9 2006 1330 0 1 3 3 6
df_II <- structure(list(year = c("2006", "2006", "2006", "2006", "2006",
"2006", "2006", "2006", "2006", "2006"), code = c(0, 1110,
1120, 1130, 1210, 1220, 1230, 1310, 1320, 1330), `1` = c(15806L,
655L, 105L, 328L, 138L, 452L, 445L, 471L, 672L, 615L), `2` = c(9681L,
337L, 68L, 215L, 97L, 357L, 366L, 245L, 440L, 360L), `3` = c(10457L,
221L, 40L, 123L, 65L, 325L, 322L, 151L, 352L, 332L), `4` = c(7109L,
128L, 5L, 64L, 56L, 256L, 240L, 83L, 274L, 192L), totaal = c(43053,
1341, 218, 730, 356, 1390, 1373, 950, 1738, 1499)), row.names = c(NA,
-10L), class = c("tbl_df", "tbl", "data.frame"))
# A tibble: 10 × 7
year code `1` `2` `3` `4` totaal
<chr> <dbl> <int> <int> <int> <int> <dbl>
1 2006 0 15806 9681 10457 7109 43053
2 2006 1110 655 337 221 128 1341
3 2006 1120 105 68 40 5 218
4 2006 1130 328 215 123 64 730
5 2006 1210 138 97 65 56 356
6 2006 1220 452 357 325 256 1390
7 2006 1230 445 366 322 240 1373
8 2006 1310 471 245 151 83 950
9 2006 1320 672 440 352 274 1738
10 2006 1330 615 360 332 192 1499
I would like to create a new data.frame df_out, which divides df_I by df_II, for columns 1,2,3,4, totaal by year and code. The issue is that not every code is available for each year.
What is the best way to divide this unequal dataframe?
Desired outcome:
# A tibble: 10 × 7
year code `1` `2` `3` `4` totaal
<chr> <dbl> <int> <int> <int> <int> <dbl>
1 2006 0 1 /15806 0/9681 4/10457 4/7109 11/43053
You could subset the second data frame using %in%, assuming both code columns are properly ordered.
cols <- as.character(1:4)
cbind(df_I[setdiff(names(df_I), cols)], df_I[cols] / subset(df_II, code %in% df_I$code, cols))
# year code totaal 1 2 3 4
# 1 2006 0 11 6.326711e-05 0.000000000 0.0003825189 0.000562667
# 2 2006 1110 26 1.221374e-02 0.029673591 0.0090497738 0.046875000
# 3 2006 1120 3 1.904762e-02 0.000000000 0.0250000000 0.000000000
# 4 2006 1130 7 6.097561e-03 0.000000000 0.0162601626 0.046875000
# 5 2006 1220 1 0.000000e+00 0.000000000 0.0000000000 0.003906250
# 6 2006 1230 9 4.494382e-03 0.005464481 0.0124223602 0.012500000
# 7 2006 1310 7 0.000000e+00 0.004081633 0.0000000000 0.000000000
# 8 2006 1320 7 1.488095e-03 0.006818182 0.0000000000 0.000000000
# 9 2006 1330 6 0.000000e+00 0.002777778 0.0090361446 0.015625000
You could use complete to make the number of rows between the two data frames equal, and then do the division:
library(tidyr)
df_I %<>%
complete(code = df_II$code) %>%
fill(year) %>%
replace(is.na(.), 0)
cbind(df_I[c(1, 2)], df_I[-c(1, 2)] / df_II[-c(1, 2)])
code year `1` `2` `3` `4` totaal
<dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0 2006 0.0000633 0 0.000383 0.000563 0.000255
2 1110 2006 0.0122 0.0297 0.00905 0.0469 0.0194
3 1120 2006 0.0190 0 0.025 0 0.0138
4 1130 2006 0.00610 0 0.0163 0.0469 0.00959
5 1210 2006 0 0 0 0 0
6 1220 2006 0 0 0 0.00391 0.000719
7 1230 2006 0.00449 0.00546 0.0124 0.0125 0.00655
8 1310 2006 0 0.00408 0 0 0.00737
9 1320 2006 0.00149 0.00682 0 0 0.00403
10 1330 2006 0 0.00278 0.00904 0.0156 0.00400

How do I duplicate and add rows between the values of two different columns?

I'm trying to duplicate each observation for all of the years that fall between "styear" and "endyear." So, for example, there should end up being 118 USA rows with years 1898-2016.
Here's the data:
# A tibble: 14 x 9
stateabb ccode styear stmonth stday endyear endmonth endday version
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 USA 2 1898 8 13 2016 12 31 2016
2 UKG 200 1816 1 1 2016 12 31 2016
3 FRN 220 1816 1 1 1940 6 22 2016
4 FRN 220 1945 8 15 2016 12 31 2016
5 GMY 255 1816 1 1 1918 11 11 2016
6 GMY 255 1925 1 1 1945 5 7 2016
7 GMY 255 1991 12 11 2016 12 31 2016
8 AUH 300 1816 1 1 1918 11 3 2016
9 ITA 325 1860 1 1 1943 9 2 2016
10 RUS 365 1816 1 1 1917 12 5 2016
11 RUS 365 1922 1 1 2016 12 31 2016
12 CHN 710 1950 1 1 2016 12 31 2016
13 JPN 740 1895 4 1 1945 8 14 2016
14 JPN 740 1991 12 11 2016 12 31 2016
I have tried various combinations of slice() and seq() and complete(), but I think I'm just too new at coding to a) know what to do and b) how to really understand other answers to similar questions.
Ultimately, I am merging this data with other data and creating 0/1 dummy variable to indicate if a country was a "great power" in a given year. The easiest way I thought of was to do this by creating individual rows for each year a country was a great power (the data in this question) because the data I am merging it with is also in the country-year format. I am open to other options, though, if something else is easier!
Thank you!
I think tidyr::expand() and full_seq() can achieve what you want, with grouping on stateabb and styear since you have multiple start years for some states.
Assuming your data frame is named mydata, something like this. I have retained the column of expanded years and named it filled_year, but you may want to remove it.
library(dplyr)
library(tidyr)
new_data <- mydata %>%
group_by(stateabb, styear) %>%
tidyr::expand(stateabb, full_seq(c(styear, endyear), 1)) %>%
inner_join(mydata) %>%
rename(filled_year = `full_seq(c(styear, endyear), 1)`) %>%
ungroup()
The top and bottom of the USA rows:
new_data %>%
filter(stateabb == "USA") %>%
head()
# A tibble: 6 x 10
styear stateabb filled_year ccode stmonth stday endyear endmonth endday version
<int> <chr> <dbl> <int> <int> <int> <int> <int> <int> <int>
1 1898 USA 1898 2 8 13 2016 12 31 2016
2 1898 USA 1899 2 8 13 2016 12 31 2016
3 1898 USA 1900 2 8 13 2016 12 31 2016
4 1898 USA 1901 2 8 13 2016 12 31 2016
5 1898 USA 1902 2 8 13 2016 12 31 2016
6 1898 USA 1903 2 8 13 2016 12 31 2016
new_data %>%
filter(stateabb == "USA") %>%
tail()
# A tibble: 6 x 10
styear stateabb filled_year ccode stmonth stday endyear endmonth endday version
<int> <chr> <dbl> <int> <int> <int> <int> <int> <int> <int>
1 1898 USA 2011 2 8 13 2016 12 31 2016
2 1898 USA 2012 2 8 13 2016 12 31 2016
3 1898 USA 2013 2 8 13 2016 12 31 2016
4 1898 USA 2014 2 8 13 2016 12 31 2016
5 1898 USA 2015 2 8 13 2016 12 31 2016
6 1898 USA 2016 2 8 13 2016 12 31 2016
Your example data:
mydata <- structure(list(stateabb = c("USA", "UKG", "FRN", "FRN", "GMY",
"GMY", "GMY", "AUH", "ITA", "RUS", "RUS", "CHN", "JPN", "JPN"
), ccode = c(2L, 200L, 220L, 220L, 255L, 255L, 255L, 300L, 325L,
365L, 365L, 710L, 740L, 740L), styear = c(1898L, 1816L, 1816L,
1945L, 1816L, 1925L, 1991L, 1816L, 1860L, 1816L, 1922L, 1950L,
1895L, 1991L), stmonth = c(8L, 1L, 1L, 8L, 1L, 1L, 12L, 1L, 1L,
1L, 1L, 1L, 4L, 12L), stday = c(13L, 1L, 1L, 15L, 1L, 1L, 11L,
1L, 1L, 1L, 1L, 1L, 1L, 11L), endyear = c(2016L, 2016L, 1940L,
2016L, 1918L, 1945L, 2016L, 1918L, 1943L, 1917L, 2016L, 2016L,
1945L, 2016L), endmonth = c(12L, 12L, 6L, 12L, 11L, 5L, 12L,
11L, 9L, 12L, 12L, 12L, 8L, 12L), endday = c(31L, 31L, 22L, 31L,
11L, 7L, 31L, 3L, 2L, 5L, 31L, 31L, 14L, 31L), version = c(2016L,
2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 2016L,
2016L, 2016L, 2016L, 2016L)), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14"))
My guess is that there is a better way to do this, but here is one way with a small reduced data set. The steps are
Make a minimal dataset.
Make function that creates the seq of dates that you want
Split the dataframe into a list of small dataframes by country with dplyr group_split.
Apply a function using purrr map that maps your list into new expanded date dataframes.
Bind list of dataframes back into one dataframe.
library(dplyr)
library(purrr )
df<-data.frame(
stringsAsFactors = FALSE,
stateabb = c("USA", "UKG"),
styear = c(1898L, 1816L),
endyear = c(2016L, 2016L)
)
expand_dates<-function(df) {
stateabb<-df %>% pluck("stateabb")
styear<-df %>% pluck("styear")
endyear<-df%>% pluck("endyear")
years=seq(styear,endyear )
data.frame(years) %>%
mutate(stateabb=stateabb,styear=styear,endyear=endyear)
}
df_new<-df %>%
group_split(stateabb)%>%
map(expand_dates) %>%
bind_rows()
head(df_new)
#> years stateabb styear endyear
#> 1 1816 UKG 1816 2016
#> 2 1817 UKG 1816 2016
#> 3 1818 UKG 1816 2016
#> 4 1819 UKG 1816 2016
#> 5 1820 UKG 1816 2016
#> 6 1821 UKG 1816 2016
Created on 2022-01-13 by the reprex package (v2.0.1)

R programming - data frame manoevur

Suppose I have the following dataframe:
dc tmin tmax cint wcmin wcmax wsmin wsmax gsmin gsmax wd rmin rmax cir lr
1: 24 -1 4 5 -5 -2 20 25 35 40 90 11.8 26.6 14.8 3
2: 41 -3 5 8 -8 -3 15 20 35 40 90 10.0 23.5 13.5 3
3: 48 0 5 5 -4 0 30 35 45 50 45 7.3 19.0 11.7 6
4: 50 0 5 5 -4 0 30 35 45 50 45 7.3 19.0 11.7 6
5: 52 3 5 2 -3 1 20 25 35 40 45 6.7 17.4 10.7 6
6: 57 -2 5 7 -6 -1 25 30 35 40 315 4.4 13.8 9.4 7
lc wc li yd yr nF factdcx
1: 1 3 TRUE 1 2010 2 24
2: 1 3 TRUE 1 2010 8 41
3: 2 3 TRUE 1 2010 0 48
4: 2 3 TRUE 1 2010 0 50
5: 2 3 TRUE 1 2010 0 52
6: 3 3 FALSE 1 2010 0 57
I'd like to turn it into a new dataframe like the following:
dc tmin tmax cint wcmin wcmax wsmin wsmax gsmin gsmax wd rmin rmax cir lr
1: 24 -1 4 5 -5 -2 20 25 35 40 90 11.8 26.6 14.8 3
2: 41 -3 5 8 -8 -3 15 20 35 40 90 10.0 23.5 13.5 3
3: 48 0 5 5 -4 0 30 35 45 50 45 7.3 19.0 11.7 6
4: 52 3 5 2 -3 1 20 25 35 40 45 6.7 17.4 10.7 6
5: 57 -2 5 7 -6 -1 25 30 35 40 315 4.4 13.8 9.4 7
lc wc li yd yr nF factdcx
1: 1 3 TRUE 1 2010 2 24
2: 1 3 TRUE 1 2010 8 41
3: 2 3 TRUE 1 2010 0 (sum of nF for 48 and 50, factdcx) 48
4: 2 3 TRUE 1 2010 0 52
5: 3 3 FALSE 1 2010 0 57
How can I do it? (Surely, the dataframe, abc, is much larger, but I want the sum of all categories of 48 and 50 and group it into a new category, say '48').
Many thanks!
> dput(head(abc1))
structure(list(dc = c(24L, 41L, 48L, 50L, 52L, 57L), tmin = c(-1L,
-3L, 0L, 0L, 3L, -2L), tmax = c(4L, 5L, 5L, 5L, 5L, 5L), cint = c(5L,
8L, 5L, 5L, 2L, 7L), wcmin = c(-5L, -8L, -4L, -4L, -3L, -6L),
wcmax = c(-2L, -3L, 0L, 0L, 1L, -1L), wsmin = c(20L, 15L,
30L, 30L, 20L, 25L), wsmax = c(25L, 20L, 35L, 35L, 25L, 30L
), gsmin = c(35L, 35L, 45L, 45L, 35L, 35L), gsmax = c(40L,
40L, 50L, 50L, 40L, 40L), wd = c(90L, 90L, 45L, 45L, 45L,
315L), rmin = c(11.8, 10, 7.3, 7.3, 6.7, 4.4), rmax = c(26.6,
23.5, 19, 19, 17.4, 13.8), cir = c(14.8, 13.5, 11.7, 11.7,
10.7, 9.4), lr = c(3L, 3L, 6L, 6L, 6L, 7L), lc = c(1L, 1L,
2L, 2L, 2L, 3L), wc = c(3L, 3L, 3L, 3L, 3L, 3L), li = c(TRUE,
TRUE, TRUE, TRUE, TRUE, FALSE), yd = c(1L, 1L, 1L, 1L, 1L,
1L), yr = c(2010L, 2010L, 2010L, 2010L, 2010L, 2010L), nF = c(2L,
8L, 0L, 0L, 0L, 0L), factdcx = structure(1:6, .Label = c("24",
"41", "48", "50", "52", "57", "70"), class = "factor")), .Names = c("dc",
"tmin", "tmax", "cint", "wcmin", "wcmax", "wsmin", "wsmax", "gsmin",
"gsmax", "wd", "rmin", "rmax", "cir", "lr", "lc", "wc", "li",
"yd", "yr", "nF", "factdcx"), class = c("data.table", "data.frame"
), row.names = c(NA, -6L), .internal.selfref = <pointer: 0x054b24a0>)
Still got a problem, sir/madam:
> head(abc1 (updated))
dc tmin tmax cint wcmin wcmax wsmin wsmax gsmin gsmax wd rmin rmax cir lr
1: 24 -1 4 5 -5 -2 20 25 35 40 90 11.8 26.6 14.8 3
2: 41 -3 5 8 -8 -3 15 20 35 40 90 10.0 23.5 13.5 3
3: 48 0 5 5 -4 0 30 35 45 50 45 7.3 19.0 11.7 6
4: 52 3 5 2 -3 1 20 25 35 40 45 6.7 17.4 10.7 6
5: 57 -2 5 7 -6 -1 25 30 35 40 315 4.4 13.8 9.4 7
6: 70 -2 3 5 -4 -1 20 25 30 35 360 3.6 10.2 6.6 7
lc wc li yd yr nF factdcx
1: 1 3 TRUE 1 2010 2 24
2: 1 3 TRUE 1 2010 8 41
3: 2 3 TRUE 1 2010 57 48
4: 2 3 TRUE 1 2010 0 52
5: 3 3 FALSE 1 2010 0 57
6: 3 2 TRUE 1 2010 1 70
The sum of nF was incorrect, it should be zero.
Try
library(data.table)
unique(setDT(df1)[, factdcx:= as.character(factdcx)][factdcx %chin%
c('48','50'), c('dc', 'factdcx', 'nF') := list('48', '48', sum(nF))])
# dc tmin tmax cint wcmin wcmax wsmin wsmax gsmin gsmax wd rmin rmax cir lr
#1: 24 -1 4 5 -5 -2 20 25 35 40 90 11.8 26.6 14.8 3
#2: 41 -3 5 8 -8 -3 15 20 35 40 90 10.0 23.5 13.5 3
#3: 48 0 5 5 -4 0 30 35 45 50 45 7.3 19.0 11.7 6
#4: 52 3 5 2 -3 1 20 25 35 40 45 6.7 17.4 10.7 6
#5: 57 -2 5 7 -6 -1 25 30 35 40 315 4.4 13.8 9.4 7
# lc wc li yd yr nF factdcx
#1: 1 3 TRUE 1 2010 2 24
#2: 1 3 TRUE 1 2010 8 41
#3: 2 3 TRUE 1 2010 0 48
#4: 2 3 TRUE 1 2010 0 52
#5: 3 3 FALSE 1 2010 0 57
For abc1,
res1 <- unique(setDT(abc1)[, factdcx:= as.character(factdcx)][factdcx %chin%
c('48','50'), c('dc', 'factdcx', 'nF') := list(48, '48', sum(nF))])
res1
# dc tmin tmax cint wcmin wcmax wsmin wsmax gsmin gsmax wd rmin rmax cir lr
#1: 24 -1 4 5 -5 -2 20 25 35 40 90 11.8 26.6 14.8 3
#2: 41 -3 5 8 -8 -3 15 20 35 40 90 10.0 23.5 13.5 3
#3: 48 0 5 5 -4 0 30 35 45 50 45 7.3 19.0 11.7 6
#4: 52 3 5 2 -3 1 20 25 35 40 45 6.7 17.4 10.7 6
#5: 57 -2 5 7 -6 -1 25 30 35 40 315 4.4 13.8 9.4 7
# lc wc li yd yr nF factdcx
#1: 1 3 TRUE 1 2010 2 24
#2: 1 3 TRUE 1 2010 8 41
#3: 2 3 TRUE 1 2010 0 48
#4: 2 3 TRUE 1 2010 0 52
#5: 3 3 FALSE 1 2010 0 57
data
df1 <- structure(list(dc = structure(1:6, .Label = c("24", "41",
"48",
"50", "52", "57"), class = "factor"), tmin = c(-1L, -3L, 0L,
0L, 3L, -2L), tmax = c(4L, 5L, 5L, 5L, 5L, 5L), cint = c(5L,
8L, 5L, 5L, 2L, 7L), wcmin = c(-5L, -8L, -4L, -4L, -3L, -6L),
wcmax = c(-2L, -3L, 0L, 0L, 1L, -1L), wsmin = c(20L, 15L,
30L, 30L, 20L, 25L), wsmax = c(25L, 20L, 35L, 35L, 25L, 30L
), gsmin = c(35L, 35L, 45L, 45L, 35L, 35L), gsmax = c(40L,
40L, 50L, 50L, 40L, 40L), wd = c(90L, 90L, 45L, 45L, 45L,
315L), rmin = c(11.8, 10, 7.3, 7.3, 6.7, 4.4), rmax = c(26.6,
23.5, 19, 19, 17.4, 13.8), cir = c(14.8, 13.5, 11.7, 11.7,
10.7, 9.4), lr = c(3L, 3L, 6L, 6L, 6L, 7L), lc = c(1L, 1L,
2L, 2L, 2L, 3L), wc = c(3L, 3L, 3L, 3L, 3L, 3L), li = c(TRUE,
TRUE, TRUE, TRUE, TRUE, FALSE), yd = c(1L, 1L, 1L, 1L, 1L,
1L), yr = c(2010L, 2010L, 2010L, 2010L, 2010L, 2010L), nF = c(2L,
8L, 0L, 0L, 0L, 0L), factdcx = structure(1:6, .Label = c("24",
"41", "48", "50", "52", "57"), class = "factor")), .Names = c("dc",
"tmin", "tmax", "cint", "wcmin", "wcmax", "wsmin", "wsmax", "gsmin",
"gsmax", "wd", "rmin", "rmax", "cir", "lr", "lc", "wc", "li",
"yd", "yr", "nF", "factdcx"), row.names = c("1:", "2:", "3:",
"4:", "5:", "6:"), class = "data.frame")

Merging output in R

max=aggregate(cbind(a$VALUE,Date=a$DATE) ~ format(a$DATE, "%m") + cut(a$CLASS, breaks=c(0,2,4,6,8,10,12,14)) , data = a, max)[-1]
max$DATE=as.Date(max$DATE, origin = "1970-01-01")
Sample Data :
DATE GRADE VALUE
2008-09-01 1 20
2008-09-02 2 30
2008-09-03 3 50
.
.
2008-09-30 2 75
.
.
2008-10-01 1 95
.
.
2008-11-01 4 90
.
.
2008-12-01 1 70
2008-12-02 2 40
2008-12-28 4 30
2008-12-29 1 40
2008-12-31 3 50
My Expected output according to above table for only first month is :
DATE GRADE VALUE
2008-09-30 (0,2] 75
2008-09-02 (2,4] 50
Output in my real data :
format(DATE, "%m")
1 09
2 10
3 11
4 12
5 09
6 10
7 11
cut(a$GRADE, breaks = c(0, 2, 4, 6, 8, 10, 12, 14)) value
1 (0,2] 0.30844444
2 (0,2] 1.00000000
3 (0,2] 1.00000000
4 (0,2] 0.73333333
5 (2,4] 0.16983488
6 (2,4] 0.09368000
7 (2,4] 0.10589335
Date
1 2008-09-30
2 2008-10-31
3 2008-11-28
4 2008-12-31
5 2008-09-30
6 2008-10-31
7 2008-11-28
The output is not according to the sample data , as the data is too big . A simple logic is that there are grades from 1 to 10 , so I want to find the highest value for a month in the corresponding grade groups . Eg : I need a highest value for each group (0,2],(0,4] etc
I used an aggregate condition with function max and two grouping it by two columns Date and Grade . Now when I run the code and display the value of max , I get 3 tables as output one after the other. Now I want to plot this output but i am not able to do that because of this .So how can i merge all these output ?
Try:
library(dplyr)
a %>%
group_by(MONTH=format(DATE, "%m"), GRADE=cut(GRADE, breaks=seq(0,14,by=2))) %>%
summarise_each(funs(max))
# MONTH GRADE DATE VALUE
#1 09 (0,2] 2008-09-30 75
#2 09 (2,4] 2008-09-03 50
#3 10 (0,2] 2008-10-01 95
#4 11 (2,4] 2008-11-01 90
#5 12 (0,2] 2008-12-29 70
#6 12 (2,4] 2008-12-31 50
Or using data.table
library(data.table)
setDT(a)[, list(DATE=max(DATE), VALUE=max(VALUE)),
by= list(MONTH=format(DATE, "%m"),
GRADE=cut(GRADE, breaks=seq(0,14, by=2)))]
# MONTH GRADE DATE VALUE
#1: 09 (0,2] 2008-09-30 75
#2: 09 (2,4] 2008-09-03 50
#3: 10 (0,2] 2008-10-01 95
#4: 11 (2,4] 2008-11-01 90
#5: 12 (0,2] 2008-12-29 70
#6: 12 (2,4] 2008-12-31 50
Or using aggregate
res <- transform(with(a,
aggregate(cbind(VALUE, DATE),
list(MONTH=format(DATE, "%m") ,GRADE=cut(GRADE, breaks=seq(0,14, by=2))), max)),
DATE=as.Date(DATE, origin="1970-01-01"))
res[order(res$MONTH),]
# MONTH GRADE VALUE DATE
#1 09 (0,2] 75 2008-09-30
#4 09 (2,4] 50 2008-09-03
#2 10 (0,2] 95 2008-10-01
#5 11 (2,4] 90 2008-11-01
#3 12 (0,2] 70 2008-12-29
#6 12 (2,4] 50 2008-12-31
data
a <- structure(list(DATE = structure(c(14123, 14124, 14125, 14152,
14153, 14184, 14214, 14215, 14241, 14242, 14244), class = "Date"),
GRADE = c(1L, 2L, 3L, 2L, 1L, 4L, 1L, 2L, 4L, 1L, 3L), VALUE = c(20L,
30L, 50L, 75L, 95L, 90L, 70L, 40L, 30L, 40L, 50L)), .Names = c("DATE",
"GRADE", "VALUE"), row.names = c(NA, -11L), class = "data.frame")
Update
If you want to include YEAR also in the grouping
library(dplyr)
a %>%
group_by(MONTH=format(DATE, "%m"), YEAR=format(DATE, "%Y"), GRADE=cut(GRADE, breaks=seq(0,14, by=2)))%>%
summarise_each(funs(max))
# MONTH YEAR GRADE DATE VALUE
#1 09 2008 (0,2] 2008-09-30 75
#2 09 2008 (2,4] 2008-09-03 50
#3 09 2009 (0,2] 2009-09-30 75
#4 09 2009 (2,4] 2009-09-03 50
#5 10 2008 (0,2] 2008-10-01 95
#6 10 2009 (0,2] 2009-10-01 95
#7 11 2008 (2,4] 2008-11-01 90
#8 11 2009 (2,4] 2009-11-01 90
#9 12 2008 (0,2] 2008-12-29 70
#10 12 2008 (2,4] 2008-12-31 50
#11 12 2009 (0,2] 2009-12-29 70
#12 12 2009 (2,4] 2009-12-31 50
data
a <- structure(list(DATE = structure(c(14123, 14124, 14125, 14152,
14153, 14184, 14214, 14215, 14241, 14242, 14244, 14488, 14489,
14490, 14517, 14518, 14549, 14579, 14580, 14606, 14607, 14609
), class = "Date"), GRADE = c(1L, 2L, 3L, 2L, 1L, 4L, 1L, 2L,
4L, 1L, 3L, 1L, 2L, 3L, 2L, 1L, 4L, 1L, 2L, 4L, 1L, 3L), VALUE = c(20L,
30L, 50L, 75L, 95L, 90L, 70L, 40L, 30L, 40L, 50L, 20L, 30L, 50L,
75L, 95L, 90L, 70L, 40L, 30L, 40L, 50L)), .Names = c("DATE",
"GRADE", "VALUE"), row.names = c("1", "2", "3", "4", "5", "6",
"7", "8", "9", "10", "11", "12", "21", "31", "41", "51", "61",
"71", "81", "91", "101", "111"), class = "data.frame")
Following code using base R may be helpful (using 'a' dataframe from akrun's answer):
xx = strsplit(as.character(a$DATE), '-')
a$month = sapply(strsplit(as.character(a$DATE), '-'),'[',2)
gradeCats = cut(a$GRADE, breaks = c(0, 2, 4, 6, 8, 10, 12, 14))
aggregate(VALUE~month+gradeCats, data= a, max)
month gradeCats VALUE
1 09 (0,2] 75
2 10 (0,2] 95
3 12 (0,2] 70
4 09 (2,4] 50
5 11 (2,4] 90
6 12 (2,4] 50

Resources