Dividing non-equal dataframes based on a group condition - r

I have two dataframes, with a similar strucure:
df_I <- structure(list(year = c("2006", "2006", "2006", "2006", "2006",
"2006", "2006", "2006", "2006"), code = c(0, 1110,
1120, 1130, 1220, 1230, 1310, 1320, 1330), `1` = c(1L,
8L, 2L, 2L, 0L, 2L, 0L, 1L, 0L), `2` = c(0L, 10L, 0L, 0L,
0L, 2L, 1L, 3L, 1L), `3` = c(4L, 2L, 1L, 2L, 0L, 4L,
0L, 0L, 3L), `4` = c(4L, 6L, 0L, 3L, 1L, 3L, 0L, 0L, 3L),
totaal = c(11, 26, 3, 7, 1, 9, 7, 7, 6)), row.names = c(NA,
-9L), class = c("tbl_df", "tbl", "data.frame"))
# A tibble: 9 × 7
year code `1` `2` `3` `4` totaal
<chr> <dbl> <int> <int> <int> <int> <dbl>
1 2006 0 1 0 4 4 11
2 2006 1110 8 10 2 6 26
3 2006 1120 2 0 1 0 3
4 2006 1130 2 0 2 3 7
5 2006 1220 0 0 0 1 1
6 2006 1230 2 2 4 3 9
7 2006 1310 0 1 0 0 7
8 2006 1320 1 3 0 0 7
9 2006 1330 0 1 3 3 6
df_II <- structure(list(year = c("2006", "2006", "2006", "2006", "2006",
"2006", "2006", "2006", "2006", "2006"), code = c(0, 1110,
1120, 1130, 1210, 1220, 1230, 1310, 1320, 1330), `1` = c(15806L,
655L, 105L, 328L, 138L, 452L, 445L, 471L, 672L, 615L), `2` = c(9681L,
337L, 68L, 215L, 97L, 357L, 366L, 245L, 440L, 360L), `3` = c(10457L,
221L, 40L, 123L, 65L, 325L, 322L, 151L, 352L, 332L), `4` = c(7109L,
128L, 5L, 64L, 56L, 256L, 240L, 83L, 274L, 192L), totaal = c(43053,
1341, 218, 730, 356, 1390, 1373, 950, 1738, 1499)), row.names = c(NA,
-10L), class = c("tbl_df", "tbl", "data.frame"))
# A tibble: 10 × 7
year code `1` `2` `3` `4` totaal
<chr> <dbl> <int> <int> <int> <int> <dbl>
1 2006 0 15806 9681 10457 7109 43053
2 2006 1110 655 337 221 128 1341
3 2006 1120 105 68 40 5 218
4 2006 1130 328 215 123 64 730
5 2006 1210 138 97 65 56 356
6 2006 1220 452 357 325 256 1390
7 2006 1230 445 366 322 240 1373
8 2006 1310 471 245 151 83 950
9 2006 1320 672 440 352 274 1738
10 2006 1330 615 360 332 192 1499
I would like to create a new data.frame df_out, which divides df_I by df_II, for columns 1,2,3,4, totaal by year and code. The issue is that not every code is available for each year.
What is the best way to divide this unequal dataframe?
Desired outcome:
# A tibble: 10 × 7
year code `1` `2` `3` `4` totaal
<chr> <dbl> <int> <int> <int> <int> <dbl>
1 2006 0 1 /15806 0/9681 4/10457 4/7109 11/43053

You could subset the second data frame using %in%, assuming both code columns are properly ordered.
cols <- as.character(1:4)
cbind(df_I[setdiff(names(df_I), cols)], df_I[cols] / subset(df_II, code %in% df_I$code, cols))
# year code totaal 1 2 3 4
# 1 2006 0 11 6.326711e-05 0.000000000 0.0003825189 0.000562667
# 2 2006 1110 26 1.221374e-02 0.029673591 0.0090497738 0.046875000
# 3 2006 1120 3 1.904762e-02 0.000000000 0.0250000000 0.000000000
# 4 2006 1130 7 6.097561e-03 0.000000000 0.0162601626 0.046875000
# 5 2006 1220 1 0.000000e+00 0.000000000 0.0000000000 0.003906250
# 6 2006 1230 9 4.494382e-03 0.005464481 0.0124223602 0.012500000
# 7 2006 1310 7 0.000000e+00 0.004081633 0.0000000000 0.000000000
# 8 2006 1320 7 1.488095e-03 0.006818182 0.0000000000 0.000000000
# 9 2006 1330 6 0.000000e+00 0.002777778 0.0090361446 0.015625000

You could use complete to make the number of rows between the two data frames equal, and then do the division:
library(tidyr)
df_I %<>%
complete(code = df_II$code) %>%
fill(year) %>%
replace(is.na(.), 0)
cbind(df_I[c(1, 2)], df_I[-c(1, 2)] / df_II[-c(1, 2)])
code year `1` `2` `3` `4` totaal
<dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0 2006 0.0000633 0 0.000383 0.000563 0.000255
2 1110 2006 0.0122 0.0297 0.00905 0.0469 0.0194
3 1120 2006 0.0190 0 0.025 0 0.0138
4 1130 2006 0.00610 0 0.0163 0.0469 0.00959
5 1210 2006 0 0 0 0 0
6 1220 2006 0 0 0 0.00391 0.000719
7 1230 2006 0.00449 0.00546 0.0124 0.0125 0.00655
8 1310 2006 0 0.00408 0 0 0.00737
9 1320 2006 0.00149 0.00682 0 0 0.00403
10 1330 2006 0 0.00278 0.00904 0.0156 0.00400

Related

use pivot_longer to created multiple value columns

Can you specify multiple value columns in pivot_longer()?
My original data (posted below) I had to transpose to be in a wider format. Then I want to take this new transposed data and return it to the original format (lets assume I did some transformations/ and can't use the original data). To re-transpose back into a longer format I have to use both pivot_longer() then pivot_wider() because there are multiple values I want to be their own columns.
I would like to avoid the pivot_wider() and just use pivot_longer() when re-transposing the data back if possible.
As a side note the unique identifier for each row is the combination of id and report.
Code
dfa <- dfx %>%
pivot_wider(
id_cols = id,
names_from = report,
values_from = c(pts,
p1, p2, p3,p4,p5,
d1,d2,d3,d4,d5)
)
df_return <- dfa %>%
pivot_longer(cols = !id,
names_to = c('vars','report'),
names_pattern = "([a-z0-9]+)_(.*)",
values_drop_na = TRUE) %>%
pivot_wider(id_cols = c(id, report),
names_from = vars,
values_from = value)
Data
structure(list(pts = c(431L, 167L, 167L, 760L, 348L, 768L, 619L,
169L, 416L, 155L, 47L, 37L, 6L, 17L, 22L, 1L, 149L, 3L, 284L,
7L), d1 = c(129L, 48L, 52L, 166L, 90L, 178L, 184L, 20L, 158L,
42L, 3L, 15L, 2L, 7L, 9L, 0L, 54L, 1L, 69L, 6L), d2 = c(172L,
67L, 64L, 257L, 132L, 255L, 261L, 30L, 201L, 61L, 9L, 20L, 2L,
9L, 12L, 0L, 69L, 1L, 123L, 6L), d3 = c(205L, 77L, 73L, 312L,
153L, 307L, 310L, 39L, 235L, 70L, 12L, 21L, 2L, 10L, 12L, 0L,
77L, 2L, 139L, 6L), d4 = c(227L, 81L, 82L, 363L, 177L, 350L,
342L, 52L, 257L, 75L, 15L, 24L, 2L, 12L, 13L, 0L, 86L, 2L, 151L,
6L), d5 = c(248L, 88L, 92L, 414L, 192L, 387L, 374L, 66L, 279L,
86L, 16L, 26L, 2L, 12L, 15L, 0L, 90L, 3L, 164L, 7L), report = c("2006",
"2006", "2006", "2006", "2006", "2006", "2006", "2006", "2006",
"2006", "2006", "2006", "2006", "2006", "2006", "2006", "2006",
"2006", "2006", "2006"), p1 = c(1.0360364394094, 1.22979866735429,
1.21423740998677, 0.87891144382145, 0.810310827130179, 0.965901663505148,
1.02621739486337, 0.69319116444678, 1.18938130906092, 1.04220816515009,
0.683545688193799, 1.05179228560845, 1.51468104603873, 1.15200888955888,
0.948041330809858, 0, 1.23227405154205, 3.11155226007598, 0.908056299174703,
1.57712371536702), p2 = c(0.986884800185635, 1.23066225499351,
1.07336930339221, 0.966734485786667, 0.87421381769247, 0.974775549615439,
1.06274655160121, 0.705150638862953, 1.12934487417415, 1.10234720984265,
1.11084642794988, 1.06558505521222, 1.0197697665798, 1.15605466288868,
1.01469386643771, 0, 1.17689541437029, 1.42783711234222, 1.16124019281912,
1.27756288696848), p3 = c(0.993575954694177, 1.17968893104311,
1.02608313159672, 0.965200422661265, 0.862910478266102, 0.976436243011877,
1.06679768502287, 0.722966824498357, 1.12591016481614, 1.05867627021151,
1.11227024088529, 0.98275117259764, 0.803738347803303, 1.09341228936369,
0.878291424560146, 0, 1.10500006213832, 1.93128861370172, 1.0949534752299,
1.14755029569502), p4 = c(0.986244633210798, 1.08520792731261,
1.01128789684232, 0.977245321880205, 0.89785754450165, 0.981536130349165,
1.04454959427709, 0.807825580390444, 1.1035817255901, 1.00192975678877,
1.14371311954082, 1.02812279984398, 0.66742040677939, 1.15526702119886,
0.878479047328667, 0, 1.10559111180852, 1.4717526513624, 1.05479137550321,
1.07005088091939), p5 = c(0.992583778223324, 1.06016737802091,
1.02253158347207, 1.00026491073882, 0.896290873874826, 0.985549150023704,
1.04187931404895, 0.886647217836043, 1.09837506943384, 1.0323002052873,
1.05833769015682, 1.05042831618603, 0.592515872759586, 1.05106420250504,
0.961672664191663, 0, 1.05868657273466, 1.81304485775152, 1.04168095802127,
1.19437925124365), id = c("ID 1", "ID 2", "ID 3", "ID 4", "ID 5",
"ID 6", "ID 7", "ID 8", "ID 9", "ID 10", "ID 11", "ID 12", "ID 13",
"ID 14", "ID 15", "ID 16", "ID 17", "ID 18", "ID 19", "ID 20"
)), row.names = c(NA, 20L), class = "data.frame")
We may need the .value in the names_to, which selects the prefix part of the column name before the _ as the column value and the 'report' will return the suffix column name
library(tidyr)
pivot_longer(dfa, cols = -id, names_to = c(".value", "report"),
names_sep = "_")
-output
# A tibble: 20 × 13
id report pts p1 p2 p3 p4 p5 d1 d2 d3 d4 d5
<chr> <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <int> <int> <int> <int>
1 ID 1 2006 431 1.04 0.987 0.994 0.986 0.993 129 172 205 227 248
2 ID 2 2006 167 1.23 1.23 1.18 1.09 1.06 48 67 77 81 88
3 ID 3 2006 167 1.21 1.07 1.03 1.01 1.02 52 64 73 82 92
4 ID 4 2006 760 0.879 0.967 0.965 0.977 1.00 166 257 312 363 414
5 ID 5 2006 348 0.810 0.874 0.863 0.898 0.896 90 132 153 177 192
6 ID 6 2006 768 0.966 0.975 0.976 0.982 0.986 178 255 307 350 387
7 ID 7 2006 619 1.03 1.06 1.07 1.04 1.04 184 261 310 342 374
8 ID 8 2006 169 0.693 0.705 0.723 0.808 0.887 20 30 39 52 66
9 ID 9 2006 416 1.19 1.13 1.13 1.10 1.10 158 201 235 257 279
10 ID 10 2006 155 1.04 1.10 1.06 1.00 1.03 42 61 70 75 86
11 ID 11 2006 47 0.684 1.11 1.11 1.14 1.06 3 9 12 15 16
12 ID 12 2006 37 1.05 1.07 0.983 1.03 1.05 15 20 21 24 26
13 ID 13 2006 6 1.51 1.02 0.804 0.667 0.593 2 2 2 2 2
14 ID 14 2006 17 1.15 1.16 1.09 1.16 1.05 7 9 10 12 12
15 ID 15 2006 22 0.948 1.01 0.878 0.878 0.962 9 12 12 13 15
16 ID 16 2006 1 0 0 0 0 0 0 0 0 0 0
17 ID 17 2006 149 1.23 1.18 1.11 1.11 1.06 54 69 77 86 90
18 ID 18 2006 3 3.11 1.43 1.93 1.47 1.81 1 1 2 2 3
19 ID 19 2006 284 0.908 1.16 1.09 1.05 1.04 69 123 139 151 164
20 ID 20 2006 7 1.58 1.28 1.15 1.07 1.19 6 6 6 6 7

Need to fill data frame with missing years and value in 2nd column

I have data frames with counts from a series of years, 1970-2020, generated by a subset command from a larger data set, i.e. resulting in two columns "Year" and "Count":
Year Count
1987 8
1989 1
1991 1
1992 4
1995 11
1996 3
1997 7
.
.
.
2019 2
2020 5
There are missing years where Count=0, and I need a procedure to fill these df's with the missing years and Count=0. I have this script that I can't get to work:
library(tidyr)
aug <- subset(mainframe, month==8)
complete(aug, year = 1987:2020, fill = list(Count = 0))
Here's a sample dataframe 'aug':
dput(aug)
structure(list(month = structure(c(8L, 8L, 8L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L), .Label = c("1", "2", "3", "4", "5", "6",
"7", "8", "9", "10", "11", "12"), class = "factor"), year = structure(1:28, .Label = c("1987",
"1988", "1989", "1990", "1991", "1992", "1993", "1994", "1995",
"1996", "1998", "2000", "2001", "2002", "2003", "2004", "2005",
"2006", "2007", "2008", "2009", "2010", "2011", "2013", "2015",
"2016", "2018", "2020"), class = "factor"), Count = c(4L, 0L,
3L, 3L, 0L, 0L, 1L, 0L, 1L, 1L, 3L, 0L, 0L, 0L, 0L, 2L, 0L, 0L,
0L, 2L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 1L)), row.names = c(8L, 20L,
32L, 44L, 56L, 68L, 80L, 92L, 104L, 116L, 128L, 140L, 152L, 164L,
176L, 188L, 200L, 212L, 224L, 236L, 248L, 260L, 272L, 284L, 296L,
308L, 320L, 332L), class = "data.frame")
If I get your question correctly, you want to have a complete dataframe containing year 1987 to 2020, but there are some missing years in your aug dataframe, and you want to fill in the missing years with month = "8" and Count = 0.
Here's a tidyverse approach (in your original aug dataframe, your year is factor, so at the end of my solution I also transformed it into factor):
Your dataset
month year Count
8 8 1987 4
20 8 1988 0
32 8 1989 3
44 8 1990 3
56 8 1991 0
68 8 1992 0
80 8 1993 1
92 8 1994 0
104 8 1995 1
116 8 1996 1
128 8 1998 3
140 8 2000 0
152 8 2001 0
164 8 2002 0
176 8 2003 0
188 8 2004 2
200 8 2005 0
212 8 2006 0
224 8 2007 0
236 8 2008 2
248 8 2009 0
260 8 2010 1
272 8 2011 1
284 8 2013 0
296 8 2015 0
308 8 2016 1
320 8 2018 0
332 8 2020 1
Solution
library(tidyverse)
aug %>% mutate(year = as.numeric(as.character(year))) %>%
complete(year = first(year):max(year), # or year = 1987:2020
fill = list(month = "8", Count = 0)) %>%
mutate(year = as.factor(year))
Output
year month Count
1987 8 4
1988 8 0
1989 8 3
1990 8 3
1991 8 0
1992 8 0
1993 8 1
1994 8 0
1995 8 1
1996 8 1
1997 8 0
1998 8 3
1999 8 0
2000 8 0
2001 8 0
2002 8 0
2003 8 0
2004 8 2
2005 8 0
2006 8 0
2007 8 0
2008 8 2
2009 8 0
2010 8 1
2011 8 1
2012 8 0
2013 8 0
2014 8 0
2015 8 0
2016 8 1
2017 8 0
2018 8 0
2019 8 0
2020 8 1

Recorder Dataset in R based on Country

structure(list(Date = c("KW 52 / 2016", "KW 1 / 2017", "KW 2 / 2017",
"KW 3 / 2017"), Sales_AT = c(150L, 169L, 143L, 170L), Sales_CH = c(150L,
169L, 143L, 170L), Sales_GER = c(150L, 169L, 143L, 170L), Sales_HUN = c(134L,
139L, NA, 125L), Sales_JP = c(134L, NA, 142L, 125L), Sales_POL = c(127L,
175L, 150L, 141L), Sales_SWE = c(125L, NA, 159L, 131L), Sales_USA = c(169L,
159L, NA, 132L), difference_AT = c(NA, 19L, -26L, 27L), difference_CH = c(NA,
19L, -26L, 27L), difference_GER = c(NA, 19L, -26L, 27L), difference_HUN = c(NA,
5L, NA, -14L), difference_JP = c(NA, NA, 8L, -17L), difference_POL = c(NA,
48L, -25L, -9L), difference_SWE = c(NA, NA, 34L, -28L), difference_USA = c(NA,
-10L, NA, -27L)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-4L))
This is my dataset which looks like this:
A tibble: 4 x 17
Date Sales_AT Sales_CH Sales_GER Sales_HUN Sales_JP Sales_POL Sales_SWE Sales_USA difference_AT difference_CH difference_GER difference_HUN difference_JP difference_POL difference_SWE difference_USA
<chr> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
1 KW 52 / 2016 150 150 150 134 134 127 125 169 NA NA NA NA NA NA NA NA
2 KW 1 / 2017 169 169 169 139 NA 175 NA 159 19 19 19 5 NA 48 NA -10
3 KW 2 / 2017 143 143 143 NA 142 150 159 NA -26 -26 -26 NA 8 -25 34 NA
4 KW 3 / 2017 170 170 170 125 125 141 131 132 27 27 27 -14 -17 -9 -28 -27
I want to reorder the dataset to have the sales and difference column of each country next to each other.
I´m look for a dplyr solution which works like this, but in a dynamic way:
wide_result %>%
select(contains("AT"), contains("CH"), contains("HUN"), contains("JP"), contains("USA"))
Can anyone help me?
Using base R:
df[c(1, order(sub(".*_", "", names(df)[-1])) + 1)]
Here's a way we can do it. Basically, we put the names of the data into a tibble, extract the part of the name after the _ (when possible), and then sort by that extracted text.
names_sort <- tibble(nn = names(dat)) %>%
filter(nn != "Date") %>% # remove Date column, since we'll select that first
# replace everything before and up to _ with ""
mutate(names_fix = gsub(".*_", "", nn)) %>%
arrange(names_fix) %>%
pull(nn)
dat %>%
select(Date, names_sort)
# Date Sales_AT difference_AT Sales_CH difference_CH
# <chr> <int> <int> <int> <int>
# 1 KW 52 / 2016 150 NA 150 NA
# 2 KW 1 / 2017 169 19 169 19
# 3 KW 2 / 2017 143 -26 143 -26
# 4 KW 3 / 2017 170 27 170 27
You can use dplyr select_at:
vars <- c("CH", "AT")
df %>%
select_at(vars(one_of("Date",
paste0("Sales_", vars),
paste0("difference_", vars))))
# A tibble: 4 x 5
Date Sales_CH Sales_AT difference_CH difference_AT
<chr> <int> <int> <int> <int>
1 KW 52 / 2016 150 150 NA NA
2 KW 1 / 2017 169 169 19 19
3 KW 2 / 2017 143 143 -26 -26
4 KW 3 / 2017 170 170 27 27

filling in missing data using fitted value in R

I have a dataframe like this:
ID year age wage
1 2 1981 22 10000
2 2 1982 23 11000
3 2 1983 24 11500
4 2 1984 25 11000
5 2 1985 26 14000
6 2 1986 27 16000
7 2 1987 28 20000
8 2 1988 29 19000
9 2 1989 30 20000
10 2 1990 31 20000
11 2 1991 32 22000
12 2 1992 33 25000
13 2 1993 34 0
14 2 1994 35 NA
15 2 1995 36 0
16 2 1996 37 NA
17 2 1997 38 0
18 2 1998 39 NA
19 2 1999 40 0
20 2 2000 41 NA
21 2 2001 42 0
22 2 2002 43 NA
23 2 2003 44 0
24 2 2004 45 NA
25 2 2005 46 5500
26 2 2006 47 NA
27 2 2007 48 5000
28 2 2008 49 NA
29 2 2009 50 6000
30 2 2010 51 NA
31 2 2011 52 19000
32 2 2012 53 NA
33 2 2013 54 21000
34 2 2014 55 NA
35 2 2015 56 23000
36 3 1984 22 1300
37 3 1985 23 0
38 3 1986 24 1500
39 3 1987 25 1000
40 3 1988 26 0
I want to use an individual-specific regression of wage on age and age-squared to impute missing wage observations. I want to only impute when at least 5 non-missing observations are available.
As suggested by jay.sf, I tried the following but with fitted values:
df_imp <- do.call(rbind,
by(df, df$ID, function(x) {
IDs <- which(is.na(x$wage))
if (length(x$wage[- IDs]) >= 5) {
b <- lm(wage ~ poly(age, 2, raw=TRUE), x)$fitted.values
x$wage[IDs] <- with(x, b)[IDs]
}
return(x)
}))
I got the following results:
ID year age wage
36 2 1981 22 10000.000
37 2 1982 23 11000.000
38 2 1983 24 11500.000
39 2 1984 25 11000.000
40 2 1985 26 14000.000
41 2 1986 27 16000.000
42 2 1987 28 20000.000
43 2 1988 29 19000.000
44 2 1989 30 20000.000
45 2 1990 31 20000.000
46 2 1991 32 22000.000
47 2 1992 33 25000.000
48 2 1993 34 0.000
49 2 1994 35 7291.777
50 2 1995 36 0.000
51 2 1996 37 6779.133
52 2 1997 38 0.000
53 2 1998 39 7591.597
54 2 1999 40 0.000
55 2 2000 41 9729.168
56 2 2001 42 0.000
57 2 2002 43 13191.847
58 2 2003 44 0.000
59 2 2004 45 17979.633
60 2 2005 46 5500.000
61 2 2006 47 NA
62 2 2007 48 5000.000
63 2 2008 49 NA
64 2 2009 50 6000.000
65 2 2010 51 NA
66 2 2011 52 19000.000
67 2 2012 53 NA
68 2 2013 54 21000.000
69 2 2014 55 NA
70 2 2015 56 23000.000
You could use a simple if statement, without an else. Define an ID vector IDs that identifies missings, which you use to count them and to subset your Y column wage.
For this you can use by(), which splits your data similar to split() but you may apply a function; just rbind the result.
It's probably wiser to rather use the coefficients than the fitted values, because the latter also would be NA if your Y are NA. And you need to use raw=TRUE in the poly.
DF.imp <- do.call(rbind,
by(DF, DF$ID, function(x) {
IDs <- which(is.na(x$wage))
if (length(x$wage[- IDs]) >= 5) {
b <- lm(wage ~ poly(age, 2, raw=TRUE), x)$coefficients
x$wage[IDs] <- with(x, (b[1] + b[2]*age + b[3]*age^2))[IDs]
}
return(x)
}))
Note that I've slightly changed your example data, so that ID 3 also has missings, but less than 5 non-missings.
Result
DF.imp
# ID year age wage
# 2.1 2 1981 22 10000.000
# 2.2 2 1982 23 11000.000
# 2.3 2 1983 24 11500.000
# 2.4 2 1984 25 11000.000
# 2.5 2 1985 26 14000.000
# 2.6 2 1986 27 16000.000
# 2.7 2 1987 28 20000.000
# 2.8 2 1988 29 19000.000
# 2.9 2 1989 30 20000.000
# 2.10 2 1990 31 20000.000
# 2.11 2 1991 32 22000.000
# 2.12 2 1992 33 25000.000
# 2.13 2 1993 34 0.000
# 2.14 2 1994 35 7626.986
# 2.15 2 1995 36 0.000
# 2.16 2 1996 37 7039.387
# 2.17 2 1997 38 0.000
# 2.18 2 1998 39 6783.065
# 2.19 2 1999 40 0.000
# 2.20 2 2000 41 6858.020
# 2.21 2 2001 42 0.000
# 2.22 2 2002 43 7264.252
# 2.23 2 2003 44 0.000
# 2.24 2 2004 45 8001.761
# 2.25 2 2005 46 5500.000
# 2.26 2 2006 47 9070.546
# 2.27 2 2007 48 5000.000
# 2.28 2 2008 49 10470.609
# 2.29 2 2009 50 6000.000
# 2.30 2 2010 51 12201.948
# 2.31 2 2011 52 19000.000
# 2.32 2 2012 53 14264.565
# 2.33 2 2013 54 21000.000
# 2.34 2 2014 55 16658.458
# 2.35 2 2015 56 23000.000
# 3.36 3 1984 22 1300.000
# 3.37 3 1985 23 NA
# 3.38 3 1986 24 1500.000
# 3.39 3 1987 25 1000.000
# 3.40 3 1988 26 NA
Data
DF <- structure(list(ID = c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L), year = c(1981L,
1982L, 1983L, 1984L, 1985L, 1986L, 1987L, 1988L, 1989L, 1990L,
1991L, 1992L, 1993L, 1994L, 1995L, 1996L, 1997L, 1998L, 1999L,
2000L, 2001L, 2002L, 2003L, 2004L, 2005L, 2006L, 2007L, 2008L,
2009L, 2010L, 2011L, 2012L, 2013L, 2014L, 2015L, 1984L, 1985L,
1986L, 1987L, 1988L), age = c(22L, 23L, 24L, 25L, 26L, 27L, 28L,
29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L,
42L, 43L, 44L, 45L, 46L, 47L, 48L, 49L, 50L, 51L, 52L, 53L, 54L,
55L, 56L, 22L, 23L, 24L, 25L, 26L), wage = c(10000L, 11000L,
11500L, 11000L, 14000L, 16000L, 20000L, 19000L, 20000L, 20000L,
22000L, 25000L, 0L, NA, 0L, NA, 0L, NA, 0L, NA, 0L, NA, 0L, NA,
5500L, NA, 5000L, NA, 6000L, NA, 19000L, NA, 21000L, NA, 23000L,
1300L, NA, 1500L, 1000L, NA)), row.names = c(NA, -40L), class = "data.frame")

R programming - data frame manoevur

Suppose I have the following dataframe:
dc tmin tmax cint wcmin wcmax wsmin wsmax gsmin gsmax wd rmin rmax cir lr
1: 24 -1 4 5 -5 -2 20 25 35 40 90 11.8 26.6 14.8 3
2: 41 -3 5 8 -8 -3 15 20 35 40 90 10.0 23.5 13.5 3
3: 48 0 5 5 -4 0 30 35 45 50 45 7.3 19.0 11.7 6
4: 50 0 5 5 -4 0 30 35 45 50 45 7.3 19.0 11.7 6
5: 52 3 5 2 -3 1 20 25 35 40 45 6.7 17.4 10.7 6
6: 57 -2 5 7 -6 -1 25 30 35 40 315 4.4 13.8 9.4 7
lc wc li yd yr nF factdcx
1: 1 3 TRUE 1 2010 2 24
2: 1 3 TRUE 1 2010 8 41
3: 2 3 TRUE 1 2010 0 48
4: 2 3 TRUE 1 2010 0 50
5: 2 3 TRUE 1 2010 0 52
6: 3 3 FALSE 1 2010 0 57
I'd like to turn it into a new dataframe like the following:
dc tmin tmax cint wcmin wcmax wsmin wsmax gsmin gsmax wd rmin rmax cir lr
1: 24 -1 4 5 -5 -2 20 25 35 40 90 11.8 26.6 14.8 3
2: 41 -3 5 8 -8 -3 15 20 35 40 90 10.0 23.5 13.5 3
3: 48 0 5 5 -4 0 30 35 45 50 45 7.3 19.0 11.7 6
4: 52 3 5 2 -3 1 20 25 35 40 45 6.7 17.4 10.7 6
5: 57 -2 5 7 -6 -1 25 30 35 40 315 4.4 13.8 9.4 7
lc wc li yd yr nF factdcx
1: 1 3 TRUE 1 2010 2 24
2: 1 3 TRUE 1 2010 8 41
3: 2 3 TRUE 1 2010 0 (sum of nF for 48 and 50, factdcx) 48
4: 2 3 TRUE 1 2010 0 52
5: 3 3 FALSE 1 2010 0 57
How can I do it? (Surely, the dataframe, abc, is much larger, but I want the sum of all categories of 48 and 50 and group it into a new category, say '48').
Many thanks!
> dput(head(abc1))
structure(list(dc = c(24L, 41L, 48L, 50L, 52L, 57L), tmin = c(-1L,
-3L, 0L, 0L, 3L, -2L), tmax = c(4L, 5L, 5L, 5L, 5L, 5L), cint = c(5L,
8L, 5L, 5L, 2L, 7L), wcmin = c(-5L, -8L, -4L, -4L, -3L, -6L),
wcmax = c(-2L, -3L, 0L, 0L, 1L, -1L), wsmin = c(20L, 15L,
30L, 30L, 20L, 25L), wsmax = c(25L, 20L, 35L, 35L, 25L, 30L
), gsmin = c(35L, 35L, 45L, 45L, 35L, 35L), gsmax = c(40L,
40L, 50L, 50L, 40L, 40L), wd = c(90L, 90L, 45L, 45L, 45L,
315L), rmin = c(11.8, 10, 7.3, 7.3, 6.7, 4.4), rmax = c(26.6,
23.5, 19, 19, 17.4, 13.8), cir = c(14.8, 13.5, 11.7, 11.7,
10.7, 9.4), lr = c(3L, 3L, 6L, 6L, 6L, 7L), lc = c(1L, 1L,
2L, 2L, 2L, 3L), wc = c(3L, 3L, 3L, 3L, 3L, 3L), li = c(TRUE,
TRUE, TRUE, TRUE, TRUE, FALSE), yd = c(1L, 1L, 1L, 1L, 1L,
1L), yr = c(2010L, 2010L, 2010L, 2010L, 2010L, 2010L), nF = c(2L,
8L, 0L, 0L, 0L, 0L), factdcx = structure(1:6, .Label = c("24",
"41", "48", "50", "52", "57", "70"), class = "factor")), .Names = c("dc",
"tmin", "tmax", "cint", "wcmin", "wcmax", "wsmin", "wsmax", "gsmin",
"gsmax", "wd", "rmin", "rmax", "cir", "lr", "lc", "wc", "li",
"yd", "yr", "nF", "factdcx"), class = c("data.table", "data.frame"
), row.names = c(NA, -6L), .internal.selfref = <pointer: 0x054b24a0>)
Still got a problem, sir/madam:
> head(abc1 (updated))
dc tmin tmax cint wcmin wcmax wsmin wsmax gsmin gsmax wd rmin rmax cir lr
1: 24 -1 4 5 -5 -2 20 25 35 40 90 11.8 26.6 14.8 3
2: 41 -3 5 8 -8 -3 15 20 35 40 90 10.0 23.5 13.5 3
3: 48 0 5 5 -4 0 30 35 45 50 45 7.3 19.0 11.7 6
4: 52 3 5 2 -3 1 20 25 35 40 45 6.7 17.4 10.7 6
5: 57 -2 5 7 -6 -1 25 30 35 40 315 4.4 13.8 9.4 7
6: 70 -2 3 5 -4 -1 20 25 30 35 360 3.6 10.2 6.6 7
lc wc li yd yr nF factdcx
1: 1 3 TRUE 1 2010 2 24
2: 1 3 TRUE 1 2010 8 41
3: 2 3 TRUE 1 2010 57 48
4: 2 3 TRUE 1 2010 0 52
5: 3 3 FALSE 1 2010 0 57
6: 3 2 TRUE 1 2010 1 70
The sum of nF was incorrect, it should be zero.
Try
library(data.table)
unique(setDT(df1)[, factdcx:= as.character(factdcx)][factdcx %chin%
c('48','50'), c('dc', 'factdcx', 'nF') := list('48', '48', sum(nF))])
# dc tmin tmax cint wcmin wcmax wsmin wsmax gsmin gsmax wd rmin rmax cir lr
#1: 24 -1 4 5 -5 -2 20 25 35 40 90 11.8 26.6 14.8 3
#2: 41 -3 5 8 -8 -3 15 20 35 40 90 10.0 23.5 13.5 3
#3: 48 0 5 5 -4 0 30 35 45 50 45 7.3 19.0 11.7 6
#4: 52 3 5 2 -3 1 20 25 35 40 45 6.7 17.4 10.7 6
#5: 57 -2 5 7 -6 -1 25 30 35 40 315 4.4 13.8 9.4 7
# lc wc li yd yr nF factdcx
#1: 1 3 TRUE 1 2010 2 24
#2: 1 3 TRUE 1 2010 8 41
#3: 2 3 TRUE 1 2010 0 48
#4: 2 3 TRUE 1 2010 0 52
#5: 3 3 FALSE 1 2010 0 57
For abc1,
res1 <- unique(setDT(abc1)[, factdcx:= as.character(factdcx)][factdcx %chin%
c('48','50'), c('dc', 'factdcx', 'nF') := list(48, '48', sum(nF))])
res1
# dc tmin tmax cint wcmin wcmax wsmin wsmax gsmin gsmax wd rmin rmax cir lr
#1: 24 -1 4 5 -5 -2 20 25 35 40 90 11.8 26.6 14.8 3
#2: 41 -3 5 8 -8 -3 15 20 35 40 90 10.0 23.5 13.5 3
#3: 48 0 5 5 -4 0 30 35 45 50 45 7.3 19.0 11.7 6
#4: 52 3 5 2 -3 1 20 25 35 40 45 6.7 17.4 10.7 6
#5: 57 -2 5 7 -6 -1 25 30 35 40 315 4.4 13.8 9.4 7
# lc wc li yd yr nF factdcx
#1: 1 3 TRUE 1 2010 2 24
#2: 1 3 TRUE 1 2010 8 41
#3: 2 3 TRUE 1 2010 0 48
#4: 2 3 TRUE 1 2010 0 52
#5: 3 3 FALSE 1 2010 0 57
data
df1 <- structure(list(dc = structure(1:6, .Label = c("24", "41",
"48",
"50", "52", "57"), class = "factor"), tmin = c(-1L, -3L, 0L,
0L, 3L, -2L), tmax = c(4L, 5L, 5L, 5L, 5L, 5L), cint = c(5L,
8L, 5L, 5L, 2L, 7L), wcmin = c(-5L, -8L, -4L, -4L, -3L, -6L),
wcmax = c(-2L, -3L, 0L, 0L, 1L, -1L), wsmin = c(20L, 15L,
30L, 30L, 20L, 25L), wsmax = c(25L, 20L, 35L, 35L, 25L, 30L
), gsmin = c(35L, 35L, 45L, 45L, 35L, 35L), gsmax = c(40L,
40L, 50L, 50L, 40L, 40L), wd = c(90L, 90L, 45L, 45L, 45L,
315L), rmin = c(11.8, 10, 7.3, 7.3, 6.7, 4.4), rmax = c(26.6,
23.5, 19, 19, 17.4, 13.8), cir = c(14.8, 13.5, 11.7, 11.7,
10.7, 9.4), lr = c(3L, 3L, 6L, 6L, 6L, 7L), lc = c(1L, 1L,
2L, 2L, 2L, 3L), wc = c(3L, 3L, 3L, 3L, 3L, 3L), li = c(TRUE,
TRUE, TRUE, TRUE, TRUE, FALSE), yd = c(1L, 1L, 1L, 1L, 1L,
1L), yr = c(2010L, 2010L, 2010L, 2010L, 2010L, 2010L), nF = c(2L,
8L, 0L, 0L, 0L, 0L), factdcx = structure(1:6, .Label = c("24",
"41", "48", "50", "52", "57"), class = "factor")), .Names = c("dc",
"tmin", "tmax", "cint", "wcmin", "wcmax", "wsmin", "wsmax", "gsmin",
"gsmax", "wd", "rmin", "rmax", "cir", "lr", "lc", "wc", "li",
"yd", "yr", "nF", "factdcx"), row.names = c("1:", "2:", "3:",
"4:", "5:", "6:"), class = "data.frame")

Resources