Back-transform dummy columns into one variable in R

Example:
my_diamonds <- diamonds %>% fastDummies::dummy_cols(select_columns = "color", remove_selected_columns = T)
my_diamonds %>% glimpse
Looks like this:
Observations: 53,940
Variables: 16
$ carat <dbl> 0.23, 0.21, 0.23, 0.29, 0.31, 0.24, 0.24, 0.26, 0.22, 0.23, 0.30, 0.23, 0.22, 0.31, 0.20, 0.32, 0.30, 0.30, 0.30, 0.30, 0.30, 0.23, 0.2…
$ cut <ord> Ideal, Premium, Good, Premium, Good, Very Good, Very Good, Very Good, Fair, Very Good, Good, Ideal, Premium, Ideal, Premium, Premium, I…
$ clarity <ord> SI2, SI1, VS1, VS2, SI2, VVS2, VVS1, SI1, VS2, VS1, SI1, VS1, SI1, SI2, SI2, I1, SI2, SI1, SI1, SI1, SI2, VS2, VS1, SI1, SI1, VVS2, VS1…
$ depth <dbl> 61.5, 59.8, 56.9, 62.4, 63.3, 62.8, 62.3, 61.9, 65.1, 59.4, 64.0, 62.8, 60.4, 62.2, 60.2, 60.9, 62.0, 63.4, 63.8, 62.7, 63.3, 63.8, 61.…
$ table <dbl> 55, 61, 65, 58, 58, 57, 57, 55, 61, 61, 55, 56, 61, 54, 62, 58, 54, 54, 56, 59, 56, 55, 57, 62, 62, 58, 57, 57, 61, 57, 57, 57, 59, 58,…
$ price <int> 326, 326, 327, 334, 335, 336, 336, 337, 337, 338, 339, 340, 342, 344, 345, 345, 348, 351, 351, 351, 351, 352, 353, 353, 353, 354, 355, …
$ x <dbl> 3.95, 3.89, 4.05, 4.20, 4.34, 3.94, 3.95, 4.07, 3.87, 4.00, 4.25, 3.93, 3.88, 4.35, 3.79, 4.38, 4.31, 4.23, 4.23, 4.21, 4.26, 3.85, 3.9…
$ y <dbl> 3.98, 3.84, 4.07, 4.23, 4.35, 3.96, 3.98, 4.11, 3.78, 4.05, 4.28, 3.90, 3.84, 4.37, 3.75, 4.42, 4.34, 4.29, 4.26, 4.27, 4.30, 3.92, 3.9…
$ z <dbl> 2.43, 2.31, 2.31, 2.63, 2.75, 2.48, 2.47, 2.53, 2.49, 2.39, 2.73, 2.46, 2.33, 2.71, 2.27, 2.68, 2.68, 2.70, 2.71, 2.66, 2.71, 2.48, 2.4…
$ color_D <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, …
$ color_E <int> 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
$ color_F <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ color_G <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ color_H <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, …
$ color_I <int> 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, …
$ color_J <int> 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
Is there an out-of-the-box, non-custom-function way to get my_diamonds back into its original form, with a single column for 'color'?

You can use pivot_longer:
library(dplyr)
tidyr::pivot_longer(my_diamonds, cols = starts_with('color'),
                    names_to = 'color',
                    names_pattern = '.*_(.*)') %>%
  filter(value == 1) %>%
  select(-value)
# A tibble: 53,940 x 10
# carat cut clarity depth table price x y z color
# <dbl> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <chr>
# 1 0.23 Ideal SI2 61.5 55 326 3.95 3.98 2.43 E
# 2 0.21 Premium SI1 59.8 61 326 3.89 3.84 2.31 E
# 3 0.23 Good VS1 56.9 65 327 4.05 4.07 2.31 E
# 4 0.290 Premium VS2 62.4 58 334 4.2 4.23 2.63 I
# 5 0.31 Good SI2 63.3 58 335 4.34 4.35 2.75 J
# 6 0.24 Very Good VVS2 62.8 57 336 3.94 3.96 2.48 J
# 7 0.24 Very Good VVS1 62.3 57 336 3.95 3.98 2.47 I
# 8 0.26 Very Good SI1 61.9 55 337 4.07 4.11 2.53 H
# 9 0.22 Fair VS2 65.1 61 337 3.87 3.78 2.49 E
#10 0.23 Very Good VS1 59.4 61 338 4 4.05 2.39 H
# … with 53,930 more rows

Another option using max.col:
col <- "color"
my_diamonds$color <- my_diamonds %>%
  select(starts_with(col)) %>%
  {gsub(paste0(col, "_"), "", names(.))[max.col(.)]}
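As a quick sanity check (a sketch, assuming the original diamonds data is still available for comparison), the reconstructed column can be compared against the original:
all(my_diamonds$color == diamonds$color)
# should return TRUE if the reconstruction matches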

A base R option:
cols <- sub("color_", "", grep("^color_", names(my_diamonds), value=TRUE)); cols
[1] "D" "E" "F" "G" "H" "I" "J"
my_diamonds$color <- cols[
apply(my_diamonds[,grep("^color_", names(my_diamonds))], 1, which.max]
all(my_diamonds$color==diamonds$color)
#[1] TRUE
Or using the much quicker max.col (thanks @chinsoon12):
my_diamonds$color <- cols[max.col(my_diamonds[,grep("^color_", names(my_diamonds))])]
all(my_diamonds$color == diamonds$color)
#[1] TRUE

Related

How to insert rowname if var name begins with certain string in R

Suppose that I have the following df:
library(dplyr)
glimpse(mydf)
Rows: 3,286
Columns: 741
$ acc_180d_gdd_diff_GO_biz <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ acc_180d_gdd_diff_MG_biz <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ acc_180d_gdd_diff_MS_biz <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ acc_180d_gdd_diff_MT_biz <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ acc_180d_gdd_diff_PR_biz <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ acc_180d_gdd_diff_RS_biz <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ acc_180d_gdd_diff_US_IA_biz <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ acc_180d_gdd_diff_US_IL_biz <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ gdd_90d_precip_GO_biz <dbl> 22.53283, 29.86229, 31.01503, 38…
$ gdd_90d_precip_MG_biz <dbl> 10.96422, 11.70888, 14.23489, 19…
$ gdd_90d_precip_MS_biz <dbl> 6.976152, 27.515620, 27.758262, …
$ gdd_90d_precip_MT_biz <dbl> 26.55052, 26.55052, 64.79208, 67…
...
I'm trying to insert a new column called Group_Var that tells me, based on the variable name, whether it belongs to the climatic group, price group, and so on.
For example, how can I use mutate to put "climatic" in the new Group_Var column when the variable names in mydf start with "acc"?
My desired df is something like...
library(dplyr)
glimpse(mydf)
Rows: 3,286
Columns: 741
$ acc_180d_gdd_diff_GO_biz <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ acc_180d_gdd_diff_MG_biz <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ acc_180d_gdd_diff_MS_biz <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ acc_180d_gdd_diff_MT_biz <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ acc_180d_gdd_diff_PR_biz <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ acc_180d_gdd_diff_RS_biz <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ acc_180d_gdd_diff_US_IA_biz <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ acc_180d_gdd_diff_US_IL_biz <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ Group_Var <chr> "Climatic","Climatic","Climatic",...
$ gdd_90d_precip_GO_biz <dbl> 22.53283, 29.86229, 31.01503, 38…
$ gdd_90d_precip_MG_biz <dbl> 10.96422, 11.70888, 14.23489, 19…
$ gdd_90d_precip_MS_biz <dbl> 6.976152, 27.515620, 27.758262, …
$ gdd_90d_precip_MT_biz <dbl> 26.55052, 26.55052, 64.79208, 67…
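This related question doesn't include an answer in the thread. As a minimal sketch of one possible approach (assuming the goal is to label each variable by its name prefix, and with the prefix/label pairs below made up for illustration), you could build a lookup of the variable names instead of adding a column to the wide data:
library(dplyr)
library(tibble)
var_groups <- tibble(variable = names(mydf)) %>%
  mutate(Group_Var = case_when(
    startsWith(variable, "acc") ~ "Climatic",  # assumed mapping
    startsWith(variable, "gdd") ~ "Climatic",  # assumed mapping
    TRUE ~ "Other"
  ))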

R list within a dataframe calculation

I have the following dataframe df:
tile_type_index
71 17
81 8
71.1 17
81.1 8
71.2 17
71.3 17
material_balance
71 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
81 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
71.1 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
81.1 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
71.2 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
71.3 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
material_spend
71 0.3333333
81 0.3333333
71.1 0.3333333
81.1 0.3333333
71.2 0.3333333
71.3 0.3333333
df<-structure(list(tile_type_index = c(17L, 8L, 17L, 8L, 17L, 17L
), material_balance = list(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0), c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0), c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0), c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0), c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0), c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0)), material_spend = c(0.333333333333333,
0.333333333333333, 0.333333333333333, 0.333333333333333, 0.333333333333333,
0.333333333333333)), row.names = c("71", "81", "71.1", "81.1",
"71.2", "71.3"), class = "data.frame"
For each row of df, I want to add material_spend to the element of material_balance at the index given by tile_type_index. So I want to do something like material_balance[tile_type_index] <- material_spend, but I'm not sure how to do this.
The result should look like the following:
tile_type_index
71 17
81 8
71.1 17
81.1 8
71.2 17
71.3 17
material_balance
71 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.33, 0, 0, 0
81 0, 0, 0, 0, 0, 0, 0, 0.33, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
71.1 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.33, 0, 0, 0
81.1 0, 0, 0, 0, 0, 0, 0, 0.33, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
71.2 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.33, 0, 0, 0
71.3 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.33, 0, 0, 0
material_spend
71 0.3333333
81 0.3333333
71.1 0.3333333
81.1 0.3333333
71.2 0.3333333
71.3 0.3333333
You can use dplyr for this. Normally dplyr assumes you are operating on entire columns at once, but this is a special row-wise operation, so we use the rowwise() verb. We can then use the replace() function to replace part of a vector and return the updated value. Because material_balance is a list column, we need to wrap the result in list() so the resulting value has a length of 1. So this should work:
df %>%
  rowwise() %>%
  mutate(material_balance = list(replace(material_balance, tile_type_index, material_spend)))
In base R you could do:
df[[2]] <- do.call(Map, c(`[<-`, df[c(2, 1, 3)]))
or
df[[2]] <- Map(`[[<-`, df$material_balance, df$tile_type_index, df$material_spend)
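A quick way to confirm any of the approaches above worked (a sketch, assuming the result was assigned back to df): the position named by tile_type_index in each balance vector should now hold material_spend.
df$material_balance[[1]][df$tile_type_index[1]]
# expected to be about 0.3333333 for the first row (index 17)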

Minus values from columns relative to year

I'm trying to subtract values for each habitat covariate between the years 2010 and 2019. That is, for each ID that has values in both 2010 and 2019, subtract one year's values from the other's; rows whose ID doesn't appear in both years are left as they are in the dataframe.
Here's an example of the dataset and what I expect for the output:
#dataset example
# A tibble: 30 x 18
id year pland_00_water pland_01_evergr~ pland_02_evergr~ pland_03_decidu~ pland_04_decidu~ pland_05_mixed_~ pland_06_closed~
<int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 267 2019 0.0833 0 0 0 0 0 0
2 268 2019 0.2 0 0 0 0 0 0
3 362 2019 0.1 0 0 0 0 0 0
4 420 2019 0.0556 0 0 0 0 0 0
5 421 2019 0.0667 0 0 0 0 0 0
6 484 2019 0.125 0 0 0 0 0 0
7 492 2010 0.1 0 0 0 0 0 0
8 492 2019 0.1 0 0 0 0 0 0
9 719 2010 0.0769 0 0 0 0 0 0
10 719 2019 0.0769 0 0 0 0 0 0
#output example
# A tibble: 30 x 18
id year pland_00_water pland_01_evergr~ pland_02_evergr~ pland_03_decidu~ pland_04_decidu~ pland_05_mixed_~ pland_06_closed~
<int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 267 2019 0.0833 0 0 0 0 0 0
2 268 2019 0.2 0 0 0 0 0 0
3 362 2019 0.1 0 0 0 0 0 0
4 420 2019 0.0556 0 0 0 0 0 0
5 421 2019 0.0667 0 0 0 0 0 0
6 484 2019 0.125 0 0 0 0 0 0
7 492 changed 0 0 0 0 0 0 0
9 719 changed 0 0 0 0 0 0 0
I can imagine this working with a function and boolean operators: if both 2010 and 2019 are present for an id, subtract the previous row from the next (assuming the rows are ordered by id); otherwise, if an id doesn't have both years, leave its rows as they are.
I'm trying to wrap my head around which code to use for this; I can see it working within a function and using lapply to apply it across the entire dataset.
Here's reproducible code:
structure(list(id = c(267L, 268L, 362L, 420L, 421L, 484L, 492L,
492L, 719L, 719L, 986L, 986L, 1071L, 1071L, 1303L, 1303L, 1306L,
1399L, 1399L, 1400L, 1400L, 2007L, 2083L, 2083L, 2134L, 2135L,
2136L, 2213L, 2213L, 2214L), year = c(2019, 2019, 2019, 2019,
2019, 2019, 2010, 2019, 2010, 2019, 2010, 2019, 2010, 2019, 2010,
2019, 2010, 2010, 2019, 2010, 2019, 2019, 2010, 2019, 2019, 2019,
2019, 2010, 2019, 2010), pland_00_water = c(0.0833333333333333,
0.2, 0.1, 0.0555555555555556, 0.0666666666666667, 0.125, 0.1,
0.1, 0.0769230769230769, 0.0769230769230769, 0.0588235294117647,
0.0588235294117647, 0.0714285714285714, 0.0714285714285714, 0.0769230769230769,
0.0769230769230769, 0.0588235294117647, 0.05, 0.05, 0.111111111111111,
0.111111111111111, 0.0526315789473684, 0.142857142857143, 0.142857142857143,
0.0666666666666667, 0.0588235294117647, 0.1, 0.142857142857143,
0.142857142857143, 0.25), pland_01_evergreen_needleleaf = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0588235294117647, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), pland_02_evergreen_broadleaf = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0), pland_03_deciduous_needleleaf = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0714285714285714, 0, 0,
0, 0, 0.05, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), pland_04_deciduous_broadleaf = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0714285714285714, 0.0714285714285714,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), pland_05_mixed_forest = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0), pland_06_closed_shrubland = c(0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0), pland_07_open_shrubland = c(0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0), pland_08_woody_savanna = c(0, 0, 0, 0, 0, 0,
0, 0, 0.0769230769230769, 0.0769230769230769, 0.0588235294117647,
0.0588235294117647, 0.0714285714285714, 0.0714285714285714, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), pland_09_savanna = c(0,
0, 0, 0, 0, 0, 0, 0, 0.0769230769230769, 0.0769230769230769,
0.0588235294117647, 0.0588235294117647, 0, 0, 0, 0.0769230769230769,
0.0588235294117647, 0.05, 0.05, 0.111111111111111, 0.111111111111111,
0, 0, 0, 0, 0, 0, 0, 0, 0), pland_10_grassland = c(0.0833333333333333,
0.2, 0.1, 0.0555555555555556, 0.0666666666666667, 0.125, 0.1,
0.1, 0.0769230769230769, 0.0769230769230769, 0.0588235294117647,
0.0588235294117647, 0.0714285714285714, 0.0714285714285714, 0.0769230769230769,
0.0769230769230769, 0.0588235294117647, 0.05, 0.05, 0.111111111111111,
0.111111111111111, 0.0526315789473684, 0.142857142857143, 0.142857142857143,
0.0666666666666667, 0.0588235294117647, 0.1, 0.142857142857143,
0.142857142857143, 0.25), pland_11_wetland = c(0.0833333333333333,
0.2, 0.1, 0.0555555555555556, 0, 0, 0.1, 0.1, 0.0769230769230769,
0.0769230769230769, 0.0588235294117647, 0.0588235294117647, 0.0714285714285714,
0.0714285714285714, 0.0769230769230769, 0.0769230769230769, 0.0588235294117647,
0.05, 0.05, 0.111111111111111, 0, 0.0526315789473684, 0.142857142857143,
0.142857142857143, 0.0666666666666667, 0.0588235294117647, 0.1,
0.142857142857143, 0.142857142857143, 0), pland_12_cropland = c(0.0833333333333333,
0.2, 0.1, 0.0555555555555556, 0.0666666666666667, 0.125, 0.1,
0.1, 0.0769230769230769, 0.0769230769230769, 0.0588235294117647,
0, 0, 0, 0.0769230769230769, 0.0769230769230769, 0.0588235294117647,
0.05, 0.05, 0.111111111111111, 0.111111111111111, 0.0526315789473684,
0.142857142857143, 0.142857142857143, 0.0666666666666667, 0,
0, 0.142857142857143, 0.142857142857143, 0.25), pland_13_urban = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0), pland_14_mosiac = c(0, 0, 0, 0, 0, 0,
0, 0, 0.0769230769230769, 0.0769230769230769, 0, 0.0588235294117647,
0, 0, 0, 0, 0, 0.05, 0.05, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
pland_15_barren = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)), row.names = c(NA,
-30L), class = c("tbl_df", "tbl", "data.frame"))
Here's a tidyverse version:
library(dplyr)
x %>%
  arrange(year) %>%
  # can add 'id' if desired, minimum 'year' required for below
  group_by(id) %>%
  filter(
    all(c("2010", "2019") %in% year),
    year %in% c("2010", "2019")
  ) %>%
  summarize_at(vars(-year), diff) %>%
  mutate(year = "changed") %>%
  ungroup() %>%
  bind_rows(x, .) %>%
  arrange(id, year) # just to show id=492
# # A tibble: 39 x 18
# id year pland_00_water pland_01_evergr~ pland_02_evergr~ pland_03_decidu~ pland_04_decidu~ pland_05_mixed_~
# <int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 267 2019 0.0833 0 0 0 0 0
# 2 268 2019 0.2 0 0 0 0 0
# 3 362 2019 0.1 0 0 0 0 0
# 4 420 2019 0.0556 0 0 0 0 0
# 5 421 2019 0.0667 0 0 0 0 0
# 6 484 2019 0.125 0 0 0 0 0
# 7 492 2010 0.1 0 0 0 0 0
# 8 492 2019 0.1 0 0 0 0 0
# 9 492 chan~ 0 0 0 0 0 0
# 10 719 2010 0.0769 0 0 0 0 0
# # ... with 29 more rows, and 10 more variables: pland_06_closed_shrubland <dbl>, pland_07_open_shrubland <dbl>,
# # pland_08_woody_savanna <dbl>, pland_09_savanna <dbl>, pland_10_grassland <dbl>, pland_11_wetland <dbl>,
# # pland_12_cropland <dbl>, pland_13_urban <dbl>, pland_14_mosiac <dbl>, pland_15_barren <dbl>
Explanation:
the first arrange(year) is so that the later diff sees values in the expected order (assuming all year values sort lexicographically the same as they would numerically);
the filter first removes any ids that do not have both years, and then ensures we have only those two years; while your data only contains "2010" and "2019", I didn't want to assume that ... it's a harmless filter if that's all you have, remove year %in% c("2010","2019") if desired and safe;
I assume that columns other than id and year are numeric/integer, so summarize_at(vars(-year), diff) is safe (id is out of the picture since it is a grouping variable); if there are non-numerical values, you might be able to use summarize_if(is.numeric, diff) which also works here ... but will silently NA-ize non-numeric fields if present;
bind_rows(x, .) is needed because the filter removed many rows we want/need to retain; and
the last arrange(id,year) is solely demonstrative for this answer.
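If you're on dplyr 1.0 or newer, summarize_at() is superseded; the same step can be written with across() (a sketch under that assumption, otherwise identical to the pipeline above):
x %>%
  arrange(year) %>%
  group_by(id) %>%
  filter(
    all(c("2010", "2019") %in% year),
    year %in% c("2010", "2019")
  ) %>%
  summarize(across(-year, diff), .groups = "drop") %>%
  mutate(year = "changed") %>%
  bind_rows(x, .) %>%
  arrange(id, year)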

reshape daily timeseries dataset with days as variable

In this timeseries data frame below, the day of month is a variable. I would like to reshape this dataset from wide to long but keep the right date format.
structure(list(Year = c(1994, 1995, 1996, 1997, 1998, 1999, 2000,
2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011,
2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 1994, 1995,
1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016),
Month = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2), day1 = c(0,
0, 0, 0, 31, 0, 0, 0, 0, 0, 0, 0, 0, 7.4, 0, 0, 28.2, 0,
0, 0, 0, 0, 0, 0, 0, 0, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.4,
18.6, 0, 0, 0, 56, 2, 0, 0.4, 0, 0, 0, 0, 0), day2 = c(0,
0, 0, 0, 8.4, 0, 0, 0, 65.2, 0, 0, 0, 0, 0, 0, 0, 41, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5.2, 0, 0, 0, 0, 0, 0, 0,
6.8, 0, 0, 0, 0, 10.6, 0, 9.2, 0, 0, 0, 0, 21.6), day3 = c(0,
0, 0, 0, 0, 0, 0, 0, 132.4, 0, 0, 0, 0, 0, 0, 0, 0, 1.2,
0, 10.2, 0, 0, 1.6, 0, 0, 0, 0, 0, 0, 7.4, 0, 0, 0, 5.2,
7.8, 0, 2.6, 43.4, 0, 0, 0, 0, 2.6, 0, 0, 0, 0, 0, 0, 6.2
), day4 = c(0, 0, 0, 0, 0, 0, 15.6, 0, 34.6, 0, 0, 0, 0,
0, 0, 0, 81, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 13.1, 0, 0, 0,
0, 0, 0, 53.2, 4, 0, 2.4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6.8,
0), day5 = c(0, 0, 0, 0, 0, 0, 12.4, 0, 1.2, 0, 0, 0, 0,
21, 0, 0, 5, 1, 0, 0, 0, 47, 0, 0, 0, 0, 9.2, 0, 2, 0, 0,
0, 0, 0, 0, 0, 0, 10.2, 0, 3, 0, 0, 0.6, 0, 0, 0, 0, 0, 11.4,
0), day6 = c(8.6, 0, 0, 0, 0, 0, 17.2, 0, 9.4, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0.4, 0, 0, 0, 0, 0, 5.4, 30.5, 61, 0,
0, 0, 0, 0, 0, 0, 0, 11.4, 0, 5.7, 0, 0, 5.8, 0, 0, 0, 0,
0, 0, 0), day7 = c(0, 0, 8.4, 0, 0, 0, 42, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5.2, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0.2, 0, 0.8, 0, 0, 0, 0, 0, 0, 0, 7, 0,
0), day8 = c(2, 0, 0, 3, 0, 0, 26.4, 0, 12.8, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 43, 0, 0, 0, 2, 0, 0, 0, 0, 0,
0, 1.8, 0, 0, 5.8, 13.2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0), day9 = c(0, 0, 0, 0, 0, 0, 17.2, 0, 7.6, 0, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 0, 23, 0, 0, 0, 12,
0, 0, 72.6, 0, 0, 0, 0, 0, 0, 0, 3, 0, 6.6, 0, 0, 0, 19.4,
0, 0), day10 = c(0, 0, 0, 0, 0, 8.2, 10.8, 0, 0, 0, 2.2,
0, 0, 0, 0, 0, 10, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0, 0, 2.5,
0, 19.4, 0, 2.4, 0, 0, 2.4, 0, 0, 0, 0, 0, 0, 0, 0.2, 0,
0, 1.4, 0, 0, 0.4), day11 = c(0, 0, 0, 0, 1.6, 64, 0, 0,
1.6, 0, 29, 0, 0, 0, 0, 0, 16.2, 12.8, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 27.5, 0, 0, 0, 0, 1.4, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 17.6, 0, 0), day12 = c(0, 0, 0, 0, 0, 0, 0,
0, 22.4, 0, 41.6, 0, 0, 2, 0, 0, 0.2, 17.6, 0, 0, 0, 0, 0,
0, 5.6, 0, 0, 0, 0, 23, 0, 0, 3.6, 0, 1.8, 1.2, 14.6, 0,
81.8, 0, 1.4, 4.4, 33, 2.4, 0, 0, 0, 1.6, 0, 0), day13 = c(0,
0, 3, 3.2, 0, 0, 0, 4.2, 0, 0, 6, 0, 0, 2.4, 0, 0, 0, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 5.8, 0, 0, 0, 0, 54.2, 0, 0, 6.2,
16.4, 10, 0, 0, 6.6, 0, 101.2, 0, 0, 0, 0, 0, 0, 0), day14 = c(0,
0, 0, 9, 12.2, 0, 0, 0, 2.6, 0, 26.4, 60.6, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 75, 9, 0, 0, 6.8, 0, 6.4, 0,
7.8, 0, 0, 0, 0, 16.2, 0, 6, 0, 50, 0, 0, 1.4, 0, 0), day15 = c(0,
0, 0, 0, 0, 0, 0, 0, 11.2, 0, 8.6, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 2.8, 0, 2.2, 0, 0, 6.2, 0, 0, 0, 0, 0, 4.2, 0, 0,
0, 0, 0, 0, 50.8, 0, 0, 0.4, 21.8, 0, 23, 0, 0, 0), day16 = c(0,
0, 0, 0, 0, 0, 11.2, 0, 3.2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 33.4, 0, 0, 0,
0, 16.6, 0.6, 0.6, 0, 0, 0, 3.4, 21.6, 0, 0, 0, 0), day17 = c(0,
0, 0, 0, 0, 0, 0, 0, 10.4, 0, 0, 0, 0, 0, 0, 11.2, 0, 0,
0, 14.2, 0, 0, 0, 0, 0, 0, 0, 1.5, 11, 0, 0, 0, 0, 1.2, 0,
0, 0, 0, 1, 1, 20.6, 0, 0, 0, 22.2, 2.6, 0, 2.4, 0, 0), day18 = c(60.6,
0, 0, 0, 0, 0, 0, 0, 28.8, 0, 0.4, 0, 0, 0, 0, 0, 1.2, 0,
0, 0, 0, 0, 9, 0, 0, 5.4, 1.4, 0, 0, 0, 0, 59.6, 11.8, 5.6,
0, 0, 0, 0, 0, 42, 26, 0, 0, 0, 0, 12, 17.8, 1.2, 0, 0),
day19 = c(30, 0, 9.8, 0, 1.2, 0, 0, 0, 1.6, 17.2, 50.6, 0,
0, 0, 0, 0, 16.2, 0, 4.2, 0, 0, 0, 13.4, 0, 1.4, 0, 0, 3.2,
0, 0, 0, 1.2, 32, 0, 0, 0, 0, 0, 0, 29.8, 19.6, 0, 0, 0,
0, 6.4, 1, 0, 1, 0), day20 = c(0, 0, 0, 0, 0, 0, 0, 0, 0,
43.4, 2, 4.4, 0, 0, 0, 0, 4.8, 10, 18.8, 0, 7, 0, 1.6, 0,
46, 0, 0, 70, 5, 0, 16.2, 0, 0, 0, 0, 0, 15.2, 0, 0, 0, 18.4,
0, 21, 0, 2, 60, 0, 0, 5.6, 0), day21 = c(0, 0, 2, 0, 1.8,
47, 0, 0, 0, 22.8, 7.4, 0, 0, 0, 0, 0, 35, 11.4, 0, 6, 0,
0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1.4, 0, 46, 8.4, 0, 0, 0, 0), day22 = c(72, 0, 0, 23,
0, 0, 0, 0, 31.6, 1.6, 15.4, 0, 0, 0, 0, 10.6, 0.6, 12.8,
3, 0, 0, 0, 16, 0, 0, 0, 18.2, 4, 0, 0, 6.4, 0, 0, 1.2, 0,
0, 9.8, 0, 0, 0, 2.2, 0, 12.2, 0, 1, 0, 0, 0, 1.4, 0), day23 = c(1.2,
0, 0, 10, 0, 0, 0, 0, 3.4, 0, 0, 0, 0, 0, 10, 37, 0, 39,
2, 0, 0, 0, 6.2, 19.2, 0, 7.6, 0, 0, 0, 0, 0, 0, 2.4, 0.6,
0, 0, 4.2, 0, 0, 32, 15, 0, 6.8, 0, 0, 0, 0, 0, 18.6, 0),
day24 = c(0, 0, 0, 4.2, 0, 0, 0, 0, 0, 8.4, 14.8, 1.2, 0,
0, 8.4, 20.4, 0, 17, 0, 0, 0, 0, 30.8, 0, 9, 0, 21.6, 0,
0, 25.4, 0, 0, 0, 8.6, 0, 0, 41.4, 0, 0, 6.4, 20.8, 21.6,
22.6, 23.6, 0.8, 4, 0, 0, 0, 4.6), day25 = c(0, 0, 0, 0,
0, 0, 0, 0, 1, 9.2, 32, 0, 0, 0, 0, 0, 0, 2.4, 16, 0, 0,
0, 4, 0, 1.6, 0, 0, 0, 0, 26, 0, 0, 0, 4.2, 0, 0, 1.8, 6,
0, 25.2, 10.2, 0, 0, 0.4, 0, 0, 0, 0, 0, 0), day26 = c(0,
0, 0, 44, 0, 0, 0, 0, 0, 0, 56.6, 0.6, 0, 0, 2, 0, 0, 11.2,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
13, 0, 4.6, 4.4, 26.6, 0, 0, 54.4, 0, 0, 0, 0, 0), day27 = c(0,
0, 0, 0, 10.6, 0, 0, 0, 0, 22.6, 45.4, 0, 0, 0, 15.4, 0,
2.6, 0.4, 0, 0, 0, 0, 0, 0, 2.4, 0, 0, 0, 0, 0, 0, 0, 0,
3.4, 0, 0, 16.8, 14.2, 0, 8.8, 0, 0, 1.8, 0, 0, 4.8, 0, 0,
0, 0), day28 = c(0, 0, 0, 7.4, 0, 0, 0, 6.2, 0, 39.4, 39.2,
0, 0, 0, 0, 0, 0, 8.6, 0, 0, 0, 0, 0.2, 0, 0, 0, 0, 0, 0,
0, 0, 2.4, 0, 0, 2.8, 0, 7.2, 23.2, 0, 0, 0, 2.4, 0.2, 0,
0, 0, 0, 9.6, 0, 0), day29 = c(0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0.8, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NA,
NA, 0, NA, NA, NA, 0.8, NA, NA, NA, 0, NA, NA, NA, 0, NA,
NA, NA, 0, NA, NA, NA, 0), day30 = c(0, 0, 0, 26.6, 0, 0,
0, 0, 0, 0, 8.2, 0, 0, 0, 0, 1.4, 0, 0.6, 12.2, 0, 4.8, 0,
0, 0, 0.6, 0, 0, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), day31 = c(0,
102, 0, 0, 2.4, 0, 0, 0, 2.4, 0, 47, 0, 0, 0, 25, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
)), row.names = c(NA, 50L), class = "data.frame")
If I use the melt function it does not take into account the number of days in each month, putting NA values on dates that don't exist, such as 1994-02-30. I could remove the rows with NA values, but then I would need to be sure that my dataset contains no genuine NA values.
melt(Data, c("Year", "Month"))
My desired output would be like this:
Data<-
Date Value
1994-01-01 0.1
1994-01-02 0
1994-01-03 12
You can get the data in long format, extract the day number from the column names, and combine the year, month and day values to create an actual date.
library(dplyr)
library(tidyr)
df %>%
  pivot_longer(cols = starts_with('day'),
               values_drop_na = TRUE) %>%
  mutate(name = readr::parse_number(name)) %>%
  unite(Date, Year, Month, name, sep = '-') %>%
  mutate(Date = as.Date(Date))
# A tibble: 1,487 x 2
# Date value
# <date> <dbl>
# 1 1994-01-01 0
# 2 1994-01-02 0
# 3 1994-01-03 0
# 4 1994-01-04 0
# 5 1994-01-05 0
# 6 1994-01-06 8.6
# 7 1994-01-07 0
# 8 1994-01-08 2
# 9 1994-01-09 0
#10 1994-01-10 0
# … with 1,477 more rows
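As a quick check (a sketch, assuming the reshaped result above is saved to an object, here called long_df, a made-up name): any Year/Month/day combination that is not a real calendar date would have come out of as.Date() as NA, so counting NAs flags leftover impossible dates.
sum(is.na(long_df$Date))
# should be 0 here, since the impossible day columns hold NA and were dropped by values_drop_na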

Why do I get strings instead of integers when I scrape an HTML table in R?

I am having a difficult time scraping data tables from iea.org. I use the following code:
library("rvest")
url <- "http://www.iea.org/statistics/statisticssearch/report/?country=ZAMBIA&product=balances&year=2013"
energy <- url %>%
  html() %>%
  html_nodes(xpath = '//*[@id="stats-container"]/div[2]/table') %>%
  html_table()
head(energy)
Instead of having numbers in the cells of the table, the resulting table in R only contains letters.
Thanks for the help in advance.
Until proven otherwise (or the site owners read up on how to use robots.txt and find a real lawyer to craft more explicit & restrictive T&Cs)…
I'll start with a non-"tidyverse" solution for this answer:
library(rvest)
x <- read_html("http://www.iea.org/statistics/statisticssearch/report/?country=ZAMBIA&product=balances&year=2013")
# find the table; note that a less "structural" selector will generally make
# scraping code a bit less fragile.
xdf <- html_node(x, xpath=".//table[contains(., 'International marine')]")
xdf <- html_table(xdf)
# clean up column names
xdf <- janitor::clean_names(xdf)
Now, the columns are encoded as noted by the OP and in the question comment discussions:
xdf$oil_products
## [1] "MA==" "Mzkx" "LTUw" "MA==" "LTUy" "MA==" "Mjkw" "MA==" "MQ==" "LTEw"
## [11] "MA==" "MA==" "MA==" "NjAx" "MA==" "MA==" "MA==" "LTE1" "MA==" "ODY2"
## [21] "MzQ2" "MzMy" "MTI0" "Nw==" "NDI=" "MjY=" "MA==" "NTA=" "NjM=" "MA=="
The == padding gives it away as base64 encoded (the URL mentioned in the comments further confirms this). Each value was encoded, so we need to decode it from base64 first and then convert it to numeric:
# decode each column
lapply(xdf[2:12], function(.x) {
  as.numeric(
    sapply(.x, function(.y) {
      rawToChar(openssl::base64_decode(.y))
    }, USE.NAMES = FALSE)
  )
}) -> xdf[2:12]
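Decoding a single cell by hand shows the scheme at work (for example, the second oil_products value shown above):
rawToChar(openssl::base64_decode("Mzkx"))
## [1] "391"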
A quick str() alternative view:
tibble::glimpse(xdf)
## Observations: 30
## Variables: 12
## $ x <chr> "Production", "Imports", "Exports", "International marine bunkers***", "International aviation bunkers***", "Stock c...
## $ coal <dbl> 88, 0, 0, 0, 0, 0, 88, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 88, 88, 0, 0, 0, 0, 0, 0, 0, 0, 0
## $ crude_oil <dbl> 0, 618, 0, 0, 0, 21, 639, 0, 0, 0, 0, 0, 0, -639, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
## $ oil_products <dbl> 0, 391, -50, 0, -52, 0, 290, 0, 1, -10, 0, 0, 0, 601, 0, 0, 0, -15, 0, 866, 346, 332, 124, 7, 42, 26, 0, 50, 63, 0
## $ natural_gas <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
## $ nuclear <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
## $ hydro <dbl> 1142, 0, 0, 0, 0, 0, 1142, 0, 0, -1142, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
## $ geothermal_solar_etc <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
## $ biofuels_and_waste <dbl> 7579, 0, 0, 0, 0, 0, 7579, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1661, 0, 0, 5918, 1479, 0, 4438, 4438, 0, 0, 0, 0, 0, 0
## $ electricity <dbl> 0, 6, -93, 0, 0, 0, -87, 0, 0, 1144, 0, 0, 0, 0, 0, 0, 0, -26, -98, 933, 549, 2, 382, 289, 59, 23, 0, 10, 0, 0
## $ heat <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
## $ total <dbl> 8809, 1016, -143, 0, -52, 21, 9651, 0, 1, -9, 0, 0, 0, -39, 0, 0, -1661, -41, -98, 7805, 2462, 335, 4945, 4734, 101,...
And an enhanced print:
tibble::as_tibble(xdf)
## # A tibble: 30 x 12
## x coal crude_oil oil_products natural_gas nuclear hydro geothermal_solar_etc biofuels_and_waste electricity heat
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Production 88 0 0 0 0 1142 0 7579 0 0
## 2 Imports 0 618 391 0 0 0 0 0 6 0
## 3 Exports 0 0 -50 0 0 0 0 0 -93 0
## 4 International marine bunkers*** 0 0 0 0 0 0 0 0 0 0
## 5 International aviation bunkers*** 0 0 -52 0 0 0 0 0 0 0
## 6 Stock changes 0 21 0 0 0 0 0 0 0 0
## 7 TPES 88 639 290 0 0 1142 0 7579 -87 0
## 8 Transfers 0 0 0 0 0 0 0 0 0 0
## 9 Statistical differences 0 0 1 0 0 0 0 0 0 0
## 10 Electricity plants 0 0 -10 0 0 -1142 0 0 1144 0
## # ... with 20 more rows, and 1 more variables: total <dbl>
The tidyverse is a bit cleaner (it needs dplyr and purrr loaded):
library(dplyr)
library(purrr)

decode_cols <- function(.x) {
  map_dbl(.x, ~{
    openssl::base64_decode(.x) %>%
      rawToChar() %>%
      as.numeric()
  })
}

html_node(x, xpath=".//table[contains(., 'International marine')]") %>%
  html_table() %>%
  janitor::clean_names() %>%
  mutate_at(vars(-x), decode_cols)
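On dplyr 1.0+, mutate_at() is likewise superseded; the final step can be written with across() (a sketch under that assumption):
html_node(x, xpath=".//table[contains(., 'International marine')]") %>%
  html_table() %>%
  janitor::clean_names() %>%
  mutate(across(-x, decode_cols))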
