Related
I would like to create two new columns based on a third one. These two columns should have incrementing values of two different kinds.
Let´s take the following dataset as an example:
events <- data.frame(Frame = seq(from = 1001, to = 1033, by = 1),
Value = c(2.05, 0, 2.26, 2.38, 0, 0, 2.88, 0.32, 0.85, 2.85, 2.09, 0, 0, 0, 1.11, 0, 0,
0, 2.46, 2.85, 0, 0, 0.38, 1.91, 0, 0, 0, 2.23, 0, 0.48, 1.83, 0.23, 1.49))
I would like to create:
a column called "Number" incrementing everytime there is a sequence starting with 0 in the column "Value", and
a column called "Duration" starting from 1 everytime a new sequence of 0s is present in the column "Value" and incrementing with 1 as long as the sequence of 0s continues.
Ideally, the final data frame would be this one:
events_final <- data.frame(Frame = seq(from = 1001, to = 1033, by = 1),
Value = c(2.05, 0, 2.26, 2.38, 0, 0, 2.88, 0.32, 0.85, 2.85, 2.09, 0, 0, 0, 1.11, 0, 0,
0, 2.46, 2.85, 0, 0, 0.38, 1.91, 0, 0, 0, 2.23, 0, 0.48, 1.83, 0.23, 1.49),
Number = c(0, 1, 0, 0, 2, 2, 0, 0, 0, 0, 0, 3, 3, 3, 0, 4, 4,
4, 0, 0, 5, 5, 0, 0, 6, 6, 6, 0, 7, 0, 0, 0, 0),
Duration = c(0, 1, 0, 0, 1, 2, 0, 0, 0, 0, 0, 1, 2, 3, 0, 1, 2,
3, 0, 0, 1, 2, 0, 0, 1, 2, 3, 0, 1, 0, 0, 0, 0))
I tried to use the tidyverse to do so, but I do not manage to get what I need [I am even very far from it]:
events %>%
mutate(Number = ifelse(Value > 0, NA, 1),
Duration = case_when(Value == 0 & lag(Value, n = 1) != 0 ~ 1,
Value == 0 & lag(Value, n = 1) == 0 ~ 2))
By looking for related questions, I found that this was feasible in SQL [https://stackoverflow.com/questions/42971752/increment-value-based-on-another-column]. I also know that this is quite easy to be done in Excel [the first Value is in the cell B2]:
Number column [Column C]: =IF(B2>0,0,IF(B1=0,C1,MAX(C$1:C1)+1))
Duration column [Column D]: =IF(B2>0,0,IF(B1=0,D1+1,1))
But I need to have it work in R ;-)
Any help is welcome :-)
You can leverage data.table::rleid() twice here to solve the problem
library(data.table)
setDT(events)
events[, Number:=rleid(fifelse(Value==0,1,0))] %>%
.[Value==0,Number:=rleid(Number)] %>%
.[Value!=0,Number:=0] %>%
.[, Duration:=fifelse(Value==0, 1:.N,0), Number] %>%
.[]
Output:
Frame Value Number Duration
1: 1001 2.05 0 0
2: 1002 0.00 1 1
3: 1003 2.26 0 0
4: 1004 2.38 0 0
5: 1005 0.00 2 1
6: 1006 0.00 2 2
7: 1007 2.88 0 0
8: 1008 0.32 0 0
9: 1009 0.85 0 0
10: 1010 2.85 0 0
11: 1011 2.09 0 0
12: 1012 0.00 3 1
13: 1013 0.00 3 2
14: 1014 0.00 3 3
15: 1015 1.11 0 0
16: 1016 0.00 4 1
17: 1017 0.00 4 2
18: 1018 0.00 4 3
19: 1019 2.46 0 0
20: 1020 2.85 0 0
21: 1021 0.00 5 1
22: 1022 0.00 5 2
23: 1023 0.38 0 0
24: 1024 1.91 0 0
25: 1025 0.00 6 1
26: 1026 0.00 6 2
27: 1027 0.00 6 3
28: 1028 2.23 0 0
29: 1029 0.00 7 1
30: 1030 0.48 0 0
31: 1031 1.83 0 0
32: 1032 0.23 0 0
33: 1033 1.49 0 0
Here is a tidyverse solution:
library(tidyverse)
events |>
mutate(Number = replace(cumsum(Value == 0 & lag(Value != 0)), which(Value != 0), 0)) |>
group_by(tmp = cumsum(Value == 0 & lag(Value != 0))) |>
mutate(Duration = replace(row_number(), which(Value != 0), 0)) |>
ungroup() |>
select(-tmp)
#> # A tibble: 33 x 4
#> Frame Value Number Duration
#> <dbl> <dbl> <dbl> <dbl>
#> 1 1001 2.05 0 0
#> 2 1002 0 1 1
#> 3 1003 2.26 0 0
#> 4 1004 2.38 0 0
#> 5 1005 0 2 1
#> 6 1006 0 2 2
#> 7 1007 2.88 0 0
#> 8 1008 0.32 0 0
#> 9 1009 0.85 0 0
#> 10 1010 2.85 0 0
#> # ... with 23 more rows
Here's a dplyr-based solution with a bit of cleverness for the Number column, but still relying on data.table::rleid for the Duration column:
events_final %>%
mutate(
add = Value == 0 & lag(Value) != 0,
Number_result = cumsum(add) * (Value == 0),
rle_0 = data.table::rleid(Value == 0)
) %>%
group_by(rle_0) %>%
mutate(
Duration_result = ifelse(Value == 0, row_number(), 0)
) %>%
ungroup() %>%
select(-add, -rle_0)
# # A tibble: 33 × 6
# Frame Value Number Duration Number_result Duration_result
# <dbl> <dbl> <dbl> <dbl> <int> <dbl>
# 1 1001 2.05 0 0 0 0
# 2 1002 0 1 1 1 1
# 3 1003 2.26 0 0 0 0
# 4 1004 2.38 0 0 0 0
# 5 1005 0 2 1 2 1
# 6 1006 0 2 2 2 2
# 7 1007 2.88 0 0 0 0
# 8 1008 0.32 0 0 0 0
# 9 1009 0.85 0 0 0 0
# 10 1010 2.85 0 0 0 0
# # … with 23 more rows
# # ℹ Use `print(n = ...)` to see more rows
Here is another (ugly) way to do it. Nowhere near as elegant as #langtang's solution but it works...
events <- data.frame(Frame = seq(from = 1001, to = 1033, by = 1),
Value = c(2.05, 0, 2.26, 2.38, 0, 0, 2.88, 0.32, 0.85, 2.85, 2.09, 0, 0, 0, 1.11, 0, 0,
0, 2.46, 2.85, 0, 0, 0.38, 1.91, 0, 0, 0, 2.23, 0, 0.48, 1.83, 0.23, 1.49))
events_final <- data.frame(Frame = seq(from = 1001, to = 1033, by = 1),
Value = c(2.05, 0, 2.26, 2.38, 0, 0, 2.88, 0.32, 0.85, 2.85, 2.09, 0, 0, 0, 1.11, 0, 0,
0, 2.46, 2.85, 0, 0, 0.38, 1.91, 0, 0, 0, 2.23, 0, 0.48, 1.83, 0.23, 1.49),
Number = c(0, 1, 0, 0, 2, 2, 0, 0, 0, 0, 0, 3, 3, 3, 0, 4, 4,
4, 0, 0, 5, 5, 0, 0, 6, 6, 6, 0, 7, 0, 0, 0, 0),
Duration = c(0, 1, 0, 0, 1, 2, 0, 0, 0, 0, 0, 1, 2, 3, 0, 1, 2,
3, 0, 0, 1, 2, 0, 0, 1, 2, 3, 0, 1, 0, 0, 0, 0))
library(stringr)
events$Number <- events$Value == 0
events$tmp <- NA
i <- 0
lapply(2:nrow(events), function(x) {
if ((events[ x, 'Number' ] == TRUE) &
(events[ x - 1, 'Number' ] == FALSE)) {
i <<- i + 1
events[ x, 'tmp' ] <<- i
} else if ((events[ x, 'Number' ] == TRUE) &
(events[ x - 1, 'Number' ] == TRUE)) {
events[ x, 'tmp' ] <<- i
}
}) |>
invisible()
idx <- which(is.na(events$tmp))
events[ idx, 'tmp' ] <- 0
events <- split(events, events$tmp) |>
lapply(function(x) {
if (unique(x$tmp) > 0) {
x$duration <- 1:nrow(x)
} else {
x$duration <- 0
}
x
}) |>
data.table::rbindlist(fill = TRUE) |>
as.data.frame()
idx <- order(events$Frame)
events <- events[ idx, ]
events$Number <- NULL
colnames(events) <- c('Frame', 'Value', 'Number', 'Duration')
rownames(events) <- NULL
print(events)
identical(events, events_final)
I'm trying to minus values for each habitat covariate relative to year 2019 and 2010. So, something that can assign by ID those values belonging to each habitat for 2010 and 2019, minus them, otherwise, those that aren't grouped by ID are left as is in the dataframe.
Here's an example of the dataset and what I expect for the output:
#dataset example
# A tibble: 30 x 18
id year pland_00_water pland_01_evergr~ pland_02_evergr~ pland_03_decidu~ pland_04_decidu~ pland_05_mixed_~ pland_06_closed~
<int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 267 2019 0.0833 0 0 0 0 0 0
2 268 2019 0.2 0 0 0 0 0 0
3 362 2019 0.1 0 0 0 0 0 0
4 420 2019 0.0556 0 0 0 0 0 0
5 421 2019 0.0667 0 0 0 0 0 0
6 484 2019 0.125 0 0 0 0 0 0
7 492 2010 0.1 0 0 0 0 0 0
8 492 2019 0.1 0 0 0 0 0 0
9 719 2010 0.0769 0 0 0 0 0 0
10 719 2019 0.0769 0 0 0 0 0 0
#output example
# A tibble: 30 x 18
id year pland_00_water pland_01_evergr~ pland_02_evergr~ pland_03_decidu~ pland_04_decidu~ pland_05_mixed_~ pland_06_closed~
<int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 267 2019 0.0833 0 0 0 0 0 0
2 268 2019 0.2 0 0 0 0 0 0
3 362 2019 0.1 0 0 0 0 0 0
4 420 2019 0.0556 0 0 0 0 0 0
5 421 2019 0.0667 0 0 0 0 0 0
6 484 2019 0.125 0 0 0 0 0 0
7 492 changed 0 0 0 0 0 0 0
9 719 changed 0 0 0 0 0 0 0
I can imagine this working with a function and boolean operators such that, if year 2010 & 2019 match by id then minus the next row by the previous (assuming that they're ordered by id then this should work), otherwise, if they do not match by id then leave them as is.
I'm trying to wrap my head around which code to use for this, I can see this working within a function and using lapply to apply across the entire dataset.
Here's a reproducible code:
structure(list(id = c(267L, 268L, 362L, 420L, 421L, 484L, 492L,
492L, 719L, 719L, 986L, 986L, 1071L, 1071L, 1303L, 1303L, 1306L,
1399L, 1399L, 1400L, 1400L, 2007L, 2083L, 2083L, 2134L, 2135L,
2136L, 2213L, 2213L, 2214L), year = c(2019, 2019, 2019, 2019,
2019, 2019, 2010, 2019, 2010, 2019, 2010, 2019, 2010, 2019, 2010,
2019, 2010, 2010, 2019, 2010, 2019, 2019, 2010, 2019, 2019, 2019,
2019, 2010, 2019, 2010), pland_00_water = c(0.0833333333333333,
0.2, 0.1, 0.0555555555555556, 0.0666666666666667, 0.125, 0.1,
0.1, 0.0769230769230769, 0.0769230769230769, 0.0588235294117647,
0.0588235294117647, 0.0714285714285714, 0.0714285714285714, 0.0769230769230769,
0.0769230769230769, 0.0588235294117647, 0.05, 0.05, 0.111111111111111,
0.111111111111111, 0.0526315789473684, 0.142857142857143, 0.142857142857143,
0.0666666666666667, 0.0588235294117647, 0.1, 0.142857142857143,
0.142857142857143, 0.25), pland_01_evergreen_needleleaf = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0588235294117647, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), pland_02_evergreen_broadleaf = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0), pland_03_deciduous_needleleaf = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0714285714285714, 0, 0,
0, 0, 0.05, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), pland_04_deciduous_broadleaf = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0714285714285714, 0.0714285714285714,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), pland_05_mixed_forest = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0), pland_06_closed_shrubland = c(0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0), pland_07_open_shrubland = c(0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0), pland_08_woody_savanna = c(0, 0, 0, 0, 0, 0,
0, 0, 0.0769230769230769, 0.0769230769230769, 0.0588235294117647,
0.0588235294117647, 0.0714285714285714, 0.0714285714285714, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), pland_09_savanna = c(0,
0, 0, 0, 0, 0, 0, 0, 0.0769230769230769, 0.0769230769230769,
0.0588235294117647, 0.0588235294117647, 0, 0, 0, 0.0769230769230769,
0.0588235294117647, 0.05, 0.05, 0.111111111111111, 0.111111111111111,
0, 0, 0, 0, 0, 0, 0, 0, 0), pland_10_grassland = c(0.0833333333333333,
0.2, 0.1, 0.0555555555555556, 0.0666666666666667, 0.125, 0.1,
0.1, 0.0769230769230769, 0.0769230769230769, 0.0588235294117647,
0.0588235294117647, 0.0714285714285714, 0.0714285714285714, 0.0769230769230769,
0.0769230769230769, 0.0588235294117647, 0.05, 0.05, 0.111111111111111,
0.111111111111111, 0.0526315789473684, 0.142857142857143, 0.142857142857143,
0.0666666666666667, 0.0588235294117647, 0.1, 0.142857142857143,
0.142857142857143, 0.25), pland_11_wetland = c(0.0833333333333333,
0.2, 0.1, 0.0555555555555556, 0, 0, 0.1, 0.1, 0.0769230769230769,
0.0769230769230769, 0.0588235294117647, 0.0588235294117647, 0.0714285714285714,
0.0714285714285714, 0.0769230769230769, 0.0769230769230769, 0.0588235294117647,
0.05, 0.05, 0.111111111111111, 0, 0.0526315789473684, 0.142857142857143,
0.142857142857143, 0.0666666666666667, 0.0588235294117647, 0.1,
0.142857142857143, 0.142857142857143, 0), pland_12_cropland = c(0.0833333333333333,
0.2, 0.1, 0.0555555555555556, 0.0666666666666667, 0.125, 0.1,
0.1, 0.0769230769230769, 0.0769230769230769, 0.0588235294117647,
0, 0, 0, 0.0769230769230769, 0.0769230769230769, 0.0588235294117647,
0.05, 0.05, 0.111111111111111, 0.111111111111111, 0.0526315789473684,
0.142857142857143, 0.142857142857143, 0.0666666666666667, 0,
0, 0.142857142857143, 0.142857142857143, 0.25), pland_13_urban = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0), pland_14_mosiac = c(0, 0, 0, 0, 0, 0,
0, 0, 0.0769230769230769, 0.0769230769230769, 0, 0.0588235294117647,
0, 0, 0, 0, 0, 0.05, 0.05, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
pland_15_barren = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)), row.names = c(NA,
-30L), class = c("tbl_df", "tbl", "data.frame"))
Here's a tidyverse version:
library(dplyr)
x %>%
arrange(year) %>%
# can add 'id' if desired, minimum 'year' required for below
group_by(id) %>%
filter(
all(c("2010", "2019") %in% year),
year %in% c("2010", "2019")
) %>%
summarize_at(vars(-year), diff) %>%
mutate(year = "changed") %>%
ungroup() %>%
bind_rows(x, .) %>%
arrange(id, year) # just to show id=492
# # A tibble: 39 x 18
# id year pland_00_water pland_01_evergr~ pland_02_evergr~ pland_03_decidu~ pland_04_decidu~ pland_05_mixed_~
# <int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 267 2019 0.0833 0 0 0 0 0
# 2 268 2019 0.2 0 0 0 0 0
# 3 362 2019 0.1 0 0 0 0 0
# 4 420 2019 0.0556 0 0 0 0 0
# 5 421 2019 0.0667 0 0 0 0 0
# 6 484 2019 0.125 0 0 0 0 0
# 7 492 2010 0.1 0 0 0 0 0
# 8 492 2019 0.1 0 0 0 0 0
# 9 492 chan~ 0 0 0 0 0 0
# 10 719 2010 0.0769 0 0 0 0 0
# # ... with 29 more rows, and 10 more variables: pland_06_closed_shrubland <dbl>, pland_07_open_shrubland <dbl>,
# # pland_08_woody_savanna <dbl>, pland_09_savanna <dbl>, pland_10_grassland <dbl>, pland_11_wetland <dbl>,
# # pland_12_cropland <dbl>, pland_13_urban <dbl>, pland_14_mosiac <dbl>, pland_15_barren <dbl>
Explanation:
the first arrange(year) is so that the diff later will have values in an expected order (assuming all years are year-like that sort lexicographically the same as a numerical sort);
the filter first removes any ids that do not have both years, and then ensures we have only those two years; while your data only contains "2010" and "2019", I didn't want to assume that ... it's a harmless filter if that's all you have, remove year %in% c("2010","2019") if desired and safe;
I assume that columns other than id and year are numeric/integer, so summarize_at(vars(-year), diff) is safe (id is out of the picture since it is a grouping variable); if there are non-numerical values, you might be able to use summarize_if(is.numeric, diff) which also works here ... but will silently NA-ize non-numeric fields if present;
bind_rows(x, .) is needed because the filter removed many rows we want/need to retain; and
the last arrange(id,year) is solely demonstrative for this answer.
I have two data frames:
> df1
2013-04-1 2013-04-2 2013-04-3 2013-04-4 2013-04-5 2013-04-6 2013-04-7 2013-04-8 2013-04-9 2013-04-10 2013-04-11
bin_1 32 489 32 32 364 19 312 0 0 0 346
bin_2 8 346 8 0 98 8 12 12 46 364 346
bin_3 9 98 346 46 9 312 6 1912 0 489 0
bin_4 4 12 9 12 0 12 0 987 9 19 12
bin_5 0 0 8 8 0 0 312 6 312 12 4
df1 contains 5 rows (bins) and 23 columns (date)
> df2
orange apple pear banana watermelon lemon
2013-04-1 1 1 1 1 0 1
2013-04-2 1 1 0 1 0 0
2013-04-3 1 1 1 1 0 1
2013-04-4 0 1 0 1 1 1
2013-04-5 1 0 0 0 1 1
df2 contains 23 rows(date) and 6 columns (types of fruits)
So now, I want to concentrate these 2 dfs into 1 big data frame that contains all the information, like:
> df3
orange apple pear banana watermelon lemon
bin_1 ? ? ? ? ? ?
bin_2 ? ? ? ? ? ?
bin_3 ? ? ? ? ? ?
bin_4 ? ? ? ? ? ?
bin_5 ? ? ? ? ? ?
But how can i concentrate the data? So for example,
on 2013-04-1,
bin_1 contains 32 fruits, bin_2 contains 8 fruits, ..., bin_5 contains 0 fruits (based on df1)
only orange, apple, pear, banana, and lemon are available (based on df2)
Q. I want my df3 to contain concentrate information, like bin_1 on average contain x amount of oranges, ...etc .How can I model this?
Code:
> dput(df1)
structure(list(`2013-04-1` = c(32, 8, 9, 4, 0), `2013-04-2` = c(489,
346, 98, 12, 0), `2013-04-3` = c(32, 8, 346, 9, 8), `2013-04-4` = c(32,
0, 46, 12, 8), `2013-04-5` = c(364, 98, 9, 0, 0), `2013-04-6` = c(19,
8, 312, 12, 0), `2013-04-7` = c(312, 12, 6, 0, 312), `2013-04-8` = c(0,
12, 1912, 987, 6), `2013-04-9` = c(0, 46, 0, 9, 312), `2013-04-10` = c(0,
364, 489, 19, 12), `2013-04-11` = c(346, 346, 0, 12, 4), `2013-04-12` = c(0,
9, 12, 46, 489), `2013-04-13` = c(32, 8, 19, 46, 0), `2013-04-14` = c(0,
987, 12, 0, 6), `2013-04-15` = c(0, 346, 4, 346, 0), `2013-04-16` = c(0,
1912, 1912, 12, 364), `2013-04-17` = c(12, 98, 32, 32, 1912),
`2013-04-18` = c(12, 12, 12, 0, 346), `2013-04-19` = c(9,
46, 98, 312, 4), `2013-04-20` = c(32, 987, 46, 9, 312), `2013-04-21` = c(4,
98, 12, 32, 12), `2013-04-22` = c(19, 0, 4, 346, 0), `2013-04-23` = c(1912,
364, 0, 0, 489)), row.names = c("bin_1", "bin_2", "bin_3",
"bin_4", "bin_5"), class = "data.frame")
> dput(df2)
structure(list(orange = c(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1,
1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0), apple = c(1, 1, 1, 1, 0, 1,
0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0), pear = c(1,
0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1,
0), banana = c(1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1,
0, 0, 1, 1, 0, 1, 0), watermelon = c(0, 0, 0, 1, 1, 0, 1, 1,
1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0), lemon = c(1, 0,
1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0
)), row.names = c("2013-04-1", "2013-04-2", "2013-04-3", "2013-04-4",
"2013-04-5", "2013-04-6", "2013-04-7", "2013-04-8", "2013-04-9",
"2013-04-10", "2013-04-11", "2013-04-12", "2013-04-13", "2013-04-14",
"2013-04-15", "2013-04-16", "2013-04-17", "2013-04-18", "2013-04-19",
"2013-04-20", "2013-04-21", "2013-04-22", "2013-04-23"), class = "data.frame")
Example:
my_diamonds <- diamonds %>% fastDummies::dummy_cols(select_columns = "color", remove_selected_columns = T)
my_diamonds %>% glimpse
Looks like this:
Observations: 53,940
Variables: 16
$ carat <dbl> 0.23, 0.21, 0.23, 0.29, 0.31, 0.24, 0.24, 0.26, 0.22, 0.23, 0.30, 0.23, 0.22, 0.31, 0.20, 0.32, 0.30, 0.30, 0.30, 0.30, 0.30, 0.23, 0.2…
$ cut <ord> Ideal, Premium, Good, Premium, Good, Very Good, Very Good, Very Good, Fair, Very Good, Good, Ideal, Premium, Ideal, Premium, Premium, I…
$ clarity <ord> SI2, SI1, VS1, VS2, SI2, VVS2, VVS1, SI1, VS2, VS1, SI1, VS1, SI1, SI2, SI2, I1, SI2, SI1, SI1, SI1, SI2, VS2, VS1, SI1, SI1, VVS2, VS1…
$ depth <dbl> 61.5, 59.8, 56.9, 62.4, 63.3, 62.8, 62.3, 61.9, 65.1, 59.4, 64.0, 62.8, 60.4, 62.2, 60.2, 60.9, 62.0, 63.4, 63.8, 62.7, 63.3, 63.8, 61.…
$ table <dbl> 55, 61, 65, 58, 58, 57, 57, 55, 61, 61, 55, 56, 61, 54, 62, 58, 54, 54, 56, 59, 56, 55, 57, 62, 62, 58, 57, 57, 61, 57, 57, 57, 59, 58,…
$ price <int> 326, 326, 327, 334, 335, 336, 336, 337, 337, 338, 339, 340, 342, 344, 345, 345, 348, 351, 351, 351, 351, 352, 353, 353, 353, 354, 355, …
$ x <dbl> 3.95, 3.89, 4.05, 4.20, 4.34, 3.94, 3.95, 4.07, 3.87, 4.00, 4.25, 3.93, 3.88, 4.35, 3.79, 4.38, 4.31, 4.23, 4.23, 4.21, 4.26, 3.85, 3.9…
$ y <dbl> 3.98, 3.84, 4.07, 4.23, 4.35, 3.96, 3.98, 4.11, 3.78, 4.05, 4.28, 3.90, 3.84, 4.37, 3.75, 4.42, 4.34, 4.29, 4.26, 4.27, 4.30, 3.92, 3.9…
$ z <dbl> 2.43, 2.31, 2.31, 2.63, 2.75, 2.48, 2.47, 2.53, 2.49, 2.39, 2.73, 2.46, 2.33, 2.71, 2.27, 2.68, 2.68, 2.70, 2.71, 2.66, 2.71, 2.48, 2.4…
$ color_D <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, …
$ color_E <int> 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
$ color_F <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ color_G <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ color_H <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, …
$ color_I <int> 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, …
$ color_J <int> 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
Is there an out of the box, non custom function way to get my_diamonds back into it's original form with a single column for 'color'?
You can use pivot_longer :
library(dplyr)
tidyr::pivot_longer(my_diamonds, cols = starts_with('color'),
names_to = 'color',
names_pattern = '.*_(.*)') %>%
filter(value == 1) %>%
select(-value)
# A tibble: 53,940 x 10
# carat cut clarity depth table price x y z color
# <dbl> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <chr>
# 1 0.23 Ideal SI2 61.5 55 326 3.95 3.98 2.43 E
# 2 0.21 Premium SI1 59.8 61 326 3.89 3.84 2.31 E
# 3 0.23 Good VS1 56.9 65 327 4.05 4.07 2.31 E
# 4 0.290 Premium VS2 62.4 58 334 4.2 4.23 2.63 I
# 5 0.31 Good SI2 63.3 58 335 4.34 4.35 2.75 J
# 6 0.24 Very Good VVS2 62.8 57 336 3.94 3.96 2.48 J
# 7 0.24 Very Good VVS1 62.3 57 336 3.95 3.98 2.47 I
# 8 0.26 Very Good SI1 61.9 55 337 4.07 4.11 2.53 H
# 9 0.22 Fair VS2 65.1 61 337 3.87 3.78 2.49 E
#10 0.23 Very Good VS1 59.4 61 338 4 4.05 2.39 H
# … with 53,930 more rows
Another option using max.col:
col <- "color"
my_diamonds$color <- my_diamonds %>%
select(starts_with(col)) %>%
{gsub(paste0(col,"_"), "", names(.))[max.col(.)]}
A base R option:
cols <- sub("color_", "", grep("^color_", names(my_diamonds), value=TRUE)); cols
[1] "D" "E" "F" "G" "H" "I" "J"
my_diamonds$color <- cols[
apply(my_diamonds[,grep("^color_", names(my_diamonds))], 1, which.max]
all(my_diamonds$color==diamonds$color)
#[1] TRUE
Or using the much quicker max.col (thanks #chinsoon12):
my_diamonds$color <- cols[max.col(my_diamonds[,grep("^color_", names(my_diamonds))])]
all(my_diamonds$color == diamonds$color)
#[1] TRUE
I am having a difficult time scraping data tables from [iea.org][1]. I use the following code :
library("rvest")
url <- "http://www.iea.org/statistics/statisticssearch/report/?country=ZAMBIA&product=balances&year=2013"
energy <- url %>%
html() %>%
html_nodes(xpath='//*[#id="stats-container"]/div[2]/table') %>%
html_table()
head(energy)
Instead of having numbers in the cells of the table, the resulting table in R only contains letters.
Thanks for the help in advance.
Until proven otherwise (or the site owners read up on how to use robots.txt and find a real lawyer to craft more explicit & restrictive T&Cs)…
I'll start with a non-"tidyverse" solution for this answer:
library(rvest)
x <- read_html("http://www.iea.org/statistics/statisticssearch/report/?country=ZAMBIA&product=balances&year=2013")
# find the table; note that a less "structural" selector will generally make
# scraping code a bit less fragile.
xdf <- html_node(x, xpath=".//table[contains(., 'International marine')]")
xdf <- html_table(xdf)
# clean up column names
xdf <- janitor::clean_names(xdf)
Now, the columns are encoded as noted by the OP and in the question comment discussions:
xdf$oil_products
## [1] "MA==" "Mzkx" "LTUw" "MA==" "LTUy" "MA==" "Mjkw" "MA==" "MQ==" "LTEw"
## [11] "MA==" "MA==" "MA==" "NjAx" "MA==" "MA==" "MA==" "LTE1" "MA==" "ODY2"
## [21] "MzQ2" "MzMy" "MTI0" "Nw==" "NDI=" "MjY=" "MA==" "NTA=" "NjM=" "MA=="
The == gives it away as base64 encoded (though the URL mentioned in the comments further confirms this). They encoded each character so we need to convert them from b64 first then convert to numeric:
# decode each column
lapply(xdf[2:12], function(.x) {
as.numeric(
sapply(.x, function(.y) {
rawToChar(openssl::base64_decode(.y))
}, USE.NAMES=FALSE)
)
}) -> xdf[2:12]
A quick str() alternative view:
tibble::glimpse(xdf)
## Observations: 30
## Variables: 12
## $ x <chr> "Production", "Imports", "Exports", "International marine bunkers***", "International aviation bunkers***", "Stock c...
## $ coal <dbl> 88, 0, 0, 0, 0, 0, 88, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 88, 88, 0, 0, 0, 0, 0, 0, 0, 0, 0
## $ crude_oil <dbl> 0, 618, 0, 0, 0, 21, 639, 0, 0, 0, 0, 0, 0, -639, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
## $ oil_products <dbl> 0, 391, -50, 0, -52, 0, 290, 0, 1, -10, 0, 0, 0, 601, 0, 0, 0, -15, 0, 866, 346, 332, 124, 7, 42, 26, 0, 50, 63, 0
## $ natural_gas <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
## $ nuclear <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
## $ hydro <dbl> 1142, 0, 0, 0, 0, 0, 1142, 0, 0, -1142, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
## $ geothermal_solar_etc <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
## $ biofuels_and_waste <dbl> 7579, 0, 0, 0, 0, 0, 7579, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1661, 0, 0, 5918, 1479, 0, 4438, 4438, 0, 0, 0, 0, 0, 0
## $ electricity <dbl> 0, 6, -93, 0, 0, 0, -87, 0, 0, 1144, 0, 0, 0, 0, 0, 0, 0, -26, -98, 933, 549, 2, 382, 289, 59, 23, 0, 10, 0, 0
## $ heat <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
## $ total <dbl> 8809, 1016, -143, 0, -52, 21, 9651, 0, 1, -9, 0, 0, 0, -39, 0, 0, -1661, -41, -98, 7805, 2462, 335, 4945, 4734, 101,...
And an enhanced print:
tibble::as_tibble(xdf)
## # A tibble: 30 x 12
## x coal crude_oil oil_products natural_gas nuclear hydro geothermal_solar_etc biofuels_and_waste electricity heat
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Production 88 0 0 0 0 1142 0 7579 0 0
## 2 Imports 0 618 391 0 0 0 0 0 6 0
## 3 Exports 0 0 -50 0 0 0 0 0 -93 0
## 4 International marine bunkers*** 0 0 0 0 0 0 0 0 0 0
## 5 International aviation bunkers*** 0 0 -52 0 0 0 0 0 0 0
## 6 Stock changes 0 21 0 0 0 0 0 0 0 0
## 7 TPES 88 639 290 0 0 1142 0 7579 -87 0
## 8 Transfers 0 0 0 0 0 0 0 0 0 0
## 9 Statistical differences 0 0 1 0 0 0 0 0 0 0
## 10 Electricity plants 0 0 -10 0 0 -1142 0 0 1144 0
## # ... with 20 more rows, and 1 more variables: total <dbl>
The tidyverse is a bit cleaner:
decode_cols <- function(.x) {
map_dbl(.x, ~{
openssl::base64_decode(.x) %>%
rawToChar() %>%
as.numeric()
})
}
html_node(x, xpath=".//table[contains(., 'International marine')]") %>%
html_table() %>%
janitor::clean_names() %>%
mutate_at(vars(-x), decode_cols)