Selecting rows from data based on unique conditions - r

dat <- structure(list(doy = c(274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294,
295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315,
316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336,
337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357,
358, 359, 360, 361, 362, 363, 364, 365),
no.plant = c(0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1),
cum.value = c(0, 1.34973713866726e-05, 0.000107973870218436, 0.000364365089792096, 0.000863464598244823, 0.00168576031682954,
0.00291120609231443, 0.00291120609231443, 0.0046189294134239, 0.00688687680728461, 0.00688687680728461,
0.00979139917551386, 0.0134067801825104, 0.0178047117788614, 0.0230537220148601, 0.0292185614529241,
0.0292185614529241, 0.0363595556987137, 0.0363595556987137, 0.0445319328097977, 0.0537851355741434,
0.0641621298405947, 0.0756987211882645, 0.0884228931969177, 0.102354181379628, 0.102354181379628, 0.117503097415405,
0.133870618627253, 0.151447757647197, 0.151447757647197, 0.170215226855778, 0.170215226855778,
0.190143211447851, 0.211191263836225, 0.233308330547831, 0.256432920794094, 0.280493423522773, 0.305408577012532,
0.331088091999851, 0.357433425992349, 0.384338702900249, 0.411691768499651, 0.439375368630229, 0.467268433537531,
0.495247448513112, 0.523187888081939, 0.550965688550059, 0.578458731861707, 0.605548312515632, 0.632120558828558,
0.658067780159839, 0.683289712849355, 0.707694639565394, 0.731200359474982, 0.753734990069534, 0.753734990069534,
0.753734990069534, 0.753734990069534, 0.775237585508182, 0.795658560857758, 0.814959916467899, 0.833115261761304,
0.850109642771837, 0.865939182653005, 0.865939182653005, 0.880610548937487, 0.894140265397845, 0.906553889802375,
0.917885081566473, 0.928174585188328, 0.93746915638157, 0.945820457966355, 0.95328395187962, 0.959917812174526,
0.965781881688334, 0.970936692282333, 0.975442565331355, 0.97935880560985, 0.97935880560985, 0.982742998037354,
0.985650413056059, 0.988133522855331, 0.990241627354782, 0.992020585910824, 0.993512648199701, 0.994756375705273,
0.995786643728671, 0.996634712840931, 0.997328358197721, 0.997892045086969, 0.998347139430071, 0.998347139430071)),
class = "data.frame", row.names = c(NA, -92L))
delta <- 0.04991736
I need to select those doy where the cum.value reaches 1*delta, 2*delta, 3*delta, 4*delta ....n*delta and also
include last doy which is 365 if n*delta does not reach the doy 365.
At the moment I am selecting n by trial and error which is by first creating a sequencnce of 1:n. For e.g 1:19:
qt.vec.19 <- 1:19 * delta
max(qt.vec.19) >= max(dat$cum.value)
FALSE
If I change qt.vec to 1:20
qt.vec.20 <- 1:20 * delta
max(qt.vec.20) >= max(dat$cum.value)
TRUE
This means that I can do 1*delta, 2*delta....19*delta and then also select the last doy.
sample.dat <- dat %>% dplyr::slice(unique(c(which.max(cum.value > qt.vec.19[1]),
which.max(cum.value > qt.vec.19[2]),
which.max(cum.value > qt.vec.19[3]),
which.max(cum.value > qt.vec.19[4]),
which.max(cum.value > qt.vec.19[5]),
which.max(cum.value > qt.vec.19[6]),
which.max(cum.value > qt.vec.19[7]),
which.max(cum.value > qt.vec.19[8]),
which.max(cum.value > qt.vec.19[9]),
which.max(cum.value > qt.vec.19[10]),
which.max(cum.value > qt.vec.19[11]),
which.max(cum.value > qt.vec.19[12]),
which.max(cum.value > qt.vec.19[13]),
which.max(cum.value > qt.vec.19[14]),
which.max(cum.value > qt.vec.19[15]),
which.max(cum.value > qt.vec.19[16]),
which.max(cum.value > qt.vec.19[17]),
which.max(cum.value > qt.vec.19[18]),
which.max(cum.value > qt.vec.19[19]))))
last.doy <- dat %>% dplyr::filter(doy == 365)
all.doy <- as.data.frame(rbind(sample.dat, last.doy))
doy no.plant cum.value
294 0 0.05378514
298 0 0.10235418
302 0 0.15144776
307 0 0.21119126
309 0 0.25643292
311 0 0.30540858
313 0 0.35743343
315 0 0.41169177
317 0 0.46726843
319 0 0.52318789
320 0 0.55096569
322 0 0.60554831
324 0 0.65806778
326 0 0.70769464
328 0 0.75373499
334 0 0.81495992
336 0 0.85010964
341 0 0.90655389
346 0 0.95328395
365 1 0.99834714
I was wondering if there's any better way to do this like selecting what my n value should be or avoid the long slice(unique(... part?

A matter of taste and context and you read a lot about "loops are frowned upon in R" - but they deliver results and are easy to read, and they are Base R - no extra packages needed or new syntax to learn:
options( scipen = 10, digits = 15 ) # display all digits
dat <- read.csv( "crop89.csv" ) # load your data from a file
delta <- 0.04991736 # selected threshold
n <- 1 # initiate multiplier variable
all.doy <- dat[ 1, ] # initiate receiving data.frame
for( i in 1:length( dat$doy ) ){ # loop through dat rows
if( dat[ i, "cum.value"] >= n * delta ){ # as soon as threshold is passed
all.doy[ n, ] <- dat[ i, ] # write the line to the target data.frame
n <- n + 1 # increment multiplier
}
}
all.doy[ n, ] <- dat[ i, ] # add the last row anyway
all.doy
> all.doy
doy no.plant cum.value
1 294 0 0.0537851355741434
25 298 0 0.1023541813796280
29 302 0 0.1514477576471970
34 307 0 0.2111912638362250
36 309 0 0.2564329207940940
38 311 0 0.3054085770125320
40 313 0 0.3574334259923490
42 315 0 0.4116917684996510
44 317 0 0.4672684335375310
46 319 0 0.5231878880819389
47 320 0 0.5509656885500590
49 322 0 0.6055483125156320
51 324 0 0.6580677801598390
53 326 0 0.7076946395653940
55 328 0 0.7537349900695340
61 334 0 0.8149599164678990
63 336 0 0.8501096427718370
68 341 0 0.9065538898023749
73 346 0 0.9532839518796200
92 365 1 0.9983471394300710

The main point is the cut function here:
library(data.table)
DT<-as.data.table(dat)
DT[,group:=as.numeric(cut(cum.value,c(-Inf,qt.vec.19,Inf),ordered_result = T))-1]
DT[,position:=frank(cum.value,ties.method = "first" ),by=group]
DT<-DT[position==1 & group>0]
DT[,position:=NULL]
DT[,group:=NULL]
if (max(DT$cum.value)!=max(dat$cum.value)) DT<-rbind(DT,dat[dat$doy==max(dat$doy),])

Related

modify function to find the maximum length for a consecutive run of a number per row

I would like to modify the answer to the question here or have a new solution to include another column which shows the second largest consecutive run of "0". My sample data and code is below, the function is operating on the month columns and the second largest run column is what I hope to add. I am working with a large dataset so the more efficient the better, any ideas are appreciated, thanks.
sample data
structure(list(ID = c(1, 2, 3, 4, 5, 6, 7, 8, 9), V1 = c("A",
"B", "A", "B", "B", "A", "A", "B", "B"), V2 = c(21, 233, 185,
85, 208, 112, 238, 66, 38), V3 = c(149, 250, 218, 104, 62, 19,
175, 168, 28), Jan = c(10, 20, 10, 12, 76, 28, 137, 162, 101),
Feb = c(20, 25, 15, 0, 89, 0, 152, 177, 119), March = c(0,
28, 20, 14, 108, 0, 165, 194, 132), April = c(0, 34, 25,
16, 125, 71, 181, 208, 149), May = c(25, 0, 30, 22, 135,
0, 191, 224, 169), June = c(29, 0, 35, 24, 145, 0, 205, 244,
187), July = c(34, 0, 40, 28, 163, 0, 217, 256, 207), August = c(37,
0, 45, 29, 173, 0, 228, 276, 221), Sep = c(0, 39, 50, 31,
193, 0, 239, 308, 236), Oct = c(0, 48, 55, 35, 210, 163,
252, 0, 247), Nov = c(48, 55, 60, 40, 221, 183, 272, 0, 264
), Dec = c(50, 60, 65, 45, 239, 195, 289, 0, 277), `Second largest run` = c(1,
NA, NA, NA, NA, 2, NA, NA, NA), result = c(2, 4, -Inf, 1,
-Inf, 5, -Inf, 3, -Inf)), row.names = c(NA, -9L), class = c("tbl_df",
"tbl", "data.frame"))
code
most_consecutive_val = function(x, val = 0) {
with(rle(x), max(lengths[values == val]))
}
test$result=apply(test[,-c(1:4,17)], MARGIN = 1, most_consecutive_val)
Rather than taking the max from the run length encoding (rle) function, we want to sort the output and then extract the desired index. We'll get NA's when we request an index that doesn't exist -- where there isn't a second run of zeroes in row 2 for example.
ordered_runs = function(x, val = 0, idx = 1) {
with(rle(x), sort(lengths[values == val], decreasing = TRUE))[idx]
}
test$result_1 <- apply(test[,-c(1:4,17:18)], MARGIN = 1, ordered_runs, idx = 1)
test$result_2 <- apply(test[,-c(1:4,17:18)], MARGIN = 1, ordered_runs, idx = 2)
Output is slightly different than your expected -- (1) using NA's rather than -Inf, and (2) in your first row, where I believe there is a tie with a second run of 2 zeroes.
> test[,c(1,17:20)]
# A tibble: 9 x 5
ID `Second largest run` result result_1 result_2
<dbl> <dbl> <dbl> <int> <int>
1 1 1 2 2 2
2 2 NA 4 4 NA
3 3 NA -Inf NA NA
4 4 NA 1 1 NA
5 5 NA -Inf NA NA
6 6 2 5 5 2
7 7 NA -Inf NA NA
8 8 NA 3 3 NA
9 9 NA -Inf NA NA
Here is an option using data.table which should be quite fast for OP's large dataset and also identifies all sequences of zeros simultaneously:
library(data.table)
setDT(DF)
cols <- c("Jan", "Feb", "March", "April", "May", "June", "July", "August", "Sep", "Oct", "Nov", "Dec")
#convert into a long format
m <- melt(DF, measure.vars=cols)[
#identify consecutive sequences of the same number and count
order(ID), c("rl", "rw") := .(rl <- rleid(ID, value), rowid(rl))][
#extract the last element where values = 0 (that is the length of sequences of zeros)
value == 0L, .(ID=ID[.N], len=rw[.N]), rl][
#sort in descending order for length of sequences
order(ID, -len)]
#pivot into wide format and perform a update join
wide <- dcast(m, ID ~ rowid(ID), value.var="len")
DF[wide, on=.(ID), (names(wide)) := mget(names(wide))]
output:
ID V1 V2 V3 Jan Feb March April May June July August Sep Oct Nov Dec 1 2
1: 1 A 21 149 10 20 0 0 25 29 34 37 0 0 48 50 2 2
2: 2 B 233 250 20 25 28 34 0 0 0 0 39 48 55 60 4 NA
3: 3 A 185 218 10 15 20 25 30 35 40 45 50 55 60 65 NA NA
4: 4 B 85 104 12 0 14 16 22 24 28 29 31 35 40 45 1 NA
5: 5 B 208 62 76 89 108 125 135 145 163 173 193 210 221 239 NA NA
6: 6 A 112 19 28 0 0 71 0 0 0 0 0 163 183 195 5 2
7: 7 A 238 175 137 152 165 181 191 205 217 228 239 252 272 289 NA NA
8: 8 B 66 168 162 177 194 208 224 244 256 276 308 0 0 0 3 NA
9: 9 B 38 28 101 119 132 149 169 187 207 221 236 247 264 277 NA NA
data:
DF <- structure(list(ID = c(1, 2, 3, 4, 5, 6, 7, 8, 9), V1 = c("A",
"B", "A", "B", "B", "A", "A", "B", "B"), V2 = c(21, 233, 185,
85, 208, 112, 238, 66, 38), V3 = c(149, 250, 218, 104, 62, 19,
175, 168, 28), Jan = c(10, 20, 10, 12, 76, 28, 137, 162, 101),
Feb = c(20, 25, 15, 0, 89, 0, 152, 177, 119), March = c(0,
28, 20, 14, 108, 0, 165, 194, 132), April = c(0, 34, 25,
16, 125, 71, 181, 208, 149), May = c(25, 0, 30, 22, 135,
0, 191, 224, 169), June = c(29, 0, 35, 24, 145, 0, 205, 244,
187), July = c(34, 0, 40, 28, 163, 0, 217, 256, 207), August = c(37,
0, 45, 29, 173, 0, 228, 276, 221), Sep = c(0, 39, 50, 31,
193, 0, 239, 308, 236), Oct = c(0, 48, 55, 35, 210, 163,
252, 0, 247), Nov = c(48, 55, 60, 40, 221, 183, 272, 0, 264
), Dec = c(50, 60, 65, 45, 239, 195, 289, 0, 277), `1` = c(2L,
4L, NA, 1L, NA, 5L, NA, 3L, NA), `2` = c(2L, NA, NA, NA,
NA, 2L, NA, NA, NA)), row.names = c(NA, -9L), class = "data.frame")

Converting data frame into a list of lists in R

I would like to turn data.frame like this one:
dat = data.frame (
ConditionA = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
ConditionB = c(1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5),
X = c(460, 382, 468, 618, 421, 518, 655, 656, 621, 552, 750, 725, 337, 328, 342, 549, 569, 523, 469, 429),
Y = c(437, 305, 498, 620, 381, 543, 214, 181, 183, 387, 439, 351, 327, 268, 276, 178, 375, 393, 312, 302)
)
into a list of lists like this (or similar):
lst = list(
list(
c(460, 382, 468, 618),
c(437, 305, 498, 620)
),
list(
c(421, 518, 655, 656, 621),
c(381, 543, 214, 181, 183)
),
list(
c(552, 750, 725),
c(387, 439, 351)
),
list(
c(337, 328, 342, 549),
c(327, 268, 276, 178)
),
list(
c(569, 523, 469, 429),
c(375, 393, 312, 302)
)
)
> lst
[[1]]
[[1]][[1]]
[1] 460 382 468 618
[[1]][[2]]
[1] 437 305 498 620
[[2]]
[[2]][[1]]
[1] 421 518 655 656 621
[[2]][[2]]
[1] 381 543 214 181 183
[[3]]
[[3]][[1]]
[1] 552 750 725
[[3]][[2]]
[1] 387 439 351
. . .
What would be the most efficient way to make such a conversion?
We can do a split based on the 1st and 2nd columns, use drop=TRUE for removing the combinations with 0 elements and convert to list
lapply(split(dat[-(1:2)], dat[1:2], drop = TRUE), as.list)
Or using tidyverse
library(tidyverse)
dat %>%
group_by(ConditionA, ConditionA.1) %>%
nest %>%
mutate(data = map(data, as.list)) %>%
pull(data)
May be this using data.table
Data:
dat = data.frame (
ConditionA = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
ConditionB = c(1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5),
X = c(460, 382, 468, 618, 421, 518, 655, 656, 621, 552, 750, 725, 337, 328, 342, 549, 569, 523, 469, 429),
Y = c(437, 305, 498, 620, 381, 543, 214, 181, 183, 387, 439, 351, 327, 268, 276, 178, 375, 393, 312, 302)
)
Code:
library('data.table')
setDT(dat)
dat[, list(list(as.list(.SD))),by = .(ConditionA, ConditionB)][, V1]
or this
dat[, list(list(list(.SD))),by = .(ConditionA, ConditionB)][, V1]
c(by(dat[3:4],dat[1:2],as.list))
[[1]]
[[1]]$X
[1] 460 382 468 618
[[1]]$Y
[1] 437 305 498 620
[[2]]
[[2]]$X
[1] 421 518 655 656 621
[[2]]$Y
[1] 381 543 214 181 183
[[3]]
[[3]]$X
[1] 552 750 725
[[3]]$Y
[1] 387 439 351
. . . .

identifying rows following specifc row based on conditions

I have a dataframe/tibble containing yearly observations of several countries. In years in which a specific event happens the variable event gets the value 1.
I am now trying to specify a new column event.10yrs which gets the value 1 for the 9 years following the end of an event (= last year of the event if event lasts several years). In years in which a new event occurs and which are not the last year of the new event, the new column event.10yrs gets the value 0.
Below the data for one single country. Column event.10yrs is the desired output.
df <-structure(list(year = c(1970, 1971, 1972, 1973, 1974, 1975, 1976,
1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987,
1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998,
1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009,
2010, 2011, 2012, 2013, 2014, 2015), ccode = c(516, 516, 516,
516, 516, 516, 516, 516, 516, 516, 516, 516, 516, 516, 516, 516,
516, 516, 516, 516, 516, 516, 516, 516, 516, 516, 516, 516, 516,
516, 516, 516, 516, 516, 516, 516, 516, 516, 516, 516, 516, 516,
516, 516, 516, 516), event = c(0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1,
1, 1, 1, 0, 0, 0, 0, 0, 0, 0, NA, NA, NA, NA, NA), event.last.y = c(0,
0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, NA,
NA, NA, NA, NA), event.10yrs = c(NA, 0, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, NA, NA, NA)), row.names = c(NA,
-46L), vars = "ccode", drop = TRUE, class = c("grouped_df", "tbl_df",
"tbl", "data.frame"), indices = list(0:45), group_sizes = 46L, biggest_group_size = 46L, labels = structure(list(
ccode = 516), row.names = c(NA, -1L), vars = "ccode", drop = TRUE, class = "data.frame", .Names = "ccode"), .Names = c("year",
"ccode", "event", "event.last.y", "event.10yrs"))
My attempt so far using the dplyr package:
df <- df %>%
mutate(event.10yrs=case_when(event!=1 & year-9 < year[event.last.y==1] ~ 1,
TRUE ~ 0))
This, however, renders the following warning:
Warning message:
In year < year[rs.war.last.y == 1] :
longer object length is not a multiple of shorter object length
Grateful for any hint.
Maybe just a nested ifelse (or dplyr::if_else)
require(dplyr)
df %>% mutate(ev_10 = if_else(event == 0, 1,
if_else(event.last.y ==1, 1, 0),
0))
edit
this post helped me here : Find the index position of the first non-NA value in an R vector?
But we want to replace not only for the first occurrence of 'x' ...
so I made a little workaround with a helper column
index_1 <- unlist(lapply(which(df$event.last.y ==1 ),
function(x) seq(x, length.out=9)))
# this makes a vector with all the index of the last 9 positions
# after the last value == 1
df$last_code <- df$event.last.y #just to duplicate your column
df$last_code[index_1] <- 1 #replacing the indices with '1'
Now we can use the simple nested conditional statement as before
df <- df %>% mutate(ev_10 = if_else(event == 0 & last_code==1, 1,
#added the condition that last_code needs to be '1'
if_else(event.last.y ==1, 1, 0),
0))
head(df[c(2:13, 31:40),], 20) #printing only example rows here
# A tibble: 20 x 7
# Groups: ccode [1]
year ccode event event.last.y event.10yrs last_code ev_10
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1971 516 0 0 0 0 0
2 1972 516 1.00 1.00 1.00 1.00 1.00
3 1973 516 0 0 1.00 1.00 1.00
4 1974 516 0 0 1.00 1.00 1.00
5 1975 516 0 0 1.00 1.00 1.00
6 1976 516 0 0 1.00 1.00 1.00
7 1977 516 0 0 1.00 1.00 1.00
8 1978 516 0 0 1.00 1.00 1.00
9 1979 516 0 0 1.00 1.00 1.00
10 1980 516 0 0 1.00 1.00 1.00
11 1981 516 0 0 1.00 0 0
12 1982 516 0 0 0 0 0
...
13 2000 516 1.00 0 0 1.00 0
14 2001 516 1.00 0 0 1.00 0
15 2002 516 1.00 0 0 1.00 0
16 2003 516 1.00 1.00 1.00 1.00 1.00
17 2004 516 0 0 1.00 1.00 1.00
18 2005 516 0 0 1.00 1.00 1.00
19 2006 516 0 0 1.00 1.00 1.00
20 2007 516 0 0 1.00 1.00 1.00

Why do I get strings instead of integers when I scrape an HTML table in R?

I am having a difficult time scraping data tables from [iea.org][1]. I use the following code :
library("rvest")
url <- "http://www.iea.org/statistics/statisticssearch/report/?country=ZAMBIA&product=balances&year=2013"
energy <- url %>%
html() %>%
html_nodes(xpath='//*[#id="stats-container"]/div[2]/table') %>%
html_table()
head(energy)
Instead of having numbers in the cells of the table, the resulting table in R only contains letters.
Thanks for the help in advance.
Until proven otherwise (or the site owners read up on how to use robots.txt and find a real lawyer to craft more explicit & restrictive T&Cs)…
I'll start with a non-"tidyverse" solution for this answer:
library(rvest)
x <- read_html("http://www.iea.org/statistics/statisticssearch/report/?country=ZAMBIA&product=balances&year=2013")
# find the table; note that a less "structural" selector will generally make
# scraping code a bit less fragile.
xdf <- html_node(x, xpath=".//table[contains(., 'International marine')]")
xdf <- html_table(xdf)
# clean up column names
xdf <- janitor::clean_names(xdf)
Now, the columns are encoded as noted by the OP and in the question comment discussions:
xdf$oil_products
## [1] "MA==" "Mzkx" "LTUw" "MA==" "LTUy" "MA==" "Mjkw" "MA==" "MQ==" "LTEw"
## [11] "MA==" "MA==" "MA==" "NjAx" "MA==" "MA==" "MA==" "LTE1" "MA==" "ODY2"
## [21] "MzQ2" "MzMy" "MTI0" "Nw==" "NDI=" "MjY=" "MA==" "NTA=" "NjM=" "MA=="
The == gives it away as base64 encoded (though the URL mentioned in the comments further confirms this). They encoded each character so we need to convert them from b64 first then convert to numeric:
# decode each column
lapply(xdf[2:12], function(.x) {
as.numeric(
sapply(.x, function(.y) {
rawToChar(openssl::base64_decode(.y))
}, USE.NAMES=FALSE)
)
}) -> xdf[2:12]
A quick str() alternative view:
tibble::glimpse(xdf)
## Observations: 30
## Variables: 12
## $ x <chr> "Production", "Imports", "Exports", "International marine bunkers***", "International aviation bunkers***", "Stock c...
## $ coal <dbl> 88, 0, 0, 0, 0, 0, 88, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 88, 88, 0, 0, 0, 0, 0, 0, 0, 0, 0
## $ crude_oil <dbl> 0, 618, 0, 0, 0, 21, 639, 0, 0, 0, 0, 0, 0, -639, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
## $ oil_products <dbl> 0, 391, -50, 0, -52, 0, 290, 0, 1, -10, 0, 0, 0, 601, 0, 0, 0, -15, 0, 866, 346, 332, 124, 7, 42, 26, 0, 50, 63, 0
## $ natural_gas <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
## $ nuclear <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
## $ hydro <dbl> 1142, 0, 0, 0, 0, 0, 1142, 0, 0, -1142, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
## $ geothermal_solar_etc <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
## $ biofuels_and_waste <dbl> 7579, 0, 0, 0, 0, 0, 7579, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1661, 0, 0, 5918, 1479, 0, 4438, 4438, 0, 0, 0, 0, 0, 0
## $ electricity <dbl> 0, 6, -93, 0, 0, 0, -87, 0, 0, 1144, 0, 0, 0, 0, 0, 0, 0, -26, -98, 933, 549, 2, 382, 289, 59, 23, 0, 10, 0, 0
## $ heat <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
## $ total <dbl> 8809, 1016, -143, 0, -52, 21, 9651, 0, 1, -9, 0, 0, 0, -39, 0, 0, -1661, -41, -98, 7805, 2462, 335, 4945, 4734, 101,...
And an enhanced print:
tibble::as_tibble(xdf)
## # A tibble: 30 x 12
## x coal crude_oil oil_products natural_gas nuclear hydro geothermal_solar_etc biofuels_and_waste electricity heat
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Production 88 0 0 0 0 1142 0 7579 0 0
## 2 Imports 0 618 391 0 0 0 0 0 6 0
## 3 Exports 0 0 -50 0 0 0 0 0 -93 0
## 4 International marine bunkers*** 0 0 0 0 0 0 0 0 0 0
## 5 International aviation bunkers*** 0 0 -52 0 0 0 0 0 0 0
## 6 Stock changes 0 21 0 0 0 0 0 0 0 0
## 7 TPES 88 639 290 0 0 1142 0 7579 -87 0
## 8 Transfers 0 0 0 0 0 0 0 0 0 0
## 9 Statistical differences 0 0 1 0 0 0 0 0 0 0
## 10 Electricity plants 0 0 -10 0 0 -1142 0 0 1144 0
## # ... with 20 more rows, and 1 more variables: total <dbl>
The tidyverse is a bit cleaner:
decode_cols <- function(.x) {
map_dbl(.x, ~{
openssl::base64_decode(.x) %>%
rawToChar() %>%
as.numeric()
})
}
html_node(x, xpath=".//table[contains(., 'International marine')]") %>%
html_table() %>%
janitor::clean_names() %>%
mutate_at(vars(-x), decode_cols)

remove every 4 rows between two numbers

for example, I have a data frame with one column containing numbers. these is how it looks.
head(c1)
c
1 300
2 302
3 304
4 306
5 308
6 310
Here is the sample data frame.
c1 <- structure(list(c = c(300, 302, 304, 306, 308, 310, 312, 314,
316, 318, 320, 322, 324, 326, 328, 330, 332, 334, 336, 338, 340,
342, 344, 346, 348, 350, 352, 354, 356, 358, 360, 362, 364, 366,
368, 370, 372, 374, 376, 378, 380, 382, 384, 386, 388, 390, 392,
394, 396, 398, 400)), .Names = "c", row.names = c(NA, -51L), class = "data.frame")
I want to delete the rows between 300 to 310 and 310 to 320 and so on..
I want to have a dataframe like these
300
310
320
330
340
350
.
.
.
400
Any ideas how to do these, I found how to remove every nth row, but not every four rows between two numbers
You can make use of the modulo operator %%. If you want the result as an atomic vector, you can run
c1$c[c1$c %% 10 == 0]
or if you want it as a data.frame with 1 column, you can use
c1[c1$c %% 10 == 0, , drop=FALSE]

Resources