Clean a dataset that includes some character in each row - r

I have a dataset like this:
structure(list(`Frequency
Percent` = c("car", "window", "ball",
"ups"), AI = c("2\n0.00", "3\n0.00", "1\n0.00", "2\n0.00"), BLK = c("0\n0.00",
"218\n0.29", "48\n0.06", "0\n0.00"), HIANIC = c("1\n0.00", "8\n0.01",
"4\n0.01", "0\n0.00"), NATRICAN = c("9\n0.01", "7\n0.01", "8\n0.01",
"0\n0.00"), UNK = c("15\n0.02", "83\n0.11", "36\n0.05", "0\n0.00"
), yy = c("111\n0.15", "897\n1.20", "756\n1.02", "1\n0.00")), class = "data.frame", row.names = c(NA,
-4L))
How can I split each row by "" and remove n to make two new columns. For instance, car and AI cell (“2\n0.00”), I will have 2 and 0.00 in two different columns.

One way is to use tidyr::separate in a for loop:
for(i in names(df[,-1])){
df <- tidyr::separate(df, i, sep = "\n", into = c(i, paste0(i,"_val")))
}
Output:
# Frequency\n Percent AI AI_val BLK BLK_val HIANIC HIANIC_val NATRICAN NATRICAN_val UNK UNK_val yy yy_val
# 1 car 2 0.00 0 0.00 1 0.00 9 0.01 15 0.02 111 0.15
# 2 window 3 0.00 218 0.29 8 0.01 7 0.01 83 0.11 897 1.20
# 3 ball 1 0.00 48 0.06 4 0.01 8 0.01 36 0.05 756 1.02
# 4 ups 2 0.00 0 0.00 0 0.00 0 0.00 0 0.00 1 0.00

Using tidyr::separate_rows and tidyr::pivot_wider you could do:
library(tidyr)
library(dplyr)
dat |>
mutate(unit = c("n\npct")) |>
separate_rows(-1, sep = "\n") |>
pivot_wider(names_from = "unit", values_from = -1)
#> # A tibble: 4 × 15
#> Frequency\n…¹ AI_n AI_pct BLK_n BLK_pct HIANI…² HIANI…³ NATRI…⁴ NATRI…⁵ UNK_n
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 car 2 0.00 0 0.00 1 0.00 9 0.01 15
#> 2 window 3 0.00 218 0.29 8 0.01 7 0.01 83
#> 3 ball 1 0.00 48 0.06 4 0.01 8 0.01 36
#> 4 ups 2 0.00 0 0.00 0 0.00 0 0.00 0
#> # … with 5 more variables: UNK_pct <chr>, yy_n <chr>, yy_pct <chr>,
#> # unit_n <chr>, unit_pct <chr>, and abbreviated variable names
#> # ¹​`Frequency\n Percent`, ²​HIANIC_n, ³​HIANIC_pct, ⁴​NATRICAN_n,
#> # ⁵​NATRICAN_pct

A base one liner:
do.call(data.frame, lapply(DF, \(x) do.call(rbind, strsplit(x, "\n"))))
# Frequency.Percent AI.1 AI.2 BLK.1 BLK.2 HIANIC.1 HIANIC.2 NATRICAN.1
#1 car 2 0.00 0 0.00 1 0.00 9
#2 window 3 0.00 218 0.29 8 0.01 7
#3 ball 1 0.00 48 0.06 4 0.01 8
#4 ups 2 0.00 0 0.00 0 0.00 0
# NATRICAN.2 UNK.1 UNK.2 yy.1 yy.2
#1 0.01 15 0.02 111 0.15
#2 0.01 83 0.11 897 1.20
#3 0.01 36 0.05 756 1.02
#4 0.00 0 0.00 1 0.00
Or add also a type convert.
type.convert(do.call(data.frame, lapply(DF, \(x) do.call(rbind, strsplit(x, "\n")))), as.is=TRUE)

There is also a base R solution:
dat = structure(list(`Frequency
Percent` = c("car", "window", "ball",
"ups"), AI = c("2\n0.00", "3\n0.00", "1\n0.00", "2\n0.00"), BLK = c("0\n0.00",
"218\n0.29", "48\n0.06", "0\n0.00"), HIANIC = c("1\n0.00", "8\n0.01",
"4\n0.01", "0\n0.00"), NATRICAN = c("9\n0.01", "7\n0.01", "8\n0.01",
"0\n0.00"), UNK = c("15\n0.02", "83\n0.11", "36\n0.05", "0\n0.00"
), yy = c("111\n0.15", "897\n1.20", "756\n1.02", "1\n0.00")), class = "data.frame", row.names = c(NA,
-4L))
transformed = data.frame(Freq_pc = dat[,1])
for(col in seq(2, ncol(dat))){
transformed = cbind(transformed, t(matrix(unlist(strsplit(dat[,col], "\n")), nrow=2)))
names(transformed)[c(2*(col-1), 2*(col-1)+1)] = c(paste0(names(dat)[col], "_n"), paste0(names(dat)[col], "_pc"))
}
That results in:
Freq_pc AI_n AI_pc BLK_n BLK_pc HIANIC_n HIANIC_pc NATRICAN_n NATRICAN_pc UNK_n UNK_pc yy_n yy_pc
1 car 2 0.00 0 0.00 1 0.00 9 0.01 15 0.02 111 0.15
2 window 3 0.00 218 0.29 8 0.01 7 0.01 83 0.11 897 1.20
3 ball 1 0.00 48 0.06 4 0.01 8 0.01 36 0.05 756 1.02
4 ups 2 0.00 0 0.00 0 0.00 0 0.00 0 0.00 1 0.00

We may use cSplit
library(splitstackshape)
cSplit(df1, 2:ncol(df1), sep = "\n")
-output
Frequency\nPercent AI_1 AI_2 BLK_1 BLK_2 HIANIC_1 HIANIC_2 NATRICAN_1 NATRICAN_2 UNK_1 UNK_2 yy_1 yy_2
1: car 2 0 0 0.00 1 0.00 9 0.01 15 0.02 111 0.15
2: window 3 0 218 0.29 8 0.01 7 0.01 83 0.11 897 1.20
3: ball 1 0 48 0.06 4 0.01 8 0.01 36 0.05 756 1.02
4: ups 2 0 0 0.00 0 0.00 0 0.00 0 0.00 1 0.00

Related

Data manipulation in R - creating a new data frame from an existing one

I have the below data frame in R
id <- c(112, 112,112)
case <- c("up","down","worse")
c1 <- c(0.12,0.24,0.09)
c2 <- c(0.11,0.14,0.06)
c3 <- c(0.15,0.34,0.04)
c4 <- c(0.16,0.44,0.03)
c5 <- c(0.17,0.94,0.01)
df3 <- data.frame(id,case,c1,c2,c3,c4,c5)
I am trying to create a new data frame with column names as id, case, value_in_period, and period
For each id, the period will have values from 0-9. The value_in_period column will take values of c1,c2,c3,c4, and c5 from periods 0-5 and the rest of the values will be 0. A sample of desired output is attached below
I tried using inner join and transpose, but it doesn't seem to work. Any help will be appreciated.
We reshape to 'long' format and then use complete to expand the data
library(dplyr)
library(tidyr)
out <- df3 %>%
pivot_longer(cols = c1:c5, names_to = NULL,
values_to = 'value_in_period') %>%
group_by(id, case = factor(case, levels = unique(case))) %>%
mutate(period = row_number()-1) %>%
complete(period = 0:9, fill = list(value_in_period = 0)) %>%
ungroup %>%
relocate(period, .after = 'value_in_period')
-output
> as.data.frame(out)
id case value_in_period period
1 112 up 0.12 0
2 112 up 0.11 1
3 112 up 0.15 2
4 112 up 0.16 3
5 112 up 0.17 4
6 112 up 0.00 5
7 112 up 0.00 6
8 112 up 0.00 7
9 112 up 0.00 8
10 112 up 0.00 9
11 112 down 0.24 0
12 112 down 0.14 1
13 112 down 0.34 2
14 112 down 0.44 3
15 112 down 0.94 4
16 112 down 0.00 5
17 112 down 0.00 6
18 112 down 0.00 7
19 112 down 0.00 8
20 112 down 0.00 9
21 112 worse 0.09 0
22 112 worse 0.06 1
23 112 worse 0.04 2
24 112 worse 0.03 3
25 112 worse 0.01 4
26 112 worse 0.00 5
27 112 worse 0.00 6
28 112 worse 0.00 7
29 112 worse 0.00 8
30 112 worse 0.00 9

How I can calculate the max for these data

Here is a part of my data
dat<-read.table (text="
Flower A1 A2 A3 TM MN B1 B2 B3
F1 12 9 11 12 0.56 19 1 12
F2 11 16 13 13 0.65 22 4 12
F3 10 12 14 11 0.44 29 9 12
", header=TRUE)
I want to calculate Max for column MN. For example, for value 0.44, the max is max(0.44,1-0.44)= 0.56.
I struggle to get it with a data frame.
Here is the outcome of the interest:
Flower A TM B MN Max
F1 12 12 19 0.56 0.56
F2 11 13 22 0.65 0.65
F3 10 11 29 0.44 0.56
F1 9 12 1 0.56 0.56
F2 16 13 4 0.65 0.65
F3 12 11 9 0.44 0.56
F1 11 12 12 0.56 0.56
F2 13 13 12 0.65 0.65
F3 14 11 12 0.44 0.56
Try the code below
transform(
reshape(
setNames(dat, gsub("(\\d+)", ".\\1", names(dat))),
direction = "long",
idvar = c("Flower", "TM", "MN"),
varying = -c(1, 5, 6)
),
Max = pmax(MN, 1 - MN)
)
which gives
Flower TM MN time A B Max
F1.12.0.56.1 F1 12 0.56 1 12 19 0.56
F2.13.0.65.1 F2 13 0.65 1 11 22 0.65
F3.11.0.44.1 F3 11 0.44 1 10 29 0.56
F1.12.0.56.2 F1 12 0.56 2 9 1 0.56
F2.13.0.65.2 F2 13 0.65 2 16 4 0.65
F3.11.0.44.2 F3 11 0.44 2 12 9 0.56
F1.12.0.56.3 F1 12 0.56 3 11 12 0.56
F2.13.0.65.3 F2 13 0.65 3 13 12 0.65
F3.11.0.44.3 F3 11 0.44 3 14 12 0.56
Using reshape and ave.
reshape(dat, varying=list(2:4, 7:9), direction='long', idvar='Flower') |>
transform(Max=ave(MN, Flower, FUN=max))
# Flower TM MN time A1 B1 Max
# F1.1 F1 12 0.56 1 12 19 0.56
# F2.1 F2 13 0.65 1 11 22 0.65
# F3.1 F3 11 0.44 1 10 29 0.44
# F1.2 F1 12 0.56 2 9 1 0.56
# F2.2 F2 13 0.65 2 16 4 0.65
# F3.2 F3 11 0.44 2 12 9 0.44
# F1.3 F1 12 0.56 3 11 12 0.56
# F2.3 F2 13 0.65 3 13 12 0.65
# F3.3 F3 11 0.44 3 14 12 0.44
Note: R >= 4.1 used.

R group values in column based on intervals and average the results for each interval

I have two tables
table 1:
Dates_only <- data.frame(ID=c('1118','1118','1118','1118','1118',
'1118','1118','1118','1119','1119',
'1119','1119','1119','1119','1119',
'1119','13PP','13PP','13PP','13PP',
'13PP','13PP','13PP','13PP'),
Quart_y=c('2017Q3','2017Q4','2018Q1','2018Q2',
'2018Q3','2018Q4','2019Q1','2019Q2',
'2017Q3','2017Q4','2018Q1','2018Q2',
'2018Q3','2018Q4','2019Q1','2019Q2',
'2017Q3','2017Q4','2018Q1','2018Q2',
'2018Q3','2018Q4','2019Q1','2019Q2'),
Quart=c(0.25,0.50,0.75,1.00,1.25,1.50,1.75,2.00,
0.25,0.50,0.75,1.00,1.25,1.50,1.75,2.00,
0.25,0.50,0.75,1.00,1.25,1.50,1.75,2.00))
and table 2:
Values <- data.frame(ID=c('1118','1119','13PP','1118','1119','13PP',
'1118','1119','13PP','1118','1119','13PP',
'1118','1119','13PP','1118','1119','13PP',
'1118','1119','13PP','1118','1119','13PP',
'1118','1119','13PP','1118','1119','13PP'),
Day=c(0,0,0,0.14,0.13,0.13,0.2,0.23,0.24,0.27,0.28,
0.32,0.32,0.32,0.44,0.47,0.49,0.49,0.59,0.64,
0.61,0.72,0.71,0.73,0.95,0.86,0.78,1.1,0.93,1.15),
Value=c(7.6,6.2,6.8,7.1,6.2,5.9,6.8,5.8,4.6,6.5,5.4,
4.2,6.3,4.8,4,6,4.3,3.8,5.9,4,3.6,5.6,3.8,
3.4,5.4,3.2,3,5,2.9,2.9))
What I am trying to do is to find a way to change the values in Values$Day according to Dates_only$Quart.
Specifically, Dates_only$Quart represent quantified quarters (2017Q3 - 0.25, 2017Q4-0.50,...,2018Q4-1.50) etc. While, Values$Day represents quantified days.
I want to change the Values$Day classified by quarter instead, for example:
for 0<=Values$Day<=0.25 the Values$Day==0.25, for 0.25<Values$Day<=0.50 the Values$Day==0.50 etc.
What I have tried to do is to use this method bellow but it comes up with an error message:
unique_quarters <- unique(Dates_only$Quart)
unique_quarters <- append(unique_quarters, 0, after=0)
df3 <- transform(Dates_only,
Transf_Day=Values$Quart[findInterval(Values$Day, unique_quarters)])
The issue I guess is the problem that findInterval(Values$Day, unique_quarters) returns
1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 4 4 4 5 4 5
While Values$Quart has values
0.25 0.50 0.75 1.00 1.25 1.50 1.75 2.00
try this:
library(tidyverse)
as.tbl(Values) %>%
mutate(Int=cut(Day, seq(0,3,0.25), include.lowest = T)) %>%
mutate(Int2=factor(Int, labels = seq(0.25,1.25,0.25)))
# A tibble: 30 x 5
ID Day Value Int Int2
<fctr> <dbl> <dbl> <fctr> <fctr>
1 1118 0.00 7.6 [0,0.25] 0.25
2 1119 0.00 6.2 [0,0.25] 0.25
3 13PP 0.00 6.8 [0,0.25] 0.25
4 1118 0.14 7.1 [0,0.25] 0.25
5 1119 0.13 6.2 [0,0.25] 0.25
6 13PP 0.13 5.9 [0,0.25] 0.25
7 1118 0.20 6.8 [0,0.25] 0.25
8 1119 0.23 5.8 [0,0.25] 0.25
9 13PP 0.24 4.6 [0,0.25] 0.25
10 1118 0.27 6.5 (0.25,0.5] 0.5
# ... with 20 more rows

Finding Row number of Consecutive decreasing values in data

I need to detect from data the first element of the first sequence of length 5 of consecutively decreasing numbers. There is a similar post here but when I applied to my data it failed.
set.seed(201)
az <- c(sort(runif(10,0,0.9),decreasing = T),sort(runif(3,-0.3,0),decreasing = T),sort(runif(3,-0.3,0),decreasing = F),sort(runif(4,-0.3,0),decreasing = T),sort(runif(4,-0.3,0),decreasing = F),sort(runif(6,-0.3,0),decreasing = T))
tz <- seq(1,length(az))
df <- data.frame(tz,az=round(az,2))
In the figure above it would be somewhere around tz = 25.
The post says that this function need to improve and so far I cannot get my desired result!
getFirstBefore<-function(x,len){
r<-rle(sign(diff(x)))
n<-which(r$lengths>=len & r$values<0)
if(length(n)==0)
return(-1)
1+sum(r$lengths[seq_len(n[1]-1)])
}
df1 <- df%>%
mutate(cns_tz=getFirstBefore(az,5))
tz az cns_tz
#1 1 0.56 4
#2 2 0.55 4
#3 3 0.33 4
#4 4 0.33 4
#5 5 0.26 4
#6 6 0.15 4
#7 7 0.12 4
#8 8 0.09 4
#9 9 0.04 4
#10 10 0.04 4
#11 11 -0.10 4
#12 12 -0.12 4
#13 13 -0.16 4
#14 14 -0.16 4
#15 15 -0.14 4
#16 16 -0.14 4
#17 17 -0.13 4
#18 18 -0.15 4
#19 19 -0.22 4
#20 20 -0.30 4
#21 21 -0.12 4
#22 22 -0.12 4
#23 23 -0.11 4
#24 24 -0.07 4
#25 25 -0.05 4
#26 26 -0.09 4
#27 27 -0.10 4
#28 28 -0.15 4
#29 29 -0.17 4
#30 30 -0.22 4
I would sort every 5 consecutive values, and see if that matches with the unsorted data. Then find the first occurance of such a match:
set.seed(123)
test <- rnorm(100)
decr <- sapply(seq_along(test),function(x){all(sort(test[x:(x+5)],decreasing = T) == test[x:(x+5)])})
firstdecr <- min(which(decr)):(min(which(decr))+5)
plot(test)
lines(firstdecr, test[firstdecr], col="red")
Only flaw I can see if there are equal values in a 5 value epoch, but you could also test for that.
We can use rleid from data.table
library(data.table)
n <- 5
v1 <- setDT(df)[sign(az)<0, .I[which(.N==n)] , rleid(c(1, sign(diff(az))))]$V1[1L]
v1
#[1] 26
df[, cnz_tz := v1]
Or another option is shift with Reduce
setDT(df)[, cnz_tz := .I[Reduce(`&`, shift((az - shift(az, fill=az[1])) < 0,
0:4, type = "lead", fill=FALSE)) & sign(az) < 0][1]]
We can also use rleid in dplyr
library(dplyr)
v1 <- df %>%
group_by(rl= rleid(c(1, sign(diff(az))))) %>%
mutate(rn = sign(az) < 0 & n()==5) %>%
.$rn %>%
which() %>%
head(., 1)
v1
#[1] 26
df %>%
mutate(cnz_tz = v1)
My naive pure-dplyr approach would be to compute a rolling sum of signs for differences and identify rows where the next five diffs have a negative sign. I say "naive" because this solution does not use rle for detecting streaks.
library(dplyr)
diff_details <- df %>%
mutate(diff = c(0, diff(az)),
diff_sign = sign(diff),
rolling_signs = cumsum(diff_sign),
next_five = lead(rolling_signs, 5) - rolling_signs)
diff_details
#> tz az diff diff_sign rolling_signs next_five
#> 1 1 0.56 0.00 0 0 -4
#> 2 2 0.55 -0.01 -1 -1 -4
#> 3 3 0.33 -0.22 -1 -2 -4
#> 4 4 0.33 0.00 0 -2 -5
#> 5 5 0.26 -0.07 -1 -3 -4
#> 6 6 0.15 -0.11 -1 -4 -4
#> 7 7 0.12 -0.03 -1 -5 -4
#> 8 8 0.09 -0.03 -1 -6 -4
#> 9 9 0.04 -0.05 -1 -7 -3
#> 10 10 0.04 0.00 0 -7 -2
#> 11 11 -0.10 -0.14 -1 -8 -1
#> 12 12 -0.12 -0.02 -1 -9 1
#> 13 13 -0.16 -0.04 -1 -10 1
#> 14 14 -0.16 0.00 0 -10 0
#> 15 15 -0.14 0.02 1 -9 -2
#> 16 16 -0.14 0.00 0 -9 -1
#> 17 17 -0.13 0.01 1 -8 -2
#> 18 18 -0.15 -0.02 -1 -9 0
#> 19 19 -0.22 -0.07 -1 -10 2
#> 20 20 -0.30 -0.08 -1 -11 4
#> 21 21 -0.12 0.18 1 -10 2
#> 22 22 -0.12 0.00 0 -10 1
#> 23 23 -0.11 0.01 1 -9 -1
#> 24 24 -0.07 0.04 1 -8 -3
#> 25 25 -0.05 0.02 1 -7 -5
#> 26 26 -0.09 -0.04 -1 -8 NA
#> 27 27 -0.10 -0.01 -1 -9 NA
#> 28 28 -0.15 -0.05 -1 -10 NA
#> 29 29 -0.17 -0.02 -1 -11 NA
#> 30 30 -0.22 -0.05 -1 -12 NA
Instead of identifying streaks in a sequence, we look at a cumulative sum of the signs of the differences in rolling_signs. next_five computes the difference in rolling_signs over the next five rows. When next_five is -5, then the next five rows have decreasing changes.
(diff_details$next_five %in% -5) %>% which %>% max
#> [1] 25
Each of the steps/columns could be abstracted into a function, like:
cum_diff_signs <- function(xs, window) {
rolling_signs <- cumsum(sign(c(0, diff(xs))))
next_diffs <- dplyr::lead(rolling_signs, window) - rolling_signs
next_diffs
}
cum_diff_signs(df$az, 5)
#> [1] -4 -4 -4 -5 -4 -4 -4 -4 -3 -2 -1 1 1 0 -2 -1 -2 0 2 4 2 1 -1
#> [24] -3 -5 NA NA NA NA NA
(cum_diff_signs(df$az, 5) %in% -5) %>% which %>% max
#> [1] 25

Separate rows of data by 0 values in R

I have a large time series (in data frame form) (n=>6000) that looks like this:
time, precip
1 2005-09-30 11:45:00, 0.08
2 2005-09-30 23:45:00, 0.72
3 2005-10-01 11:45:00, 0.01
4 2005-10-01 23:45:00, 0.08
5 2005-10-02 11:45:00, 0.10
6 2005-10-02 23:45:00, 0.33
7 2005-10-03 11:45:00, 0.15
8 2005-10-03 23:45:00, 0.30
9 2005-10-04 11:45:00, 0.00
10 2005-10-04 23:45:00, 0.00
11 2005-10-05 11:45:00, 0.02
12 2005-10-05 23:45:00, 0.00
13 2005-10-06 11:45:00, 0.00
14 2005-10-06 23:45:00, 0.01
15 2005-10-07 11:45:00, 0.00
16 2005-10-07 23:45:00, 0.00
17 2005-10-08 11:45:00, 0.00
18 2005-10-08 23:45:00, 0.16
19 2005-10-09 11:45:00, 0.03
20 2005-10-09 23:45:00, 0.00
Each row has a time (YYYY-MM-DD HH:MM:SS, 12 hour timeseries) and a precipitation amount. I'd like to separate the data by storm events.
What I'd like to do is this:
1) adding a new column called "storm"
2) for each set of amount values separated by 0's, call it one storm.
For example...
Time, Precip, Storm
1 2005-09-30 11:45:00, 0.08, 1
2 2005-09-30 23:45:00, 0.72, 1
3 2005-10-01 11:45:00, 0.01, 1
4 2005-10-01 23:45:00, 0.08, 1
5 2005-10-02 11:45:00, 0.10, 1
6 2005-10-02 23:45:00, 0.33, 1
7 2005-10-03 11:45:00, 0.15, 1
8 2005-10-03 23:45:00, 0.30, 1
9 2005-10-04 11:45:00, 0.00
10 2005-10-04 23:45:00, 0.00
11 2005-10-05 11:45:00, 0.02, 2
12 2005-10-05 23:45:00, 0.00
13 2005-10-06 11:45:00, 0.00
14 2005-10-06 23:45:00, 0.01, 3
15 2005-10-07 11:45:00, 0.00
16 2005-10-07 23:45:00, 0.00
17 2005-10-08 11:45:00, 0.00
18 2005-10-08 23:45:00, 0.16, 4
19 2005-10-09 11:45:00, 0.03, 4
20 2005-10-09 23:45:00, 0.00
4) after that, my plan is to subset the data by storm event.
I am pretty new to R, so don't be afraid of pointing out the obvious. Your help would be much appreciated!
You can find the events within a storm then use rle and modify the results
# assuming your data is called rainfall
# identify whether a precipitation has been recorded at each timepoint
rainfall$storm <- rainfall$precip > 0
# do run length encoding on this storm indicator
storms < rle(rainfall$storms)
# set the FALSE values to NA
is.na(storms$values) <- !storms$values
# replace the TRUE values with a number in seqence
storms$values[which(storms$values)] <- seq_len(sum(storms$values, na.rm = TRUE))
# use inverse.rle to revert to the full length column
rainfall$stormNumber <- inverse.rle(storms)
Assuming this input:
Lines <- "time, precip
1 2005-09-30 11:45:00, 0.08
2 2005-09-30 23:45:00, 0.72
3 2005-10-01 11:45:00, 0.01
4 2005-10-01 23:45:00, 0.08
5 2005-10-02 11:45:00, 0.10
6 2005-10-02 23:45:00, 0.33
7 2005-10-03 11:45:00, 0.15
8 2005-10-03 23:45:00, 0.30
9 2005-10-04 11:45:00, 0.00
10 2005-10-04 23:45:00, 0.00
11 2005-10-05 11:45:00, 0.02
12 2005-10-05 23:45:00, 0.00
13 2005-10-06 11:45:00, 0.00
14 2005-10-06 23:45:00, 0.01
15 2005-10-07 11:45:00, 0.00
16 2005-10-07 23:45:00, 0.00
17 2005-10-08 11:45:00, 0.00
18 2005-10-08 23:45:00, 0.16
19 2005-10-09 11:45:00, 0.03
20 2005-10-09 23:45:00, 0.00
"
We read in the data and then create a logical vector that is TRUE for each non-zero precip for which the prior value is zero. We prepend the first value which is TRUE if z[1] is non-zero and FALSE if zero. Applying cumsum to this vector gives the correct values in positions corresponding to non-zero precip values. To handle the values whose positions correspond to zero precip values we use replace to store empty into them:
# read in data
library(zoo)
z <- read.zoo(text = Lines, skip = 1, tz = "", index = 2:3)[, 2]
# calculate
e <- NA # empty
cbind(precip = z, storm = replace(cumsum(c(z[1]!=0, z!=0 & lag(z,-1)==0)), z==0, e))
The last line gives this:
precip storm
2005-09-30 11:45:00 0.08 1
2005-09-30 23:45:00 0.72 1
2005-10-01 11:45:00 0.01 1
2005-10-01 23:45:00 0.08 1
2005-10-02 11:45:00 0.10 1
2005-10-02 23:45:00 0.33 1
2005-10-03 11:45:00 0.15 1
2005-10-03 23:45:00 0.30 1
2005-10-04 11:45:00 0.00 NA
2005-10-04 23:45:00 0.00 NA
2005-10-05 11:45:00 0.02 2
2005-10-05 23:45:00 0.00 NA
2005-10-06 11:45:00 0.00 NA
2005-10-06 23:45:00 0.01 3
2005-10-07 11:45:00 0.00 NA
2005-10-07 23:45:00 0.00 NA
2005-10-08 11:45:00 0.00 NA
2005-10-08 23:45:00 0.16 4
2005-10-09 11:45:00 0.03 4
2005-10-09 23:45:00 0.00 NA

Resources