Using row index number to calculate values - r

I'm having trouble using the row number as index. For example I want a new column that will give me the sales taking into account the next 4 days. I want to create column name:sale_next 4
The issue with my code is that I don't know how to make the index of the row_number() variable, since what I'm doing is fetching the actual value of the column.
#heres to create the data
df <- read.table(text = "day price price_change sales High_sales_ind
1 5 0 12 1
2 5 0 6 0
3 5 0 5 0
4 5 0 4 0
5 5 0 10 1
6 5 0 10 1
7 5 0 10 1
8 5 0 12 1
9 5 0 14 1
10 7 2 3 0
11 7 0 2 0", header = TRUE)
#my code
df<- df %>% mutate(sales_next4 = sales[row_number():sales_rownumber()+4)
What I need:
day
price
price_change
sales
High_sales_ind
sales_next4
1
5
0
12
1
27
2
5
0
6
0
25
3
5
0
5
0
29
4
5
0
4
0
34
5
5
0
10
1
42
6
5
0
10
1
46
7
5
0
10
1
39
8
5
0
12
1
31
9
5
0
14
1
19
10
7
2
3
0
5
11
7
0
2
0
2
Any help would be appreciated.

You can use rollapply from the zoo package for cases like this, assuming that the days are consecutive as in the example data provided.
You'll need to use the partial = and align = arguments to fill the column correctly, see ?rollapply for the details.
library(dplyr)
library(zoo)
df <- df %>%
mutate(sales_next4 = rollapply(sales, 4, sum, partial = TRUE, align = "left"))
Result:
day price price_change sales High_sales_ind sales_next4
1 1 5 0 12 1 27
2 2 5 0 6 0 25
3 3 5 0 5 0 29
4 4 5 0 4 0 34
5 5 5 0 10 1 42
6 6 5 0 10 1 46
7 7 5 0 10 1 39
8 8 5 0 12 1 31
9 9 5 0 14 1 19
10 10 7 2 3 0 5
11 11 7 0 2 0 2

You can use map() from purrr to do rolling sum depending on the day column.
library(dplyr)
library(purrr)
df %>%
mutate(sales_next4 = map_dbl(day, ~ sum(sales[between(day, .x, .x+3)])))
# day price price_change sales High_sales_ind sales_next4
# 1 1 5 0 12 1 27
# 2 2 5 0 6 0 25
# 3 3 5 0 5 0 29
# 4 4 5 0 4 0 34
# 5 5 5 0 10 1 42
# 6 6 5 0 10 1 46
# 7 7 5 0 10 1 39
# 8 8 5 0 12 1 31
# 9 9 5 0 14 1 19
# 10 10 7 2 3 0 5
# 11 11 7 0 2 0 2

Using slider
library(dplyr)
library(slider)
df %>%
mutate(sales_next4 = slide_dbl(day, ~ sum(sales[.x]), .after = 3))
day price price_change sales High_sales_ind sales_next4
1 1 5 0 12 1 27
2 2 5 0 6 0 25
3 3 5 0 5 0 29
4 4 5 0 4 0 34
5 5 5 0 10 1 42
6 6 5 0 10 1 46
7 7 5 0 10 1 39
8 8 5 0 12 1 31
9 9 5 0 14 1 19
10 10 7 2 3 0 5
11 11 7 0 2 0 2

You can use Reduce() and data.table::shift()
library(data.table)
setDT(df)[, n4:=Reduce(`+`,shift(c(sales,0,0,0),-3:0))[1:.N]]
Output:
day price price_change sales High_sales_ind sales_next4
1 1 5 0 12 1 27
2 2 5 0 6 0 25
3 3 5 0 5 0 29
4 4 5 0 4 0 34
5 5 5 0 10 1 42
6 6 5 0 10 1 46
7 7 5 0 10 1 39
8 8 5 0 12 1 31
9 9 5 0 14 1 19
10 10 7 2 3 0 5
11 11 7 0 2 0 2
or, could this as part of dplyr/mutate pipeline
mutate(df, sales_next4 = Reduce(`+`, data.table::shift(c(sales,0,0,0),0:-3))[1:nrow(df)])

Related

Count consecutive numbers

I have some time series with corresponding number for each date as 0 or 1. For example:
date value
1 0
2 0
3 1
4 1
5 1
6 0
7 1
8 1
So I want to count the consecutive 1´s like for date 3-5 the sum should be 3 and then start at date 7 again to count. And if this sum is below 6 the 1´s should be transformed to 0´s.
library(dplyr)
data.frame(
date = 1:8,
value = c(0,0,1,1,1,0,1,1)
) %>%
mutate(
count = rle(value) %>%
{list(.$lengths * .$values, .$lengths)} %>%
{rep(x = .[[1]], times = .[[2]])},
count_1 = ifelse(count < 6, 0, count)
)
gives:
date value count count_1
1 1 0 0 0
2 2 0 0 0
3 3 1 3 0
4 4 1 3 0
5 5 1 3 0
6 6 0 0 0
7 7 1 2 0
8 8 1 2 0
I would first create a grouping variable and then use this to aggregate the dataset.
d = data.frame("date"=1:12,
"value"=c(1,1,0,0,1,1,1,1,0,0,1,0))
d$group = 1
for(i in 2:dim(d)[1]){
if(d$value[i]==d$value[i-1]){
d$group[i]=d$group[i-1]
} else {
d$group[i]=d$group[i-1]+1
}
}
nd = data.frame("Group"=unique(d$group),
"Start"=aggregate(d$date~d$group,FUN=min)[,2],
"End"=aggregate(d$date~d$group,FUN=max)[,2],
"Count"=aggregate(d$value~d$group,FUN=sum)[,2])
The output for this data would be:
> d ## Input data
date value
1 1 1
2 2 1
3 3 0
4 4 0
5 5 1
6 6 1
7 7 1
8 8 1
9 9 0
10 10 0
11 11 1
12 12 0
> nd ## All groups
Group Start End Count
1 1 1 2 2
2 2 3 4 0
3 3 5 8 4
4 4 9 10 0
5 5 11 11 1
6 6 12 12 0
> nd[nd$Count>0,] ## Just the groups with 1 in them:
Group Start End Count
1 1 1 2 2
3 3 5 8 4
5 5 11 11 1
Another solution which looks like what you expected :
d = data.frame("date"=1:20,"value"=c(1,1,0,0,1,1,1,1,0,0,1,0,1,1,1,1,1,1,1,0))
repl <- rle(d$value)
rep_lengths <- rep(repl$lengths, repl$lengths)
rep_lengths[rep_lengths < 6] <- 0
d$value <- rep_lengths
returns
> d
date value
1 1 0
2 2 0
3 3 0
4 4 0
5 5 0
6 6 0
7 7 0
8 8 0
9 9 0
10 10 0
11 11 0
12 12 0
13 13 7
14 14 7
15 15 7
16 16 7
17 17 7
18 18 7
19 19 7
20 20 0
You can use rle to count the consecutive and use ifelse to set those lower 6 to 0:
y <- rle(x$value)
y[[2]] <- y[[1]] * y[[2]]
y[[2]] <- ifelse(y[[2]] < 6, 0, y[[2]])
inverse.rle(y)
#[1] 0 0 0 0 0 0 0 0
Data:
x <- data.frame(date = 1:8, value = c(0,0,1,1,1,0,1,1))

Suitable alternative for ddply function

How can I convert the following tibble to the final result posted below using dplyr?
> group_by(hth, team) %>% arrange(team)
Source: local data frame [26 x 14]
Groups: team [13]
team CSK DC DD GL KKR KTK KXIP MI PW RCB RPSG
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 CSK 0 8 11 0 11 2 9 10 4 10 0
2 CSK 0 2 5 0 5 0 8 12 2 9 0
3 DC 2 0 8 0 2 1 7 5 3 8 0
4 DC 8 0 3 0 7 0 3 5 1 3 0
5 DD 5 3 0 0 7 2 8 5 2 10 2
6 DD 11 8 0 2 10 0 10 13 4 7 0
7 GL 0 0 2 0 0 0 0 0 0 1 0
8 GL 0 0 0 0 2 0 2 2 0 2 2
9 KKR 5 7 10 2 0 0 5 10 3 15 0
10 KKR 11 2 7 0 0 2 14 8 2 3 2
# ... with 16 more rows, and 2 more variables: RR <dbl>, SH <dbl>
>
I used plyr's ddply function and was able to achieve the result.
> ddply(hth, .(team), function(x) colSums(x[,-1], na.rm = TRUE))
team CSK DC DD GL KKR KTK KXIP MI PW RCB RPSG RR SH
1 CSK 0 10 16 0 16 2 17 22 6 19 0 17 6
2 DC 10 0 11 0 9 1 10 10 4 11 0 9 0
3 DD 16 11 0 2 17 2 18 18 6 17 2 16 8
4 GL 0 0 2 0 2 0 2 2 0 3 2 0 3
5 KKR 16 9 17 2 0 2 19 18 5 18 2 15 9
6 KTK 2 1 2 0 2 0 1 1 1 2 0 2 0
7 KXIP 17 10 18 2 19 1 0 18 6 18 2 15 8
8 MI 22 10 18 2 18 1 18 0 6 19 2 16 8
9 PW 6 4 6 0 5 1 6 6 0 5 0 5 2
10 RCB 19 11 17 3 18 2 18 19 5 0 2 16 9
11 RPSG 0 0 2 2 2 0 2 2 0 2 0 0 2
12 RR 17 9 16 0 15 2 15 16 5 16 0 0 7
13 SH 6 0 8 3 9 0 8 8 2 9 2 7 0
>
How to achieve the same using just dplyr functions?
Looks like you are grouping by team and summing the columns, in dplyr:
library(dplyr)
hth %>%
group_by(team) %>%
summarise_all(funs(sum), na.rm = TRUE)

How to do calculation based on previous row results in dplyr

I performing some calculations where the result of a row is the input to the next.
I'm using a for loop which is quite slow, is there a way I can use dplyr for these types of calculations? example below
df <- data.frame(beginning_on_hand = c(10,0,0,0,0,0,0,0,0,0,0,0),
sales = c(10,9,4,7,3,7,2,6,1,5,7,1),
ship = c(10,9,4,7,3,7,2,6,1,5,7,1))
dfb <- df %>%
mutate(receipts = 0) %>%
mutate(ending_on_hand = 0) %>%
mutate(receipts = lag(ship, 2)) %>%
mutate(receipts = if_else(is.na(receipts), 0, receipts))
> dfb
beginning_on_hand sales ship receipts ending_on_hand
10 10 10 0 0
0 9 9 0 0
0 4 4 10 0
0 7 7 9 0
0 3 3 4 0
0 7 7 7 0
0 2 2 3 0
0 6 6 7 0
0 1 1 2 0
0 5 5 6 0
0 7 7 1 0
0 1 1 5 0
for(i in 1:(nrow(dfb)- 2)) {
dfb$ending_on_hand[i] <- dfb$beginning_on_hand[i] + dfb$receipts[i] - dfb$sales[i]
dfb$beginning_on_hand[i+1] <- dfb$ending_on_hand[i]
}
> dfb
beginning_on_hand sales ship receipts ending_on_hand
1 10 10 10 0 0
2 0 9 9 0 -9
3 -9 4 4 10 -3
4 -3 7 7 9 -1
5 -1 3 3 4 0
6 0 7 7 7 0
7 0 2 2 3 1
8 1 6 6 7 2
9 2 1 1 2 3
10 3 5 5 6 4
11 4 7 7 1 0
12 0 1 1 5 0
I don't have a dplyr solution for this, but here is a data.table solution for this.
df <- data.frame(beginning_on_hand = c(10,0,0,0,0,0,0,0,0,0,0,0),
sales = c(10,9,4,7,3,7,2,6,1,5,7,1),
ship = c(10,9,4,7,3,7,2,6,1,5,7,1))
dfb <- df %>%
mutate(ending_on_hand = 0) %>%
mutate(receipts = lag(ship, 2)) %>%
mutate(receipts = if_else(is.na(receipts), 0, receipts))
dfb<-data.table(dfb)
df.end <- dfb[, ending_on_hand := cumsum(beginning_on_hand + receipts - sales)][,
beginning_on_hand := beginning_on_hand + lag(ending_on_hand, default = 0)]
>df.end
beginning_on_hand sales ship ending_on_hand receipts
1: 10 10 10 0 0
2: 0 9 9 -9 0
3: -9 4 4 -3 10
4: -3 7 7 -1 9
5: -1 3 3 0 4
6: 0 7 7 0 7
7: 0 2 2 1 3
8: 1 6 6 2 7
9: 2 1 1 3 2
10: 3 5 5 4 6
11: 4 7 7 -2 1
12: -2 1 1 2 5
To explain, data.table uses basically lists to comprise the data and displays it in typically a flat-file manner. It uses SQL type instructions to organize and process data. The functions of note used here are cumsum and lag. cumsum calculates all values prior to a particular index, and lag looks for a value above or prior to a given index.

Removing the unordered pairs repeated twice in a file in R

I have a file like this in R.
**0 1**
0 2
**0 3**
0 4
0 5
0 6
0 7
0 8
0 9
0 10
**1 0**
1 11
1 12
1 13
1 14
1 15
1 16
1 17
1 18
1 19
**3 0**
As we can see, there are similar unordered pairs in this ( marked pairs ), like,
1 0
and
0 1
I wish to remove these pairs. And I want to count the number of such pairs that I have and append the count in front of the tow that is repeated. If not repeated, then 1 should be written in the third column.
For example ( A sample of the output file )
0 1 2
0 2 1
0 3 2
0 4 1
0 5 1
0 6 1
0 7 1
0 8 1
0 9 1
0 10 1
1 11 1
1 12 1
1 13 1
1 14 1
1 15 1
1 16 1
1 17 1
1 18 1
1 19 1
How can I achieve it in R?
Here is a way using transform, pmin and pmax to reorder the data by row, and then aggregate to provide a count:
# data
x <- data.frame(a=c(rep(0,10),rep(1,10),3),b=c(1:10,0,11:19,0))
#logic
aggregate(count~a+b,transform(x,a=pmin(a,b), b=pmax(a,b), count=1),sum)
a b count
1 0 1 2
2 0 2 1
3 0 3 2
4 0 4 1
5 0 5 1
6 0 6 1
7 0 7 1
8 0 8 1
9 0 9 1
10 0 10 1
11 1 11 1
12 1 12 1
13 1 13 1
14 1 14 1
15 1 15 1
16 1 16 1
17 1 17 1
18 1 18 1
19 1 19 1
Here's one approach:
First, create a vector of the columns sorted and then pasted together.
x <- apply(mydf, 1, function(x) paste(sort(x), collapse = " "))
Then, use ave to create the counts you are looking for.
mydf$count <- ave(x, x, FUN = length)
Finally, you can use the "x" vector again, this time to detect and remove duplicated values.
mydf[!duplicated(x), ]
# V1 V2 count
# 1 0 1 2
# 2 0 2 1
# 3 0 3 2
# 4 0 4 1
# 5 0 5 1
# 6 0 6 1
# 7 0 7 1
# 8 0 8 1
# 9 0 9 1
# 10 0 10 1
# 12 1 11 1
# 13 1 12 1
# 14 1 13 1
# 15 1 14 1
# 16 1 15 1
# 17 1 16 1
# 18 1 17 1
# 19 1 18 1
# 20 1 19 1

cumulative counter in dataframe R

I have a dataframe with many rows, but the structure looks like this:
year factor
1 0
2 0
3 0
4 0
5 0
6 0
7 0
8 0
9 1
10 0
11 0
12 0
13 0
14 0
15 0
16 0
17 1
18 0
19 0
20 0
I would need to add a counter as a third column. It should count the cumulative cells that contains zero until it set again to zero once the value 1 is encountered. The result should look like this:
year factor count
1 0 0
2 0 1
3 0 2
4 0 3
5 0 4
6 0 5
7 0 6
8 0 7
9 1 0
10 0 1
11 0 2
12 0 3
13 0 4
14 0 5
15 0 6
16 0 7
17 1 0
18 0 1
19 0 2
20 0 3
I would be glad to do it in a quick way, avoiding loops, since I have to do the operations for hundreds of files.
You can copy my dataframe, pasting the dataframe in "..." here:
dt <- read.table( text="...", , header = TRUE )
Perhaps a solution like this with ave would work for you:
A <- cumsum(dt$factor)
ave(A, A, FUN = seq_along) - 1
# [1] 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7 0 1 2 3
Original answer:
(Missed that the first value was supposed to be "0". Oops.)
x <- rle(dt$factor == 1)
y <- sequence(x$lengths)
y[dt$factor == 1] <- 0
y
# [1] 1 2 3 4 5 6 7 8 0 1 2 3 4 5 6 7 0 1 2 3

Resources