How to fix the first row on repeated measures - r

I'm trying to translate SAS code into R, but I don't know how to translate the following SAS DATA step:
data df;
  set df;
  by id area;
  if first.area then do;
    var1_sum = 0;
    var2_sum = 0;
  end;
  var1_sum + var1;
  var2_sum + var2;
run;
(The SAS sum statement var1_sum + var1 implicitly retains var1_sum across rows, and first.area marks the first row of each (id, area) group.)
Starting from the dataset below:
id area var1 var2
1 A 9 9
1 A 4 8
1 A 5 2
1 B 1 4
1 B 8 5
1 B 0 6
1 C 3 7
1 C 2 8
When the SAS code above is run, we get the following result:
id area var1 var2 var1_sum var2_sum
1 A 9 9 9 9
1 A 4 8 13 17
1 A 5 2 18 19
1 B 1 4 1 4
1 B 8 5 9 9
1 B 0 6 9 15
1 C 3 7 3 7
1 C 2 8 5 15
I am using dplyr in R. I have started a portion of the translation, shown below, but I don't know how to express the SAS "if" condition:
df <- df %>%
  group_by(id, area) %>%
  .....
I am looking for help with how to include the "if" condition in this case.
Thank you for your help.
Kind regards,
Rungo.

You can do this in base R with ave:
## Your data
df = read.table(text="id area var1 var2
1 A 9 9
1 A 4 8
1 A 5 2
1 B 1 4
1 B 8 5
1 B 0 6
1 C 3 7
1 C 2 8",
header=TRUE)
df$var1_sum = ave(df$var1, df$id, df$area, FUN=cumsum)
df$var2_sum = ave(df$var2, df$id, df$area, FUN=cumsum)
df
id area var1 var2 var1_sum var2_sum
1 1 A 9 9 9 9
2 1 A 4 8 13 17
3 1 A 5 2 18 19
4 1 B 1 4 1 4
5 1 B 8 5 9 9
6 1 B 0 6 9 15
7 1 C 3 7 3 7
8 1 C 2 8 5 15
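ave() splits its first argument by the grouping variables, applies FUN within each group, and returns a vector aligned with the original rows, which is why the result can be assigned straight back. If you prefer not to repeat df$, the same two assignments can be wrapped in within() (a sketch of the same idea, using the df built above):
## Equivalent, with the grouping still handled by ave()
df <- within(df, {
  var1_sum <- ave(var1, id, area, FUN = cumsum)  # running sum per (id, area)
  var2_sum <- ave(var2, id, area, FUN = cumsum)
})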

Using a tidyverse approach, you can use the following code:
mydf <- read.table(text="id area var1 var2
1 A 9 9
1 A 4 8
1 A 5 2
1 B 1 4
1 B 8 5
1 B 0 6
1 C 3 7
1 C 2 8",
header=TRUE)
library(tidyverse)
mydf %>%
  group_by(id, area) %>%
  mutate(var1sum = cumsum(var1),
         var2sum = cumsum(var2))
The result is:
id area var1 var2 var1sum var2sum
<int> <fctr> <int> <int> <int> <int>
1 1 A 9 9 9 9
2 1 A 4 8 13 17
3 1 A 5 2 18 19
4 1 B 1 4 1 4
5 1 B 8 5 9 9
6 1 B 0 6 9 15
7 1 C 3 7 3 7
8 1 C 2 8 5 15
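Note that group_by(id, area) plays the role of the SAS by statement together with first.area: cumsum() restarts in every group, so no explicit reset is needed. If you also want the first.area flag itself (say, for other conditional logic), row_number() == 1 inside the grouped data is its dplyr analogue (a sketch; first_area is a hypothetical helper column, not required for the sums):
mydf %>%
  group_by(id, area) %>%
  mutate(first_area = row_number() == 1,  # TRUE on the first row of each (id, area)
         var1sum = cumsum(var1),
         var2sum = cumsum(var2)) %>%
  ungroup()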

Related

Sum values incrementally for panel data

I have a very basic question, as I am relatively new to R. I was wondering how to add each value in a particular column to the running total of the previous values, separately for each cross-sectional unit in my data. My data look like this:
firm date value
A 1 10
A 2 15
A 3 20
A 4 0
B 1 0
B 2 1
B 3 5
B 4 10
C 1 3
C 2 2
C 3 10
C 4 1
D 1 7
D 2 3
D 3 6
D 4 9
I want to end up with the data below, i.e., the values summed incrementally within each cross-sectional unit:
firm date value cumulative_value
A 1 10 10
A 2 15 25
A 3 20 45
A 4 0 45
B 1 0 0
B 2 1 1
B 3 5 6
B 4 10 16
C 1 3 3
C 2 2 5
C 3 10 15
C 4 1 16
D 1 7 7
D 2 3 10
D 3 6 16
D 4 9 25
Below is reproducible example code. I tried lag(), but couldn't figure out how to repeat it for each firm.
firm <- c("A","A","A","A","B","B","B","B","C","C","C", "C","D","D","D","D")
date <- c("1","2","3","4","1","2","3","4","1","2","3","4", "1", "2", "3", "4")
value <- c(10, 15, 20, 0, 0, 1, 5, 10, 3, 2, 10, 1, 7, 3, 6, 9)
data <- data.frame(firm = firm, date = date, value = value)
Does this work?
library(dplyr)
data %>% group_by(firm) %>% mutate(cumulative_value = cumsum(value))
# A tibble: 16 x 4
# Groups: firm [4]
firm date value cumulative_value
<chr> <chr> <dbl> <dbl>
1 A 1 10 10
2 A 2 15 25
3 A 3 20 45
4 A 4 0 45
5 B 1 0 0
6 B 2 1 1
7 B 3 5 6
8 B 4 10 16
9 C 1 3 3
10 C 2 2 5
11 C 3 10 15
12 C 4 1 16
13 D 1 7 7
14 D 2 3 10
15 D 3 6 16
16 D 4 9 25
Using base R with ave:
data$cumulative_value <- with(data, ave(value, firm, FUN = cumsum))
Output:
> data
firm date value cumulative_value
1 A 1 10 10
2 A 2 15 25
3 A 3 20 45
4 A 4 0 45
5 B 1 0 0
6 B 2 1 1
7 B 3 5 6
8 B 4 10 16
9 C 1 3 3
10 C 2 2 5
11 C 3 10 15
12 C 4 1 16
13 D 1 7 7
14 D 2 3 10
15 D 3 6 16
16 D 4 9 25
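One caveat: cumsum() accumulates in the current row order, so both solutions assume the panel is already sorted by date within firm. If that is not guaranteed, arrange() first (a sketch using the data frame above; date was created as character, hence the as.numeric()):
library(dplyr)
data %>%
  arrange(firm, as.numeric(date)) %>%  # ensure time order within each firm
  group_by(firm) %>%
  mutate(cumulative_value = cumsum(value)) %>%
  ungroup()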

How to create nested for loop for a certain range

I am looking to add rows to my data frame over a range of numbers. I'm not great at loops, but here is my attempt:
k <- 1
for (i in 1:nrow(Data)) {
  for (l in 1:Data[i, 2]) {
    for (j in 1:Data[i, 5]) {
      Data[k, ] <- Data[i, ]
      Data[k, 3] <- Data[i, 3] + j - 1
      k <- k + 1
    }
  }
}
Here is a sample dataframe:
Data <- data.frame(matrix(ncol = 4, nrow = 2))
x <- c("gid","did", "pid","plays")
colnames(Data) <- x
Data[1,]<-c(1,1,2,8)
Data[2,]<-c(1,2,12,6)
The output should have a total of 14 rows and look like this:
1 1 2 8
1 1 3 8
1 1 4 8
.
.
.
.
1 2 12 6
1 2 13 6
.
.
1 2 17 6
Maybe tidyr::complete is a good choice here.
library(dplyr)
library(tidyr)
Data %>%
  group_by(gid, did) %>%
  complete(pid = seq(pid, pid + plays - 1)) %>%
  fill(plays)
# gid did pid plays
# <dbl> <dbl> <dbl> <dbl>
# 1 1 1 2 8
# 2 1 1 3 8
# 3 1 1 4 8
# 4 1 1 5 8
# 5 1 1 6 8
# 6 1 1 7 8
# 7 1 1 8 8
# 8 1 1 9 8
# 9 1 2 12 6
#10 1 2 13 6
#11 1 2 14 6
#12 1 2 15 6
#13 1 2 16 6
#14 1 2 17 6
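For comparison, the same row expansion can be done in base R by repeating each row plays times and offsetting pid (a sketch, assuming the Data frame defined above):
out <- Data[rep(seq_len(nrow(Data)), Data$plays), ]  # repeat each row 'plays' times
out$pid <- out$pid + sequence(Data$plays) - 1        # pid, pid + 1, ..., pid + plays - 1
rownames(out) <- NULL
out  # 14 rows, matching the expected output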

numbering duplicated rows in dplyr [duplicate]

This question already has answers here:
Using dplyr to get cumulative count by group
(3 answers)
Closed 5 years ago.
I have run into an issue with numbering duplicated rows in a data.frame and could not find a similar post.
Let's say we have data like this:
df <- data.frame(gr=gl(7,2),x=c("a","a","b","b","c","c","a","a","c","c","d","d","a","a"))
> df
gr x
1 1 a
2 1 a
3 2 b
4 2 b
5 3 c
6 3 c
7 4 a
8 4 a
9 5 c
10 5 c
11 6 d
12 6 d
13 7 a
14 7 a
and want to add a new column called x_dupl in which the first group-level occurrence of each x value is numbered 1, the second occurrence 2, the third 3, and so on.
Thanks in advance!
The expected output:
> df
gr x x_dupl
1 1 a 1
2 1 a 1
3 2 b 1
4 2 b 1
5 3 c 1
6 3 c 1
7 4 a 2
8 4 a 2
9 5 c 2
10 5 c 2
11 6 d 1
12 6 d 1
13 7 a 3
14 7 a 3
Your example data, named df1 here rather than df:
df1 <- data.frame(gr = gl(7, 2),
                  x = c("a","a","b","b","c","c","a","a","c","c","d","d","a","a"))
library(dplyr)
df1 %>%
  group_by(x) %>%
  mutate(x_dupl = dense_rank(gr)) %>%
  ungroup()
# A tibble: 14 x 3
gr x x_dupl
<fctr> <fctr> <int>
1 1 a 1
2 1 a 1
3 2 b 1
4 2 b 1
5 3 c 1
6 3 c 1
7 4 a 2
8 4 a 2
9 5 c 2
10 5 c 2
11 6 d 1
12 6 d 1
13 7 a 3
14 7 a 3
A base R solution:
df <- data.frame(gr = gl(7, 2),
                 x = c("a","a","b","b","c","c","a","a","c","c","d","d","a","a"))
## Encode runs of identical x values, number each value's runs in order of
## appearance, then expand back to one entry per row
x <- rle(as.numeric(df$x))
x$values <- ave(x$values, x$values, FUN = seq_along)
df$x_dupl <- inverse.rle(x)
# gr x x_dupl
# 1 1 a 1
# 2 1 a 1
# 3 2 b 1
# 4 2 b 1
# 5 3 c 1
# 6 3 c 1
# 7 4 a 2
# 8 4 a 2
# 9 5 c 2
# 10 5 c 2
# 11 6 d 1
# 12 6 d 1
# 13 7 a 3
# 14 7 a 3
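dense_rank(gr) works above because gr grows with each new occurrence of x. An equivalent that does not rely on ranking is match() against the unique gr values seen within each x (a sketch on the same df1):
library(dplyr)
df1 %>%
  group_by(x) %>%
  mutate(x_dupl = match(gr, unique(gr))) %>%  # 1 for the first gr per x, 2 for the second, ...
  ungroup()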

count positive negative values in column by group

I want to create two variables giving me the total number of positive and negative values by id, hopefully using dplyr.
Example data:
library(dplyr)
set.seed(42)
df <- data.frame(id = rep(1:10, each = 10),
                 ff = rnorm(100, 0, 14))
> head(df,20)
id ff
1 1 19.1934183
2 1 -7.9057744
3 1 5.0837978
4 1 8.8600765
5 1 5.6597565
6 1 -1.4857432
7 1 21.1613080
8 1 -1.3252265
9 1 28.2579320
10 1 -0.8779974
11 2 18.2681752
12 2 32.0130355
13 2 -19.4440498
14 2 -3.9030427
15 2 -1.8664987
16 2 8.9033056
17 2 -3.9795409
18 2 -37.1903759
19 2 -34.1665370
20 2 18.4815868
The resulting dataset should look like this:
> head(df,20)
id ff pos neg
1 1 19.1934183 6 4
2 1 -7.9057744 6 4
3 1 5.0837978 6 4
4 1 8.8600765 6 4
5 1 5.6597565 6 4
6 1 -1.4857432 6 4
7 1 21.1613080 6 4
8 1 -1.3252265 6 4
9 1 28.2579320 6 4
10 1 -0.8779974 6 4
11 2 18.2681752 4 6
12 2 32.0130355 4 6
13 2 -19.4440498 4 6
14 2 -3.9030427 4 6
15 2 -1.8664987 4 6
16 2 8.9033056 4 6
17 2 -3.9795409 4 6
18 2 -37.1903759 4 6
19 2 -34.1665370 4 6
20 2 18.4815868 4 6
I thought something similar to this would work:
df <- df %>% group_by(id) %>% mutate(pos = nrow(ff > 0)) %>% ungroup()
Any help would be great, thanks.
You need sum(): summing a logical vector counts its TRUE values.
df %>%
  group_by(id) %>%
  mutate(pos = sum(ff > 0),
         neg = sum(ff < 0))
For a fun (and fast) solution, data.table can also be used. Note that := adds the columns by reference, so no reassignment is needed:
library(data.table)
setDT(df)
df[, `:=`(pos = sum(ff > 0), neg = sum(ff < 0)), by = id]
Here's an answer that adds an ifelse() flag indicating whether each id contains any negative values:
df <- df %>% group_by(id) %>%
  mutate(pos = sum(ff > 0),
         neg = sum(ff < 0),
         any_neg = ifelse(any(ff < 0), 1, 0))
Output:
> head(df, 20)
Source: local data frame [20 x 5]
Groups: id [2]
id ff pos neg any_neg
<int> <dbl> <int> <int> <dbl>
1 1 19.1934183 6 4 1
2 1 -7.9057744 6 4 1
3 1 5.0837978 6 4 1
4 1 8.8600765 6 4 1
5 1 5.6597565 6 4 1
6 1 -1.4857432 6 4 1
7 1 21.1613080 6 4 1
8 1 -1.3252265 6 4 1
9 1 28.2579320 6 4 1
10 1 -0.8779974 6 4 1
11 2 18.2681752 4 6 1
12 2 32.0130355 4 6 1
13 2 -19.4440498 4 6 1
14 2 -3.9030427 4 6 1
15 2 -1.8664987 4 6 1
16 2 8.9033056 4 6 1
17 2 -3.9795409 4 6 1
18 2 -37.1903759 4 6 1
19 2 -34.1665370 4 6 1
20 2 18.4815868 4 6 1
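If you only need one row per id rather than the counts repeated on every row, swap mutate() for summarise() (a sketch on the original df):
df %>%
  group_by(id) %>%
  summarise(pos = sum(ff > 0),  # count of positive values per id
            neg = sum(ff < 0))  # count of negative values per id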

Create a block column based on id and the value of another column in R

Given the first two columns below (id and time_diff), I want to generate the 'block' column:
test
id time_diff block
1 a NA 1
2 a 1 1
3 a 1 1
4 a 1 1
5 a 3 1
6 a 3 1
7 b NA 2
8 b 11 3
9 b 1 3
10 b 1 3
11 b 1 3
12 b 12 4
13 b 1 4
14 c NA 5
15 c 4 5
16 c 7 5
The data are already sorted by id and time. time_diff is the difference between the current row's time and the previous row's time within the same id. I want to create a block id: an auto-incrementing value that increases whenever a new id is encountered, or a time_diff > 10 occurs within the same id.
How can I achieve this in R?
Importing your data as a data frame with something like:
df = read.table(text='
id time_diff block
1 a NA 1
2 a 1 1
3 a 1 1
4 a 1 1
5 a 3 1
6 a 3 1
7 b NA 2
8 b 11 3
9 b 1 3
10 b 1 3
11 b 1 3
12 b 12 4
13 b 1 4
14 c NA 5
15 c 4 5
16 c 7 5')
You can do a one-liner like this to get the occurrences satisfying your two conditions:
> new_col = as.vector(cumsum(
    na.exclude(
      c(F, diff(as.numeric(as.factor(df$id)))) |  # change of id, OR
      df$time_diff > 10                           # time_diff greater than 10
    )
  ))
> new_col
[1] 0 0 0 0 0 1 2 2 2 2 3 3 4 4 4
Finally, append this new column to your data frame with cbind:
> cbind(df, block = c(0,new_col))
id time_diff block block
1 a NA 1 0
2 a 1 1 0
3 a 1 1 0
4 a 1 1 0
5 a 3 1 0
6 a 3 1 0
7 b NA 2 1
8 b 11 3 2
9 b 1 3 2
10 b 1 3 2
11 b 1 3 2
12 b 12 4 3
13 b 1 4 3
14 c NA 5 4
15 c 4 5 4
16 c 7 5 4
You will notice an offset between your desired block variable and mine: correcting it is easy and can be done at several different steps, so I will leave it to you :)
Another variation of @Jealie's method would be:
with(test, cumsum(c(TRUE,id[-1]!=id[-nrow(test)])|time_diff>10))
#[1] 1 1 1 1 1 1 2 3 3 3 3 4 4 5 5 5
After learning from Jealie and akrun, I came up with this idea.
mydf %>%
  mutate(group = cumsum(time_diff > 10 | !duplicated(id)))
# id time_diff block group
#1 a NA 1 1
#2 a 1 1 1
#3 a 1 1 1
#4 a 1 1 1
#5 a 3 1 1
#6 a 3 1 1
#7 b NA 2 2
#8 b 11 3 3
#9 b 1 3 3
#10 b 1 3 3
#11 b 1 3 3
#12 b 12 4 4
#13 b 1 4 4
#14 c NA 5 5
#15 c 4 5 5
#16 c 7 5 5
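A caution on this one-liner: time_diff > 10 is NA for the NA rows, and an NA inside cumsum() propagates to everything after it. It works here only because every NA coincides with a new id, where !duplicated(id) is already TRUE. A guarded version, in case an NA could ever appear mid-id, might be:
mydf %>%
  mutate(group = cumsum(coalesce(time_diff > 10, FALSE) | !duplicated(id)))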
Here is an approach using dplyr:
require(dplyr)
set.seed(999)
test <- data.frame(
  id = rep(letters[1:4], each = 3),
  time_diff = sample(4:15)
)
test %>%
  mutate(
    b = as.integer(factor(id)) - lag(as.integer(factor(id))),  # nonzero on an id change; factor() since id is character here
    more10 = time_diff > 10,
    increment = pmax(b, more10, na.rm = TRUE),  # either trigger bumps the block
    increment = ifelse(row_number() == 1, 1, increment),
    block = cumsum(increment)
  ) %>%
  select(id, time_diff, block)
Try:
> df
id time_diff
1 a NA
2 a 1
3 a 1
4 a 1
5 a 3
6 a 3
7 b NA
8 b 11
9 b 1
10 b 1
11 b 1
12 b 12
13 b 1
14 c NA
15 c 4
16 c 7
block <- c(1)
for (i in 2:nrow(df)) {
  block[i] <- ifelse(df$time_diff[i] > 10 || df$id[i] != df$id[i - 1],
                     block[i - 1] + 1,
                     block[i - 1])
}
df$block <- block
df
id time_diff block
1 a NA 1
2 a 1 1
3 a 1 1
4 a 1 1
5 a 3 1
6 a 3 1
7 b NA 2
8 b 11 3
9 b 1 3
10 b 1 3
11 b 1 3
12 b 12 4
13 b 1 4
14 c NA 5
15 c 4 5
16 c 7 5
