Subtract specific rows - r

I have data that looks the following way:
Participant Round Total
1 100 5
1 101 8
1 102 12
1 200 42
2 100 14
2 101 71
40 100 32
40 101 27
40 200 18
I want to get a table with the Total of the last Round (200) minus the Total of the first Round (100).
For example - for Participant 1 - it is 42 - 5 = 37.
The final output should look like:
Participant Total
1 37
2
40 -14

With base R
# Keep only rounds 100 and 200, then diff() Total within each Participant;
# with two rows per participant this gives Total[200] - Total[100].
# A participant missing one of the rounds yields a zero-length diff (blank).
aggregate(Total ~ Participant, df[df$Round %in% c(100, 200), ], diff)
# Participant Total
# 1 1 37
# 2 2
# 3 40 -14
Or similarly combined with subset
# Same idea using aggregate()'s own subset argument instead of pre-indexing df.
aggregate(Total ~ Participant, df, subset = Round %in% c(100, 200), diff)
Or with data.table
# data.table: restrict to the two rounds in i, then diff(Total) per Participant.
# (Removed a stray trailing semicolon after library().)
library(data.table)
setDT(df)[Round %in% c(100, 200), diff(Total), by = Participant]
# Participant V1
# 1: 1 37
# 2: 40 -14
Or using binary join
# Key the table on Round, then binary-join on the two round values;
# diff(Total) is computed per Participant over the joined rows.
setkey(setDT(df), Round)
df[.(c(100, 200)), diff(Total), by = Participant]
# Participant V1
# 1: 1 37
# 2: 40 -14
Or with dplyr
library(dplyr)
# Within each Participant, keep only the 100/200 rounds and summarise with
# diff() to get the last-minus-first Total.
df %>%
group_by(Participant) %>%
filter(Round %in% c(100, 200)) %>%
summarise(Total = diff(Total))
# Source: local data table [2 x 2]
#
# Participant Total
# 1 1 37
# 2 40 -14

you can try this
library(dplyr)
# Keep each group's first and last row, attach the first-to-last difference
# via mutate() (note: the new column is named `df`), then de-duplicate so a
# single row per Participant remains.
group_by(df, Participant) %>%
filter(row_number()==1 | row_number()==max(row_number())) %>%
mutate(df = diff(Total)) %>%
select(Participant, df) %>%
unique()
Source: local data frame [3 x 2]
Groups: Participant
Participant df
1 1 37
2 2 57
3 40 -14

try this:
# Reproducible data (note: unlike the question's data, Participant 2 is given
# a Round-200 row here, so every participant produces a result).
df <- read.table(header = TRUE, text = "
Participant Round Total
1 100 5
1 101 8
1 102 12
1 200 42
2 100 14
2 101 71
2 200 80
40 100 32
40 101 27
40 200 18")
library(data.table)
# Explicit subtraction: the Round-200 Total minus the Round-100 Total per group.
setDT(df)[ , .(Total = Total[Round == 200] - Total[Round == 100]), by = Participant]

Everyone loves a bit of sqldf, so if your requirement isn't to use apply then try this:
Firstly some test data:
# Test data: every Participant has both a 100 and a 200 round, so the
# INNER JOIN below returns one row per participant.
df <- read.table(header = TRUE, text = "
Participant Round Total
1 100 5
1 101 8
1 102 12
1 200 42
2 100 14
2 101 71
2 200 80
40 100 32
40 101 27
40 200 18")
Next use SQL to create 2 columns - one for the 100 round and one for the 200 round and subtract them
# Self-join via sqldf: one subquery per round (100 and 200) matched on
# Participant; the inner join drops participants missing either round.
rolled <- sqldf("
SELECT tab_a.Participant AS Participant
,tab_b.Total_200 - tab_a.Total_100 AS Difference
FROM (
SELECT Participant
,Total AS Total_100
FROM df
WHERE Round = 100
) tab_a
INNER JOIN (
SELECT Participant
,Total AS Total_200
FROM df
WHERE Round = 200
) tab_b ON (tab_a.Participant = tab_b.Participant)
")

Related

Calculate difference between rows in repeated measurement database R

I have a data-frame like this:
ID Time Testscore
20 2 300
20 1 350
20 3 -150
30 2 200
30 1 100
40 1 300
40 2 NA
Three questions:
How can I calculate the difference between the last score and the first score, grouped by ID, where "last" means the largest Time value (some IDs have more repeated measures than others)?
How to deal with NA in calculation
Is there a way to arrange the Time variable in ascending order while keeping the rows grouped by ID?
Thanks for the help.
Using tapply
# Per ID: last Testscore minus first, in the current row order (not by Time).
# The \(x) lambda syntax requires R >= 4.1.
with(dat, tapply(Testscore, ID, \(x) x[length(x)] - x[1]))
# 20 30 40
# -450 -100 NA
or ave.
# ave() broadcasts the per-ID difference back onto every row of that ID.
transform(dat, d=ave(Testscore, ID, FUN=\(x) x[length(x)] - x[1]))
# ID Time Testscore d
# 1 20 2 300 -450
# 2 20 1 350 -450
# 3 20 3 -150 -450
# 4 30 2 200 -100
# 5 30 1 100 -100
# 6 40 1 300 NA
# 7 40 2 NA NA
Here by ID and Time, but doesn't make much sense with your sample data.
# Same, but grouped jointly by ID and Time (each cell holds one value here,
# so every difference is 0 with this sample data).
with(dat, tapply(Testscore, list(ID, Time), \(x) x[length(x)] - x[1]))
transform(dat, d=ave(Testscore, ID, Time, FUN=\(x) x[length(x)] - x[1]))
Sorting dataframe rows is usually done with the order function.
# Sort rows by ID, then Time, so each group's first/last rows correspond to
# the smallest/largest Time.
dfrm <- dfrm[ order(dfrm$ID, dfrm$Time) , ]
Then you can use split in traditional R or group_by in the tidyverse to separately handle the difference calculations.
# Per-ID difference: Testscore at the largest Time minus at the smallest.
# Fixes two problems with the original snippet:
#  * the closing ")" of the sapply() call was missing (syntax error);
#  * the raw Time value was used as a row index into grp, which only works
#    when Times happen to be 1..n; which.max()/which.min() index by row
#    position and also skip NA Times.
diffs <- sapply(split(dfrm, dfrm$ID), function(grp) {
  grp[which.max(grp$Time), "Testscore"] -
    grp[which.min(grp$Time), "Testscore"]
})
diffs
#---------------
20 30 40
-500 100 NA
I didn't see a request to put these differences along side the dataframe.
Using dplyr:
# dplyr: sort within ID by Time, then broadcast last() - first() of
# Testscore to all rows of the group. NA propagates (see ID 40).
df %>%
arrange(ID, Time) %>%
group_by(ID) %>%
mutate(Diff = last(Testscore) - first(Testscore))
# A tibble: 7 × 4
# Groups: ID [3]
# ID Time Testscore Diff
# <dbl> <dbl> <dbl> <dbl>
# 1 20 1 350 -500
# 2 20 2 300 -500
# 3 20 3 -150 -500
# 4 30 1 100 100
# 5 30 2 200 100
# 6 40 1 300 NA
# 7 40 2 NA NA

Subtract values within group

I have a dataframe:
# Reproducible example: 100 random IDs (1-15) and values (1-4).
set.seed(42)
ID <- sample(1:15, 100, replace = TRUE)
value <- sample(1:4, 100, replace = TRUE)
d <- data.frame(ID, value)
I want to group by ID, and create a new column where each value is subtracted from all others within the group.
Just as sum() adds all of these values into a single column, how do I subtract instead?
library(dplyr)
d %>%
group_by(ID) %>%
# what's the - equivalent!
mutate(value_c = sub(value))
Thanks
J
Well, it's a somewhat odd calculation, but, slightly to my own surprise, the following seems to do what you describe:
set.seed(42)
ID <- sample(1:15, 100, replace = TRUE)
value <- sample(1:4, 100, replace = TRUE)
d <- data.frame(ID, value)
# value - (sum(value) - value) simplifies to 2*value - sum(value):
# each value minus the sum of the *other* values in its ID group.
d %>% group_by( ID ) %>%
mutate(
value_c = value*2 - sum(value)
) %>%
arrange( ID ) %>%
head( n=20 )
Produces:
# A tibble: 20 x 3
# Groups: ID [3]
ID value value_c
<int> <int> <dbl>
1 1 1 -12
2 1 1 -12
3 1 4 -6
4 1 1 -12
5 1 1 -12
6 1 2 -10
7 1 4 -6
8 2 4 -21
9 2 3 -23
10 2 3 -23
11 2 2 -25
12 2 1 -27
13 2 1 -27
14 2 3 -23
15 2 3 -23
16 2 1 -27
17 2 4 -21
18 2 4 -21
19 3 4 -8
20 3 4 -8
You multiply value by 2 because value itself is included in sum(), which you didn't want; adding one extra copy of value on the left cancels its own contribution to the sum.
Here is a base R option using ave
# Base R equivalent: ave() returns the per-ID group sum aligned to each row.
transform(
d,
value_c = 2*value - ave(value,ID,FUN = sum)
)
An option with data.table
library(data.table)
# := adds value_c by reference, with sum(value) computed per ID group.
setDT(d)[, value_c := 2 * value - sum(value), ID]

dplyr solution for cumsum with a factor reset

I need a dplyr solution that creates a cumsum column.
# Input dataframe
df <- data.frame(OilChanged = c("No","No","Yes","No","No","No","No","No","No","No","No","Yes","No"),
Odometer = c(300,350,410,420,430,450,500,600,600,600,650,660,700))
# Create difference column - first row starting with zero
# (lag() default of Odometer[1] makes the first delta 0 instead of NA)
df <- df %>% dplyr::mutate(Odometer_delta = Odometer - lag(Odometer, default = Odometer[1]))
I'm trying to make a reset condition based on the factor column for a cumulative sum.
The result needs to be exactly like this.
# Wanted result dataframe
# Note: CumSum restarts on the row *following* each OilChanged == "Yes"
# (rows 4 and 13 begin new running totals).
df <- data.frame(OilChanged = c("No","No","Yes","No","No","No","No","No","No","No","No","Yes","No"),
Odometer = c(300,350,410,420,430,450,500,600,600,600,650,660,700),
Diff = c(0,50,60,10,10,20,50,100,0,0,50,10,40),
CumSum = c(0,50,110,10,20,40,90,190,190,190,240,250,40))
You can create a new group every time OilChanged == 'Yes' and take the cumsum of the Diff values within each group.
library(dplyr)
# cumsum(OilChanged == 'Yes') increments on each "Yes" row; lag()-ing it
# keeps the "Yes" row in the *ending* group, so the new group starts on the
# following row. grp is dropped after use.
df %>%
group_by(grp = lag(cumsum(OilChanged == 'Yes'), default = 0)) %>%
mutate(newcumsum = cumsum(Diff)) %>%
ungroup %>%
select(-grp)
# OilChanged Odometer Diff CumSum newcumsum
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 No 300 0 0 0
# 2 No 350 50 50 50
# 3 Yes 410 60 110 110
# 4 No 420 10 10 10
# 5 No 430 10 20 20
# 6 No 450 20 40 40
# 7 No 500 50 90 90
# 8 No 600 100 190 190
# 9 No 600 0 190 190
#10 No 600 0 190 190
#11 No 650 50 240 240
#12 Yes 660 10 250 250
#13 No 700 40 40 40

cumulative sum by ID with lag

I want to create a cumulative sum by id. But, it should not sum the value that belongs to the row where is being calculated.
I've already tried with cumsum. However, I do not know how to add a statement which specifies to do not add the amount of the row where the sum is made. The result column I am looking for is the third column called: "sum".
For example, for id 1, the first row is sum=0, because should not add this row. But, for id 1 and row 2 sum=100 because the amount of id 1 previous to the row 2 was 100 and so on.
id amount sum
1: 1 100 0
2: 1 20 100
3: 1 150 120
4: 2 60 0
5: 2 100 60
6: 1 30 270
7: 2 40 160
This is what I've tried:
# The asker's attempt: plain cumsum() includes the current row's amount,
# which is exactly what should be excluded here.
df[,sum:=cumsum(amount),
by ="id"]
data: df <- data.table(id = c(1, 1, 1, 2, 2,1,2), amount = c(100, 20,
150,60,100,30,40),sum=c(0,100,120,0,60,270,160) ,stringsAsFactors =
FALSE)
You can do this without using lag:
# cumsum(amount) - amount equals the sum of all *previous* amounts within id.
> df %>%
group_by(id) %>%
mutate(sum = cumsum(amount) - amount)
# A tibble: 7 x 3
# Groups: id [2]
id amount sum
<dbl> <dbl> <dbl>
#1 1 100 0
#2 1 20 100
#3 1 150 120
#4 2 60 0
#5 2 100 60
#6 1 30 270
#7 2 40 160
With dplyr -
# Lag the running total so each row sees only the amounts before it
# (first row of each id gets the default, 0).
df %>%
group_by(id) %>%
mutate(sum = lag(cumsum(amount), default = 0)) %>%
ungroup()
# A tibble: 7 x 3
id amount sum
<dbl> <dbl> <dbl>
1 1 100 0
2 1 20 100
3 1 150 120
4 2 60 0
5 2 100 60
6 1 30 270
7 2 40 160
Thanks to #thelatemail here's the data.table version -
# data.table: shift() lags amount (filling 0) before cumsum, excluding the
# current row from its own running total.
df[, sum := cumsum(shift(amount, fill=0)), by=id]
Here is an option in base R
# Base R: per-id running total via ave(), then subtract the current amount.
df$Sum <- with(df, ave(amount, id, FUN = cumsum) - amount)
df$Sum
#[1] 0 100 120 0 60 270 160
Or by removing the last observation, take the cumsum
# Equivalent: prepend 0 and cumsum all but the last amount of each id.
with(df, ave(amount, id, FUN = function(x) c(0, cumsum(x[-length(x)]))))
You can shift the values you're summing by using the lag function.
library(tidyverse)
df <- data.frame(id = c(1, 1, 1, 2, 2,1,2), amount = c(100, 20,
150,60,100,30,40),sum=c(0,100,120,0,60,270,160) ,stringsAsFactors =
FALSE)
# lag() shifts amounts down one row within each id (first row = 0) before
# taking the cumulative sum.
df %>%
group_by(id) %>%
mutate(sum = cumsum(lag(amount, 1, default=0)))
# A tibble: 7 x 3
# Groups: id [2]
id amount sum
<dbl> <dbl> <dbl>
1 1 100 0
2 1 20 100
3 1 150 120
4 2 60 0
5 2 100 60
6 1 30 270
7 2 40 160

Calculating total sum of line segments overlapping on a line

I'm trying to calculate the total sum of overlapping line segments across a single line. With line A, the segments are disjoint, so it's pretty simple to calculate. However, with lines B and C, there are overlapping line segments, so it's more complicated: I need to somehow exclude the parts of previous segments that are already part of the total sum.
# Sample data: each row is one small segment (left_small_line..right_small_line)
# overlaid on its named line (left_line..right_line).
# (Changed `=` to the conventional `<-` for top-level assignment.)
data <- read.table(text="
line left_line right_line small_line left_small_line right_small_line
A 100 120 101 91 111
A 100 120 129 119 139
B 70 90 63 53 73
B 70 90 70 60 80
B 70 90 75 65 85
C 20 40 11 1 21
C 20 40 34 24 44
C 20 40 45 35 55", header=TRUE)
This should be the expected result.
result = read.table(text="
total_overlapping
A 0.6
B 0.75
C 0.85", header=TRUE)
EDIT: Added a picture to better illustrate what I'm trying to figure out. There's 3 different pictures of lines (solid red line), with line segments (the dashed lines) overlapping. The goal is to figure out how much of the dashed lines are covering/overlapping.
Line A
Line B
Line C
If I understand correctly, the small_line variable is irrelevant here. The rest of the columns can be used to get the sum of overlapping segments:
Step 1. Get the start & end point for each segment's overlap with the corresponding line:
library(dplyr)
# Step 1: clip each small segment to its line. The overlap runs from the
# later of the two left ends to the earlier of the two right ends; rowwise()
# makes max()/min() operate within each row.
data1 <- data %>%
rowwise() %>%
mutate(overlap.start = max(left_line, left_small_line),
overlap.end = min(right_line, right_small_line)) %>%
ungroup() %>%
select(line, overlap.start, overlap.end)
> data1
# A tibble: 8 x 3
line overlap.start overlap.end
<fct> <int> <int>
1 A 100 111
2 A 119 120
3 B 70 73
4 B 70 80
5 B 70 85
6 C 20 21
7 C 24 40
8 C 35 40
Step 2. Within the rows corresponding to each line, sort the overlaps in order. consider it a new overlapping section if it is the first overlap, OR the previous overlap ends before it started. Label each new overlapping section:
# Step 2: sort overlaps within each line; a row starts a new section when it
# is the first overlap (lag is NA) or the previous overlap ends at/before
# this one starts. cumsum of that flag numbers the sections.
data2 <- data1 %>%
arrange(line, overlap.start, overlap.end) %>%
group_by(line) %>%
mutate(new.section = is.na(lag(overlap.end)) |
lag(overlap.end) <= overlap.start) %>%
mutate(section.number = cumsum(new.section)) %>%
ungroup()
</data2 <- data1 %>% -- placeholder removed -->
> data2
# A tibble: 8 x 5
line overlap.start overlap.end new.section section.number
<fct> <int> <int> <lgl> <int>
1 A 100 111 TRUE 1
2 A 119 120 TRUE 2
3 B 70 73 TRUE 1
4 B 70 80 FALSE 1
5 B 70 85 FALSE 1
6 C 20 21 TRUE 1
7 C 24 40 TRUE 2
8 C 35 40 FALSE 2
Step 3. Within each overlapping section, take the earliest starting point & the latest ending point. Calculate the length of each overlap:
# Step 3: merge each section to its earliest start / latest end, then take
# the section's length.
data3 <- data2 %>%
group_by(line, section.number) %>%
summarise(overlap.start = min(overlap.start),
overlap.end = max(overlap.end)) %>%
ungroup() %>%
mutate(overlap = overlap.end - overlap.start)
> data3
# A tibble: 5 x 5
line section.number overlap.start overlap.end overlap
<fct> <int> <dbl> <dbl> <dbl>
1 A 1 100 111 11
2 A 2 119 120 1
3 B 1 70 85 15
4 C 1 20 21 1
5 C 2 24 40 16
Step 4. Sum the length of overlaps for each line:
# Step 4: total overlapped length per line.
data4 <- data3 %>%
group_by(line) %>%
summarise(overlap = sum(overlap)) %>%
ungroup()
> data4
# A tibble: 3 x 2
line overlap
<fct> <dbl>
1 A 12
2 B 15
3 C 17
Now, your expected result shows the expected percentage of overlap on each line, rather than the sum. If that's what you are looking for, you can add the length for each line to data4, & calculate accordingly:
# Step 5: divide by each line's length (right_line - left_line) to get the
# percentage covered; unique() collapses the repeated per-line rows first.
data5 <- data4 %>%
left_join(data %>%
select(line, left_line, right_line) %>%
unique() %>%
mutate(length = right_line - left_line) %>%
select(line, length),
by = "line") %>%
mutate(overlap.percentage = overlap / length)
> data5
# A tibble: 3 x 4
line overlap length overlap.percentage
<fct> <dbl> <int> <dbl>
1 A 12 20 0.6
2 B 15 20 0.75
3 C 17 20 0.85

Resources