Subtracting value from value above automatically - no loop - R

I am trying to get a balance value from a df that looks like this
df1
Name Year Ch1 Origin
A 1995 x1 a
A 1996 x2 b
A 1997 x3 a
A 2000 x4 a
B 1997 y1 c
B 1998 y2 c
.....
while Ch1 is numerical.
and I want to add an extra col to have this value:
Name Year Ch1 Bil
A 1995 x1
A 1996 x2 %of year before ((x2-x1)/x1*100)%
A 1997 x3 %of year before ((x3-x2)/x2*100)%
A 2000 x4 %of year before ((x4-x3)/x3*100)%
B 1997 y1
B 1998 y2 %of year before ((y2-y1)/y1*100)%
.....
now I know I could create a loop looking something like this:
for (i in nrow(df1))
if (df[i,1]==df[i-1,1]) {
df$Bil<-(df[i,3]-df[i-1,3])/df[i-1,3]*100
} else ...
Is there a more elegant or quicker way to calculate this? This way I really need to make sure that the dataset is in the right order (going from older to more recent years). Let's say the calculation is also dependent on an extra detail such as origin, so that it only happens if name and origin are the same?
Thank you!

All three solutions require the data be in the correct order within each Name. For instance, you can interleave names of "A" and "B" (all groupings below will handle it), but the years should likely be non-decreasing.
Prepping by putting in real Ch1:
set.seed(42)
df1$Ch1 <- c(sort(sample(20, size=4)), sort(sample(20, size=2)))
Base R
# Sort so that, within each Name, rows run from the oldest to the newest year.
df1 <- df1[order(df1$Name, df1$Year),]
# ave() applies FUN to Ch1 separately for every Name/Origin combination.
# diff(z) / head(z, n = -1) is the change relative to the previous row; the
# leading 0 fills the first row of each group, which has no predecessor.
df1$Bil <- ave(df1$Ch1, df1$Name, df1$Origin,
FUN=function(z) 100 * c(0, diff(z) / head(z, n = -1)))
df1
# Name Year Ch1 Origin Bil
# 1 A 1995 6 a 0.000000
# 2 A 1996 15 b 0.000000
# 3 A 1997 18 a 200.000000
# 4 A 2000 19 a 5.555556
# 5 B 1997 10 c 0.000000
# 6 B 1998 13 c 30.000000
dplyr
library(dplyr)
# Percent change of Ch1 versus the previous row within each Name/Origin group.
df1 %>%
# Order rows by year inside each Name so diff() compares consecutive years.
arrange(Name, Year) %>%
group_by(Name, Origin) %>%
mutate(
# The leading 0 is the value for the first row of a group (no previous row).
Bil = 100 * c(0, diff(Ch1) / head(Ch1, n = -1))
) %>%
# Drop the grouping so later verbs operate on the whole table again.
ungroup()
data.table
library(data.table)
library(magrittr)
# Work on a data.table copy of df1; df1 itself is left unchanged.
df1DT <- as.data.table(df1)
# Sort by reference so rows run in year order within each Name.
setorder(df1DT, Name, Year)
# Add Bil by reference: percent change versus the previous row per
# Name/Origin group, with 0 for the first row of each group.
df1DT[, Bil := 100 * c(0, diff(Ch1) / head(Ch1, n = -1)), by = .(Name, Origin)]

Related

R: update the values in df1 based on data in df2

Hi, I have two data frames (df1 and df2) with two shared variables (ID and Yr). I want to update the values of a third variable (value) in df1 with the new data from the corresponding value in df2. But the code below does not update the values in df1; it seems the values are not passed to the corresponding cells in df1.
df1 = data.frame(ID = c("a","b","c","d","e") ,
Yr = c(2000,2001,2002,2003,2004),
value= c(100,100,100,100, 100))
df2 = data.frame(ID = c("a","b","c") ,
Yr = c(2000,2001,2002),
valuenew= c(200,150,120))
for (i in 1:nrow(df2)){
id <- df2[i,'ID']
year <- df2[i, 'Yr']
valuenew<- df2[i, 'valuenew']
df1[which (df1$ID == id & df1$Yr == year), 'value'] <- valuenew
}
the desired result
ID Yr value
a 2000 200
b 2001 150
c 2002 120
d 2003 100
e 2004 100
The real data I use with which none of these solutions works
df1
head(df1, 5)
CoreID Yr FluxTot
1 Asmund2000_Greenland coast_4001 1987 0.3239693
2 Asmund2000_Greenland coast_4001 1986 0.2864100
3 Asmund2000_Greenland coast_4001 1985 0.2488508
4 Asmund2000_Greenland coast_4001 1984 0.2964794
5 Asmund2000_Greenland coast_4001 1983 0.3441080
df2
head(df2, 5)
CoreID Yr GamfitHgdep
1 Beal2015_Mount Logan 2000 0.01105077
2 Eyrikh2017_Belukha glacier 2000 0.02632597
3 Zheng2014_Mt. Oxford 2000 0.01377599
4 Zheng2014_Agassiz 2000 0.01940151
5 Zheng2014_NEEM-2010-S3 2000 -0.01483026
#merged database
m<-merge(df1, df2)
head(m,5)
CoreID Yr FluxTot GamfitHgdep
1 Beal2014_Yanacocha 2000 0.003365556 0.024941373
2 Beal2014_Yanacocha 2001 0.003423333 0.027831253
3 Beal2014_Yanacocha 2002 0.003481111 -0.002908330
4 Beal2014_Yanacocha 2003 0.003538889 -0.004591100
5 Beal2014_Yanacocha 2004 0.003596667 0.005189858
Below is the exact code I used to do the trick but failed. No difference if the value assigning part is replaced with any other solutions. No warning, no error raised.
library(readxl)
library(dplyr)
metal = 'Hg'
df = read_excel('All core data.xlsx','Sheet1')
df = data.frame(df)
df1 <- df[which (df$Metal==metal),]
rownames(df1) = seq(length=nrow(df1))
head(df1, 5)
dfgam = read_excel('GAM prediction.xlsx','Sheet1')
df2 <- data.frame(dfgam)
head(df2, 5)
for (i in 1:nrow(df2)){
coreid <- df2[i,'CoreID']
year <- df2[i, 'Yr']
predicted<- df2[i, 'GamfitHgdep']
df1[which (df1$CoreID == coreid & df1$Yr == year), 'FluxTot'] <- predicted
}
after running the code, the values in df1 have not changed, for instance
the value should be 0.024941373 as shown in head(m,5)
Since dplyr version 1.0.0, you can use rows_update for this:
# Overwrite the rows of df1 that have a match in df2, matching on ID and Yr.
# The new values are supplied under the name `value`, so `valuenew` is renamed
# first. rename() is namespace-qualified like rows_update() so the snippet
# also runs when dplyr has not been attached with library().
dplyr::rows_update(
  df1,
  dplyr::rename(df2, value = valuenew),
  by = c("ID", "Yr")
)
# ID Yr value
# 1 a 2000 200
# 2 b 2001 150
# 3 c 2002 120
# 4 d 2003 100
# 5 e 2004 100
We could use a join for this: For example left_join
library(dplyr)
# Join on BOTH keys. Joining on ID alone pairs a df2 row with every df1 row
# that shares the ID, whatever the year, and splits Yr into Yr.x / Yr.y.
left_join(df1, df2, by = c("ID", "Yr")) %>%
  # Take the new value where a match was found, otherwise keep the old one.
  mutate(value = coalesce(valuenew, value)) %>%
  select(ID, Yr, value)
ID Yr value
1 a 2000 200
2 b 2001 150
3 c 2002 120
4 d 2003 100
5 e 2004 100
Option using data.table:
df1 = data.frame(ID = c("a","b","c","d","e") ,
Yr = c(2000,2001,2002,2003,2004),
value= c(100,100,100,100, 100))
df2 = data.frame(ID = c("a","b","c") ,
Yr = c(2000,2001,2002),
valuenew= c(200,150,120))
library(data.table)
setDT(df1)[df2, value := i.valuenew, on = .(ID, Yr)]
df1
#> ID Yr value
#> 1: a 2000 200
#> 2: b 2001 150
#> 3: c 2002 120
#> 4: d 2003 100
#> 5: e 2004 100
Created on 2022-07-05 by the reprex package (v2.0.1)
Your example is working and updating df1 just fine.
However, to add one more solution, you can try the lines below without using a for loop or attaching extra packages:
# Build a lookup of new values named by their "ID Yr" key, then index it with
# df1's keys: rows of df1 without a match in df2 come back as NA.
key <- paste(df1$ID, df1$Yr)
# Use the full column name. `df2$value` only worked through `$` partial
# matching onto `valuenew`; that fails on tibbles and as soon as df2 gains a
# second column whose name starts with "value".
values <- setNames(df2$valuenew, paste(df2$ID, df2$Yr))[key]
# Overwrite only the matched rows; unmatched rows keep their original value.
df1$value[!is.na(values)] <- values[!is.na(values)]
Maybe something worth mentioning in general for your problem: make sure you don't have any duplicated ID/Yr combinations in df2...
EDIT:
Sorry, I was terrible at helping you! Providing just another working solution is not helpful at all. So here's my attempt to help you further.
First, check that you have the classes/types that you expect for the columns that you compare.
Next - usually I'd recommend placing a browser() in your code (e.g. before your assignment/last line in your example):
for (i in 1:nrow(df2)){
id <- df2[i,'ID']
year <- df2[i, 'Yr']
valuenew<- df2[i, 'valuenew']
browser()
df1[which (df1$ID == id & df1$Yr == year), 'value'] <- valuenew
}
This is especially helpful if you need to debug a function. However in your case you can step through your for loop manually, which is a bit simpler to handle:
Assign the first value to your iterator i <- 1 and run the code inside your for loop. Is which(df1$ID == id & df1$Yr == year) really returning what you expect?
If you can't find any issues, increment i by 1 and proceed with debugging...
You can try this for loop
# For each row of df1, look up the df2 row with the same ID *and* year and,
# if there is one, copy its new value across.
# seq_len() keeps the loop from running when df1 has no rows (1:0 would).
for (i in seq_len(nrow(df1))) {
  # Matching on Yr alone would also pick up other IDs observed in that year.
  y <- which(df2$ID == df1$ID[i] & df2$Yr == df1$Yr[i])
  # Use the first hit so a duplicated ID/Yr pair in df2 cannot feed several
  # values into a single cell.
  if (length(y) > 0) df1$value[i] <- df2$valuenew[y[1]]
}
Output
ID Yr value
1 a 2000 200
2 b 2001 150
3 c 2002 120
4 d 2003 100
5 e 2004 100

Delete duplicates with multiple grouping conditions

I want to delete duplicates with multiple grouping conditions but always get way less results than expected.
The dataframe compares two companies per year. Like this:
year
c1
c2
2000
a
b
2000
a
c
2000
a
d
2001
a
b
2001
b
d
2001
a
c
For every c1 I want to look at c2 and delete rows which are in the previous year.
I found a similar problem but with just one c. Here are some of my tries so far:
df<- df%>%
group_by(c1,c2) %>%
mutate(dup = n() > 1) %>%
group_split() %>%
map_dfr(~ if(unique(.x$dup) & (.x$year[2] - .x$year[1]) == 1) {
.x %>% slice_head(n = 1)
} else {
.x
}) %>%
select(-dup) %>%
arrange(year)
df<- sqldf("select a.*
from df a
left join df b on b.c1=a.c1 and b.c2 = a.c2 and b.year = a.year - 1
where b.year is null")
The desired output for the example would be:
year
c1
c2
2000
a
b
2000
a
c
2000
a
d
2001
b
d
Assuming you want to check duplicate in the previous year only. So showing it to you on a modified sample
library(tidyverse)
df <- read.table(header = T, text = 'year c1 c2
2000 a b
2000 a c
2000 a d
2001 a b
2001 b d
2001 a c
2002 a d')
# Keep a row only if the same c1/c2 pair does not occur in the previous year.
# paste() and %in% are vectorised, so one key per row is enough; there is no
# need to rebuild the full key vector for every row with map2_lgl().
df %>%
  filter(!paste(year - 1, c1, c2) %in% paste(year, c1, c2))
#> year c1 c2
#> 1 2000 a b
#> 2 2000 a c
#> 3 2000 a d
#> 4 2001 b d
#> 5 2002 a d
Created on 2021-07-08 by the reprex package (v2.0.0)
Some of the other solutions won't work because I think they ignore the fact that you will probably have many years and want to eliminate duplicates from only the prior.
Here is something fairly simple. You could do this in some map function or whatnot, but sometimes a simple loop does just fine. For each year of data, use anti_join() to return only those values from the current year which are not in the prior year. Then just restack the data.
# Split the data into one table per year.
df_split <- df %>%
  group_split(year)
# Filter each year against the UNFILTERED previous year. Overwriting df_split
# in place would compare year k with an already-thinned year k - 1, so a pair
# present in three consecutive years would wrongly reappear in the third.
# seq_along(...)[-1] is empty when there is only one year, whereas
# 2:length(df_split) would count down (2, 1) and index out of bounds.
# NOTE(review): "previous" here means the previous split, which is year - 1
# only when no year is missing from the data - confirm for your data.
df_kept <- df_split
for (this_year in seq_along(df_split)[-1]) {
  df_kept[[this_year]] <- df_split[[this_year]] %>%
    anti_join(df_split[[this_year - 1]], by = c("c1", "c2"))
}
bind_rows(df_kept)
# # A tibble: 4 x 3
# year c1 c2
# <int> <chr> <chr>
# 1 2000 a b
# 2 2000 a c
# 3 2000 a d
# 4 2001 b d
Edit
Another approach is to add a dummy column for the prior year and just use an anti_join() with that. This is probably what I would do.
# Drop every row whose c1/c2 pair also appears in the year before it.
df %>%
# Helper key: the year a duplicate would have to come from.
mutate(prior_year = year - 1) %>%
# Remove rows that match df on (prior_year == year, c1, c2).
anti_join(df, by = c(prior_year = "year", "c1", "c2")) %>%
# The helper column is no longer needed.
select(-prior_year)
You can also use the following solution.
library(dplyr)
library(purrr)
df %>%
filter(pmap_int(list(df$c1, df$c2, df$year), ~ df %>%
filter(year %in% c(..3, ..3 - 1)) %>%
rowwise() %>%
mutate(output = all(c(..1, ..2) %in% c_across(c1:c2))) %>%
pull(output) %>% sum) < 2)
# AnilGoyal's modified data set
year c1 c2
1 2000 a b
2 2000 a c
3 2000 a d
4 2001 b d
5 2002 a d
This will only keep the data you want.
Here, data is your data frame.
data[!duplicated(data[,2:3]),]
I think this is pretty simple with base duplicated, using the fromLast option to get the last rather than the first entry. (It does assume the data are ordered by year.)
dat[!duplicated(dat[2:3], fromLast=TRUE), ] # negate logical vector in i-position
year c1 c2
3 2000 a d
4 2001 a b
5 2001 b d
6 2001 a c
I do get a different result than you said was expected so maybe I misunderstood the specifications?
Assuming, that you indeed wanted to keep your last year, as stated in the question, but contrary to your example table, you could simply use slice:
library(dplyr)
df = data.frame(year=c("2000","2000","2000","2001","2001","2001"),
c1 = c("a","a","a","a","b","a"),c2=c("b","c","d","b","d","c"))
df %>% group_by(c1,c2) %>%
slice_tail() %>%arrange(year,c1,c2)
Use slice_head(), if you wanted the first year.
Here is the documentation: slice

R join tables by variable with multiple year observations

I have multiple tables all with the same variable names that I want to join by an ID, but each table represents another year. If I use an inner.join, it will correctly only keep those IDs in each table, but it will then create new variables for observations (i.e. X becomes X.x and X.y in the same row). I could use rbind, but that would keep all the data when I only want those that appear in each table.
library(dplyr)
df1 <- data.frame(x1 = 1:3,
x2 = c(12,14,11),
year = 2020)
df2 <- data.frame(x1 = 2:4,
x2 = c(15,17,13),
year = 2021)
dfall <- inner_join(df1,df2,by="x1")
This results in:
x1 x2.x year.x x2.y year.y
2 14 2020 15 2021
3 11 2020 17 2021
But I want this:
x1 x2 year
2 14 2020
2 15 2021
3 11 2020
3 17 2021
Is there a join where I can do this?
dplyr::bind_rows and then filter would work:
# Stack both tables, then keep only rows whose x1 occurs in df1 AND in df2.
bind_rows(df1, df2) %>%
filter(x1 %in% intersect(df1$x1, df2$x1))
You can pipe the output to arrange(x1) to sort the output if needed.
Output
x1 x2 year
1 2 14 2020
2 3 11 2020
3 2 15 2021
4 3 17 2021
library(tidyr) # pivot_longer
# Join on x1 (yielding x2.x, x2.y, year.x, year.y), then fold the suffixed
# columns back into one x2 and one year column, one row per source table.
inner_join(df1,df2,by="x1") %>%
# The pattern splits each name at its last dot: the part before it names the
# output column (".value"), the suffix after it goes to a helper column "val".
pivot_longer(-x1, names_pattern="(.*)\\.(.*)",
names_to=c(".value", "val")) %>%
# The suffix column was only needed for the reshape.
select(-val)
# # A tibble: 4 x 3
# x1 x2 year
# <int> <dbl> <dbl>
# 1 2 14 2020
# 2 2 15 2021
# 3 3 11 2020
# 4 3 17 2021
Try this. It's an inner join of your two approaches so far.
# Stack both years, then keep only the x1 values present in BOTH tables.
# semi_join() filters without adding columns or duplicating rows when x1
# repeats, and the explicit `by` avoids relying on the implicit join key.
dfall <- rbind(df1, df2) %>%
  semi_join(df1, by = "x1") %>%
  semi_join(df2, by = "x1")
Here's another option. It creates a column n which is equal to the number of times that each x1 appears, and then keeps only those which appear as many times as there are distinct values for year. You could change n==length(unique(year)) to n>=2 if you wanted any records that appear in more than one year/table, as opposed to those which appear in every year/table. This one is nice because it is easy to scale up to a large number of input tables.
# Stack the tables and keep the x1 values that occur once per distinct year.
dfall <- rbind(df1, df2) %>%
# n = number of rows sharing this x1 across the stacked tables.
add_count(x1) %>%
# unique(year) is evaluated over the whole stacked table here.
filter(n==length(unique(year))) %>%
# Drop the helper count column.
select(-n)

Subset a dataframe, calculate the mean and populate a dataframe in a loop in R

I have a set of 85 possible combinations from two variables, one with five values (years) and one with 17 values (locations). I make a dataframe that has the years in the first column and the locations in the second column. For each combination of year and location I want to calculate the weighted mean value and then add it to the third column, according to the year and location values.
My code is as follows:
for (i in unique(data1$year)) {
for (j in unique(data1$location)) {
data2 <- crossing(data1$year, data1$location)
dataname <- subset(data1, year %in% i & location %in% j)
result <- weighted.mean(dataname$length, dataname$raising_factor, na.rm = T)
}
}
The result I get puts the last calculated mean in the third column for each row.
How can I get it to add according to matching year and location combination?
thanks.
A base R option would be by
# Split the x/y columns by every group/year combination and compute, for each
# piece, the mean of its first column (x) weighted by its second column (y).
by(df[c('x', 'y')], df[c('group', 'year')],
function(x) weighted.mean(x[,1], x[,2]))
Based on @LAP's example
As @A.Suleiman suggested, we can use dplyr::group_by.
Example data:
df <- data.frame(group = rep(letters[1:5], each = 4),
year = rep(2001:2002, 10),
x = 1:20,
y = rep(c(0.3, 1, 1/0.3, 0.4), each = 5))
library(dplyr)
# One row per group/year combination holding the mean of x weighted by y.
df %>%
group_by(group, year) %>%
summarise(test = weighted.mean(x, y))
# A tibble: 10 x 3
# Groups: group [?]
group year test
<fctr> <int> <dbl>
1 a 2001 2.000000
2 a 2002 3.000000
3 b 2001 6.538462
4 b 2002 7.000000
5 c 2001 10.538462
6 c 2002 11.538462
7 d 2001 14.000000
8 d 2002 14.214286
9 e 2001 18.000000
10 e 2002 19.000000

How to apply a set of functions to each group of a grouping variable in R data.frame

I need to reshape data.frame in R in one step.
In short, change of values of objects (x1 to x6) is visible row by row (from 1990 to 1995):
> tab1[1:10, ] # raw data see plot for tab1
id value year
1 x1 7 1990
2 x1 10 1991
3 x1 11 1992
4 x1 7 1993
5 x1 3 1994
6 x1 1 1995
7 x2 6 1990
8 x2 7 1991
9 x2 9 1992
10 x2 5 1993
I am able to do the reshaping step by step; does anybody know how to do it in one step?
Original data
Table 1 - see that minimal value from all timeseries is "0"
Step1:
Table 2 - rescale each time series so that its minimum value equals "0".
All time series drop down to the x-axis.
Step2:
Table 3 - apply diff() function on each timeline.
Step3:
Table 4 - apply sort() function on each timeseries.
I hope the pictures are clear enough for understanding each step.
So final table looks like this:
> tab4[1:10, ]
id value time
1 x1 -4 1
2 x1 -4 2
3 x1 -2 3
4 x1 1 4
5 x1 3 5
6 x2 -4 1
7 x2 -3 2
8 x2 1 3
9 x2 1 4
10 x2 2 5
# Source data:
tab1 <- data.frame(id = rep(c("x1","x2","x3","x4","x5","x6"), each = 6),
value = c(7,10,11,7,3,1,6,7,9,5,2,3,11,9,7,9,1,
0,1,2,2,4,7,4,2,3,1,6,4,2,3,5,4,3,5,6),
year = rep(c(1990:1995), times = 6))
tab2 <- data.frame(id = rep(c("x1","x2","x3","x4","x5","x6"), each = 6),
value = c(6,9,10,6,2,0,4,5,7,3,0,1,11,9,7,9,1,0,
0,1,1,3,6,3,1,2,0,5,3,1,0,2,1,0,2,3),
year = rep(c(1990:1995), times = 6))
tab3 <- data.frame(id = rep(c("x1","x2","x3","x4","x5","x6"), each = 5),
value = c(3,1,-4,-4,-2,1,2,-4,-3,1,-2,-2,2,-8,-1,
1,0,2,3,-3,1,-2,5,-2,-2,2,-1,-1,2,1),
time = rep(c(1:5), times = 6))
tab4 <- data.frame(id = rep(c("x1","x2","x3","x4","x5","x6"), each = 5),
value = c(-4,-4,-2,1,3,-4,-3,1,1,2,-8,-2,-2,-1,2,
-3,0,1,2,3,-2,-2,-2,1,5,-1,-1,1,2,2),
time = rep(c(1:5), times = 6))
Using data.table, this is simply:
# library() stops with an error if data.table is missing; require() would only
# return FALSE and let the next line fail with a less helpful message.
library(data.table) ## 1.9.2
# setDT() converts tab1 to a data.table by reference (tab1 itself is changed).
# diff(value) per id gives the row-to-row change: one row fewer per id.
ans <- setDT(tab1)[, list(value = diff(value)), by = id] ## aggregation
# setkey() sorts ans by id, then value; time then numbers the sorted rows
# 1..N within each id.
setkey(ans, id, value)[, time := seq_len(.N), by = id] ## order + add 'time' column
Note that your 'step 1' is unnecessary as your second step is calculating difference and it wouldn't have any effect (and is therefore skipped here).
It sounds like you want to apply a set of functions to each group of a grouping variable. There are many ways to do this in R (from base R by and tapply to add-on packages like plyr, data.table, and dplyr). I've been learning how to use package dplyr, and came up with the following solution.
# library() fails loudly when dplyr is missing; require() only returns FALSE.
library(dplyr)
# NOTE(review): lag() compares each row with the one before it, so this
# assumes tab1 is already in year order within each id - confirm.
tab4 <- tab1 %>%
  group_by(id) %>%
  mutate(
    value = value - min(value),  # shift each id's series so its minimum is 0
    value = value - lag(value)   # lag-1 difference; first row per id is NA
  ) %>%
  na.omit() %>%                  # drop the NA rows created by the differencing
  arrange(id, value) %>%         # order by value within each id
  mutate(time = row_number()) %>%  # 1..n per id in the new order
  ungroup() %>%                  # do not leak the grouping to later code
  select(-year)                  # remove year to match the requested output

Resources