Arithmetic operation between group by two - r

Using the following data:
set.seed(1234)
df1 <- structure(
list(wavelength = c(400, 400, 400, 400, 400, 400, 400, 400, 500, 500, 500, 500, 500, 500, 500, 500),
depth = c(0, 30, 40, 60, 79, 89, 101, 110, 0, 30, 40, 60, 79, 89, 101, 110),
value = sample(16)),
class = "data.frame", row.names = c(NA, -16L), .Names = c("wavelength", "depth", "value"))
df1
#> wavelength depth value
#> 1 400 0 2
#> 2 400 30 10
#> 3 400 40 9
#> 4 400 60 14
#> 5 400 79 11
#> 6 400 89 8
#> 7 400 101 1
#> 8 400 110 3
#> 9 500 0 6
#> 10 500 30 4
#> 11 500 40 5
#> 12 500 60 13
#> 13 500 79 16
#> 14 500 89 12
#> 15 500 101 15
#> 16 500 110 7
How is it possible to group the data by wavelength and then calculate res in such way that it represents an arithmetic operation between pairs of value. In this example, the res is simply the sum of square between pairs. res[1] is simply 2^2 + 10^2 and res[2] is 10^2 + 9^2 and so on.
df2 <- structure(
list(wavelength = c(400, 400, 400, 400, 400, 400, 400, 500, 500, 500, 500, 500, 500, 500),
depth = rep(c("0-30", "30-40", "40-60", "60-79", "79-89", "89-101", "101-110"), 2),
res = c(104, 181, 277, 317, 185, 65, 45, 52, 41, 194, 425, 400, 369, 274)),
class = "data.frame", row.names = c(NA, -14L), .Names = c("wavelength", "depth", "res"))
df2
#> wavelength depth res
#> 1 400 0-30 104
#> 2 400 30-40 181
#> 3 400 40-60 277
#> 4 400 60-79 317
#> 5 400 79-89 185
#> 6 400 89-101 65
#> 7 400 101-110 45
#> 8 500 0-30 52
#> 9 500 30-40 41
#> 10 500 40-60 194
#> 11 500 60-79 425
#> 12 500 79-89 400
#> 13 500 89-101 369
#> 14 500 101-110 274
Ideally, the answer would use the dplyr syntax.
Update
Based on received answers I came up with this solution.
f1 <- function(x, y) {
return(x^2 + y^2)
}
df1 %>%
group_by(wavelength) %>%
mutate(depth = paste(depth, lead(depth), sep = "-")) %>%
mutate(res = f1(value, c(lead(value)))) %>%
na.omit()
#> Source: local data frame [14 x 4]
#> Groups: wavelength [2]
#>
#> wavelength depth value res
#> <dbl> <chr> <int> <dbl>
#> 1 400 0-30 2 104
#> 2 400 30-40 10 181
#> 3 400 40-60 9 277
#> 4 400 60-79 14 317
#> 5 400 79-89 11 185
#> 6 400 89-101 8 65
#> 7 400 101-110 1 10
#> 8 500 0-30 6 52
#> 9 500 30-40 4 41
#> 10 500 40-60 5 194
#> 11 500 60-79 13 425
#> 12 500 79-89 16 400
#> 13 500 89-101 12 369
#> 14 500 101-110 15 274

After grouping by 'wavelength', create the 'depth' column by pasteing the 'depth' with the 'lead' of 'depth' and 'value' by the difference of adjacent elements (diff), then remove the NA elements with na.omit
library(dplyr)
df1 %>%
group_by(wavelength) %>%
mutate(depth = paste(depth, lead(depth), sep="-"),
value = c(diff(value), NA)) %>% na.omit()
# wavelength depth value
# <dbl> <chr> <int>
#1 400 0-30 8
#2 400 30-40 -1
#3 400 40-60 5
#4 400 60-79 -3
#5 400 79-89 -3
#6 400 89-101 -7
#7 400 101-110 2
#8 500 0-30 -2
#9 500 30-40 1
#10 500 40-60 8
#11 500 60-79 3
#12 500 79-89 -4
#13 500 89-101 3
#14 500 101-110 -8

Related

Divide columns by a reference row

I need to divide columns despesatotal and despesamonetaria by the row named Total:
Lets suppose your data set is df.
# 1) Delete the last row
df <- df[-nrow(df),]
# 2) Build the desired data.frame [combining the CNAE names and the proportion columns
new.df <- cbind(grup_CNAE = df$grup_CNAE,
100*prop.table(df[,-1],margin = 2))
Finally, rename your columns. Be careful with the matrix or data.frame formats, because sometimes mathematical operations may suppose a problem. If you you use dput function in order to give us a reproducible example, the answer would be more accurate.
Here is a way to get it done. This is not the best way, but I think it is very readable.
Suppose this is your data frame:
mydf = structure(list(grup_CNAE = c("A", "B", "C", "D", "E", "Total"
), despesatotal = c(71, 93, 81, 27, 39, 311), despesamonetaria = c(7,
72, 36, 22, 73, 210)), row.names = c(NA, -6L), class = "data.frame")
mydf
# grup_CNAE despesatotal despesamonetaria
#1 A 71 7
#2 B 93 72
#3 C 81 36
#4 D 27 22
#5 E 39 73
#6 Total 311 210
To divide despesatotal values with its total value, you need to use the total value (311 in this example) as the denominator. Note that the total value is located in the last row. You can identify its position by indexing the despesatotal column and use nrow() as the index value.
mydf |> mutate(percentage1 = despesatotal/despesatotal[nrow(mydf)],
percentage2 = despesamonetaria /despesamonetaria[nrow(mydf)])
# grup_CNAE despesatotal despesamonetaria percentage1 percentage2
#1 A 71 7 0.22829582 0.03333333
#2 B 93 72 0.29903537 0.34285714
#3 C 81 36 0.26045016 0.17142857
#4 D 27 22 0.08681672 0.10476190
#5 E 39 73 0.12540193 0.34761905
#6 Total 311 210 1.00000000 1.00000000
library(tidyverse)
Sample data
# A tibble: 11 x 3
group despesatotal despesamonetaria
<chr> <int> <int>
1 1 198 586
2 2 186 525
3 3 202 563
4 4 300 562
5 5 126 545
6 6 215 529
7 7 183 524
8 8 163 597
9 9 213 592
10 10 175 530
11 Total 1961 5553
df %>%
mutate(percentage_total = despesatotal / last(despesatotal),
percentage_monetaria = despesamonetaria/ last(despesamonetaria)) %>%
slice(-nrow(.))
# A tibble: 10 x 5
group despesatotal despesamonetaria percentage_total percentage_monetaria
<chr> <int> <int> <dbl> <dbl>
1 1 198 586 0.101 0.106
2 2 186 525 0.0948 0.0945
3 3 202 563 0.103 0.101
4 4 300 562 0.153 0.101
5 5 126 545 0.0643 0.0981
6 6 215 529 0.110 0.0953
7 7 183 524 0.0933 0.0944
8 8 163 597 0.0831 0.108
9 9 213 592 0.109 0.107
10 10 175 530 0.0892 0.0954
This is a good place to use dplyr::mutate(across()) to divide all relevant columns by the Total row. Note this is not sensitive to the order of the rows and will apply the manipulation to all numeric columns. You can supply any tidyselect semantics to across() instead if needed in your case.
library(tidyverse)
# make sample data
d <- tibble(grup_CNAE = paste0("Group", 1:12),
despesatotal = sample(1e6:5e7, 12),
despesamonetaria = sample(1e6:5e7, 12)) %>%
add_row(grup_CNAE = "Total", summarize(., across(where(is.numeric), sum)))
# divide numeric columns by value in "Total" row
d %>%
mutate(across(where(is.numeric), ~./.[grup_CNAE == "Total"]))
#> # A tibble: 13 × 3
#> grup_CNAE despesatotal despesamonetaria
#> <chr> <dbl> <dbl>
#> 1 Group1 0.117 0.0204
#> 2 Group2 0.170 0.103
#> 3 Group3 0.0451 0.0837
#> 4 Group4 0.0823 0.114
#> 5 Group5 0.0170 0.0838
#> 6 Group6 0.0174 0.0612
#> 7 Group7 0.163 0.155
#> 8 Group8 0.0352 0.0816
#> 9 Group9 0.0874 0.135
#> 10 Group10 0.113 0.0877
#> 11 Group11 0.0499 0.0495
#> 12 Group12 0.104 0.0251
#> 13 Total 1 1
Created on 2022-11-08 with reprex v2.0.2

How to calculate the ratio between column(2) and column(3) while the value of column(1) is the same in R?

Consider this dataset:
example = data.frame("person"= rep(c(1:3), each = 4),
"expected income" = c(seq(140,250,10)),
"income" = c(seq(110,220,10)))
print(example)
I need to calculate the ratio between column(2) in year(i) and column(3) in year (i+1).
Furthermore the ratio has to be done only when the "person" (col1) is the same.
Instead of the ratio between the "expected income" and the "income" of two different people I need an NA.
It has to be done generically since it is just a semplification of a dataset with more than 60000 rows.
Are you looking for:
example %>%
group_by(person) %>%
mutate(ratio = lag(expected.income)/income) %>%
ungroup()
# A tibble: 12 x 4
person expected.income income ratio
<int> <dbl> <dbl> <dbl>
1 1 140 110 NA
2 1 150 120 1.17
3 1 160 130 1.15
4 1 170 140 1.14
5 2 180 150 NA
6 2 190 160 1.12
7 2 200 170 1.12
8 2 210 180 1.11
9 3 220 190 NA
10 3 230 200 1.1
11 3 240 210 1.10
12 3 250 220 1.09
Alternative:
example %>%
group_by(person) %>%
mutate(ratio = expected.income/lead(income)) %>%
ungroup()
# A tibble: 12 x 4
person expected.income income ratio
<int> <dbl> <dbl> <dbl>
1 1 140 110 1.17
2 1 150 120 1.15
3 1 160 130 1.14
4 1 170 140 NA
5 2 180 150 1.12
6 2 190 160 1.12
7 2 200 170 1.11
8 2 210 180 NA
9 3 220 190 1.1
10 3 230 200 1.10
11 3 240 210 1.09
12 3 250 220 NA
It carries the same information, just in different rows now.

Find closest multiple above and below value R

I am trying to determine the values above and below a variable that fall on a multiple, preferably in a dplyr::mutate.
In this simplified example. I want to determine the multiple of 50 both above and below my values of x. I am under the impression cut is what I should use, but I haven't gotten it to work.
df <- data.frame(
x = c(265, 617, 88, 99, 143, 378)
)
x
1 265
2 617
3 88
4 99
5 143
6 378
desired_result <- data.frame(
x = c(265, 617, 88, 99, 143, 378),
above = c(300, 650, 100, 100, 150, 400),
below = c(250, 600, 50, 50, 100, 350)
)
x above below
1 265 300 250
2 617 650 600
3 88 100 50
4 99 100 50
5 143 150 100
6 378 400 350
df$above = ceiling(df$x/50)*50
df$below = floor(df$x/50)*50
x above below
1 265 300 250
2 617 650 600
3 88 100 50
4 99 100 50
5 143 150 100
6 378 400 350
You could use the modulus operator %%:
df %>%
mutate(above = x - (x %% 50) + 50,
below = x - (x %% 50))
Output:
x above below
1 265 300 250
2 617 650 600
3 88 100 50
4 99 100 50
5 143 150 100
6 378 400 350

conditional Substacting numbers

I have data frame like this
test <- data.frame(gr=rep(letters[1:2],each=6),No=c(100:105,200:205))
gr No
1 a 100
2 a 101
3 a 102
4 a 103
5 a 104
6 a 105
7 b 200
8 b 201
9 b 202
10 b 203
11 b 204
12 b 205
in the No column the numbers are increasing in each gr. I need to sum gr a with 100 and b with 50 and need to have consecutive decrease after this operation.
I would like to have a new column that consecutive decrease with this increase. So I tried
decrese_func <- function(No,gr){
if(any(gr=="a")){
No+100
}
else
No+50
}
test%>%
group_by(gr)%>%
mutate(new_column=decrese_func(No,gr))
# A tibble: 12 x 3
# Groups: gr [2]
gr No new_column
<fct> <int> <dbl>
1 a 100 200
2 a 101 201
3 a 102 202
4 a 103 203
5 a 104 204
6 a 105 205
7 b 200 250
8 b 201 251
9 b 202 252
10 b 203 253
11 b 204 254
12 b 205 255
but what I need is like this
gr No new_column
<fct> <int> <dbl>
1 a 100 200
2 a 101 199
3 a 102 198
4 a 103 197
5 a 104 196
6 a 105 195
7 b 200 250
8 b 201 249
9 b 202 248
10 b 203 247
11 b 204 246
12 b 205 245
I cannot figure it out how to have consecutive decrease ?
Thx.
Not the most elegant answer but in the mean time, this may work:
library(dplyr)
test %>%
mutate(A = case_when(gr == "a" ~ 100,
gr == "b" ~ 50,
TRUE ~ NA_real_)) %>%
group_by(gr) %>%
mutate(B = (1:NROW(gr) - 1) * 2,
New_Column = No + A - B)
# A tibble: 12 x 5
# Groups: gr [2]
gr No A B New_Column
<fct> <int> <dbl> <dbl> <dbl>
1 a 100 100 0 200
2 a 101 100 2 199
3 a 102 100 4 198
4 a 103 100 6 197
5 a 104 100 8 196
6 a 105 100 10 195
7 b 200 50 0 250
8 b 201 50 2 249
9 b 202 50 4 248
10 b 203 50 6 247
11 b 204 50 8 246
12 b 205 50 10 245
Add select(gr, No, New_Column) at the end of the chain to get gr, No and New_Column only. I left the other columns just to show what's going on.
And if you want to wrap it into a function you could do something like:
desc_func <- function(group_var, condition, if_true_add, if_false_add, to_number) {
ifelse(
group_var == condition,
to_number + if_true_add - (1:NROW(group_var) - 1) * 2,
to_number + if_false_add - (1:NROW(group_var) - 1) * 2)
}
test %>%
group_by(gr) %>%
mutate(test_var = desc_func(gr, "a", 100, 50, No))
# A tibble: 12 x 3
# Groups: gr [2]
gr No test_var
<fct> <int> <dbl>
1 a 100 200
2 a 101 199
3 a 102 198
4 a 103 197
5 a 104 196
6 a 105 195
7 b 200 250
8 b 201 249
9 b 202 248
10 b 203 247
11 b 204 246
12 b 205 245
Here is a way to do this in base R
test$New <- with(test, No + c(100, 50)[cumsum(!duplicated(gr))] - 2*(No %% 100))
test$New
#[1] 200 199 198 197 196 195 250 249 248 247 246 245
Or a slight variation with match
with(test, No + c(100, 50)[match(gr, unique(gr))] - 2*(No %% 100))

Rank() in R excluding zeros

I am trying to duplicated "manually" the example in this Wikipedia post using R.
Here is the data:
after = c(125, 115, 130, 140, 140, 115, 140, 125, 140, 135)
before = c(110, 122, 125, 120, 140, 124, 123, 137, 135, 145)
sgn = sign(after-before)
abs = abs(after - before)
d = data.frame(after,before,sgn,abs)
after before sgn abs
1 125 110 1 15
2 115 122 -1 7
3 130 125 1 5
4 140 120 1 20
5 140 140 0 0
6 115 124 -1 9
7 140 123 1 17
8 125 137 -1 12
9 140 135 1 5
10 135 145 -1 10
If I try to rank the rows based on the abs column, the 0 entry is naturally ranked as 1:
rank = rank(abs)
(d = data.frame(after,before,sgn,abs,rank))
after before sgn abs rank
1 125 110 1 15 8.0
2 115 122 -1 7 4.0
3 130 125 1 5 2.5
4 140 120 1 20 10.0
5 140 140 0 0 1.0
6 115 124 -1 9 5.0
7 140 123 1 17 9.0
8 125 137 -1 12 7.0
9 140 135 1 5 2.5
10 135 145 -1 10 6.0
However, zeros are ignored in the Wilcoxon signed-test.
How can I get R to ignore that row, so as to end up with:
after before sgn abs rank
1 125 110 1 15 7.0
2 115 122 -1 7 3.0
3 130 125 1 5 1.5
4 140 120 1 20 9.0
5 140 140 0 0 0
6 115 124 -1 9 4.0
7 140 123 1 17 8.0
8 125 137 -1 12 6.0
9 140 135 1 5 1.5
10 135 145 -1 10 5.0
SOLUTION (accepted answer below):
after = c(125, 115, 130, 140, 140, 115, 140, 125, 140, 135)
before = c(110, 122, 125, 120, 140, 124, 123, 137, 135, 145)
sgn = sign(after-before)
abs = abs(after - before)
d = data.frame(after,before,sgn,abs)
d$rank = rank(replace(abs,abs==0,NA), na='keep')
d$multi = d$sgn * d$rank
(W=abs(sum(d$multi, na.rm = T)))
9
From the Wikipedia article:
Exclude pairs with |x2,i − x1,i| = 0. Let Nr be the reduced sample size.
We need to exclude zeroes. By my thinking, you should replace zeroes with NA, and then specify to rank() that you want to exclude NAs from consideration for ranking. Since you need to return a vector of the same length as the input, you can specify 'keep' as the argument:
d$rank <- rank(replace(abs,abs==0,NA),na='keep');
d;
## after before sgn abs rank
## 1 125 110 1 15 7.0
## 2 115 122 -1 7 3.0
## 3 130 125 1 5 1.5
## 4 140 120 1 20 9.0
## 5 140 140 0 0 NA
## 6 115 124 -1 9 4.0
## 7 140 123 1 17 8.0
## 8 125 137 -1 12 6.0
## 9 140 135 1 5 1.5
## 10 135 145 -1 10 5.0
The subtraction-based solutions will not work if the input vector contains zero zeroes or multiple zeroes.
You could create the new column and then just update the rank where the abs value isn't 0
d$rank <- 0 # default value for rows with abs=0
d$rank[d$abs!=0] <- rank(d$abs[d$abs!=0])
If you wanted to drop the row completely, you could just do
transform(subset(d, abs!=0), rank=rank(abs))
A quick way to do it would be to rank as normal and then do:
d$rank <- ifelse(d$rank == 1, 0, d$rank - 1)
This switches all ranks of 1 to 0, and reduces any other ranks by 1.

Resources