I am new to R. I have a large dataframe with millions of rows that looks like below:
Whole code1 P_1 Q_1 code2 P_2 Q_2 code3 P_3 Q_3
64 a 0.2 0.1 b 0.3 0.2 d 0.1 0.9
55 a 0.5 0.3 c 0.1 0.3 b 0.4 0.4
70 b 0.4 0.1 d 0.2 0.5 NULL 0.7 0.7
26 c 0.7 0.5 a 0.2 0.6 b 0.2 0.2
47 a 0.8 0.7 d 0.1 0.2 NULL 0.6 0.8
35 d 0.2 0.8 b 0.8 0.1 a 0.2 0.1
I am looking for three output fields depending on the values in code1, code2, and code3.
> Output1 : If code1 is 'a' or 'b', then Output1 = Whole*P_1, else Output1 = Whole* Q_1
> Output2 : If code1 is 'a' or 'b', then Output2 = Whole*P_2, else Output2 = Whole* Q_2
> Output3 : If code1 is 'a' or 'b', then Output3 = Whole*P_3, else Output3 = Whole* Q_3
Would appreciate if this code below could be corrected:
df1 %>%
for (i in 1:6) {
if (paste0("code", i) %in% c("a", "b")) {
mutate (paste0("Output", i) = Whole * paste0("P_", i) )
} else {
mutate (paste0("Output", i) = Whole * paste0("Q_", i) )
}
}
library(dplyr)
# code1 alone decides the weight for all three outputs: rows whose code1 is
# 'a' or 'b' are scaled by the P_* column, every other row by the Q_* column.
df1 %>%
  mutate(
    Output1 = if_else(condition = code1 %in% c('a', 'b'), true = P_1, false = Q_1) * Whole,
    Output2 = if_else(condition = code1 %in% c('a', 'b'), true = P_2, false = Q_2) * Whole,
    Output3 = if_else(condition = code1 %in% c('a', 'b'), true = P_3, false = Q_3) * Whole
  )
# Whole code1 P_1 Q_1 code2 P_2 Q_2 code3 P_3 Q_3 Output1 Output2 Output3
# 1 64 a 0.2 0.1 b 0.3 0.2 d 0.1 0.9 12.8 19.2 6.4
# 2 55 a 0.5 0.3 c 0.1 0.3 b 0.4 0.4 27.5 5.5 22.0
# 3 70 b 0.4 0.1 d 0.2 0.5 NULL 0.7 0.7 28.0 14.0 49.0
# 4 26 c 0.7 0.5 a 0.2 0.6 b 0.2 0.2 13.0 15.6 5.2
# 5 47 a 0.8 0.7 d 0.1 0.2 NULL 0.6 0.8 37.6 4.7 28.2
# 6 35 d 0.2 0.8 b 0.8 0.1 a 0.2 0.1 28.0 3.5 3.5
If your data is more generic (not hard-coded or many more than "3" sets of columns), then we can reshape the data, do the assignment, and shape it back.
library(tidyr)
# Reshape to one row per (original row, set), compute Output once, then widen
# back to the original layout.
#
# `row_id` uniquely identifies each original row. `Whole` by itself is not a
# key: if two rows share the same `Whole`, pivot_wider() cannot tell them
# apart, warns that values are not uniquely identified, and returns
# list-columns. The helper column is dropped again at the end.
df1 %>%
  mutate(row_id = row_number()) %>%
  # code1 -> code_1, so every measured column splits as "<name>_<set>"
  rename_with(~ gsub("(\\D+)", "\\1_", .x), starts_with("code")) %>%
  pivot_longer(
    -c(row_id, Whole),
    names_to = c(".value", "set"),
    names_sep = "_"
  ) %>%
  # Each set uses its own code column to choose between P and Q.
  mutate(Output = Whole * if_else(code %in% c("a", "b"), P, Q)) %>%
  pivot_wider(
    id_cols = c(row_id, Whole),
    names_from = set,
    values_from = c(code, P, Q, Output),
    names_sep = "_"
  ) %>%
  select(-row_id)
# # A tibble: 6 x 13
# Whole code_1 code_2 code_3 P_1 P_2 P_3 Q_1 Q_2 Q_3 Output_1 Output_2 Output_3
# <int> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 64 a b d 0.2 0.3 0.1 0.1 0.2 0.9 12.8 19.2 57.6
# 2 55 a c b 0.5 0.1 0.4 0.3 0.3 0.4 27.5 16.5 22
# 3 70 b d NULL 0.4 0.2 0.7 0.1 0.5 0.7 28 35 49
# 4 26 c a b 0.7 0.2 0.2 0.5 0.6 0.2 13 5.2 5.2
# 5 47 a d NULL 0.8 0.1 0.6 0.7 0.2 0.8 37.6 9.4 37.6
# 6 35 d b a 0.2 0.8 0.2 0.8 0.1 0.1 28 28 7
As a side note, in general I would recommend keeping it in the "long" format and not re-widening it. This "long" format is often preferred for other tidy-like functions (including ggplot2), and is easily extended to arbitrary counts. That would result in data like this:
# Long layout: one row per original row and set, with that set's code, P, Q
# and the resulting Output side by side.
df1 %>%
  # code1 -> code_1, matching the "<name>_<set>" pattern of the P_/Q_ columns
  rename_with(~ gsub("(\\D+)", "\\1_", .x), starts_with("code")) %>%
  pivot_longer(
    cols = -Whole,
    names_sep = "_",
    names_to = c(".value", "set")
  ) %>%
  mutate(Output = if_else(code %in% c("a", "b"), P, Q) * Whole)
# # A tibble: 18 x 6
# Whole set code P Q Output
# <int> <chr> <chr> <dbl> <dbl> <dbl>
# 1 64 1 a 0.2 0.1 12.8
# 2 64 2 b 0.3 0.2 19.2
# 3 64 3 d 0.1 0.9 57.6
# 4 55 1 a 0.5 0.3 27.5
# 5 55 2 c 0.1 0.3 16.5
# 6 55 3 b 0.4 0.4 22
# 7 70 1 b 0.4 0.1 28
# 8 70 2 d 0.2 0.5 35
# 9 70 3 NULL 0.7 0.7 49
# 10 26 1 c 0.7 0.5 13
# 11 26 2 a 0.2 0.6 5.2
# 12 26 3 b 0.2 0.2 5.2
# 13 47 1 a 0.8 0.7 37.6
# 14 47 2 d 0.1 0.2 9.4
# 15 47 3 NULL 0.6 0.8 37.6
# 16 35 1 d 0.2 0.8 28
# 17 35 2 b 0.8 0.1 28
# 18 35 3 a 0.2 0.1 7
(Much shorter.)
We can use map2. Get the names of the columns that start with 'P_' or 'Q_' followed by digits, then loop over the corresponding column pairs with map2, apply the transformation logic, and bind the resulting columns to the original dataset.
library(dplyr)
library(purrr)
library(stringr)
# Names of the P_<digits> and Q_<digits> columns, in the order they occur in
# df1; the two vectors are walked in parallel below.
ps <- names(df1)[str_detect(names(df1), "^P_\\d+$")]
qs <- names(df1)[str_detect(names(df1), "^Q_\\d+$")]
# For each (P, Q) pair, build a one-column frame called `Output`: the P column
# where code1 is 'a' or 'b', the Q column otherwise, times Whole. .x/.y hold
# the column names as strings, so they are turned into symbols and unquoted.
# map2_dfc() column-binds the per-pair results.
map2_dfc(ps, qs, ~ df1 %>%
transmute(Output = Whole *
case_when(code1 %in% c('a', 'b') ~ !! rlang::sym(.x),
TRUE ~ !! rlang::sym(.y)))) %>%
# Remove the literal "..." from the column names. NOTE(review): presumably
# this is the suffix name repair adds to the duplicated `Output` columns
# (Output...1 -> Output1) -- confirm with the installed purrr/dplyr version.
rename_all(~ str_remove(., fixed("..."))) %>%
# Append the Output columns to the right of the original data.
bind_cols(df1, .)
# Whole code1 P_1 Q_1 code2 P_2 Q_2 code3 P_3 Q_3 Output1 Output2 Output3
#1 64 a 0.2 0.1 b 0.3 0.2 d 0.1 0.9 12.8 19.2 6.4
#2 55 a 0.5 0.3 c 0.1 0.3 b 0.4 0.4 27.5 5.5 22.0
#3 70 b 0.4 0.1 d 0.2 0.5 NULL 0.7 0.7 28.0 14.0 49.0
#4 26 c 0.7 0.5 a 0.2 0.6 b 0.2 0.2 13.0 15.6 5.2
#5 47 a 0.8 0.7 d 0.1 0.2 NULL 0.6 0.8 37.6 4.7 28.2
#6 35 d 0.2 0.8 b 0.8 0.1 a 0.2 0.1 28.0 3.5 3.5
data
df1 <- structure(list(Whole = c(64L, 55L, 70L, 26L, 47L, 35L), code1 = c("a",
"a", "b", "c", "a", "d"), P_1 = c(0.2, 0.5, 0.4, 0.7, 0.8, 0.2
), Q_1 = c(0.1, 0.3, 0.1, 0.5, 0.7, 0.8), code2 = c("b", "c",
"d", "a", "d", "b"), P_2 = c(0.3, 0.1, 0.2, 0.2, 0.1, 0.8), Q_2 = c(0.2,
0.3, 0.5, 0.6, 0.2, 0.1), code3 = c("d", "b", "NULL", "b", "NULL",
"a"), P_3 = c(0.1, 0.4, 0.7, 0.2, 0.6, 0.2), Q_3 = c(0.9, 0.4,
0.7, 0.2, 0.8, 0.1)), class = "data.frame", row.names = c(NA,
-6L))
Depending on how many rows you have, this data.table approach may be faster.
library(data.table)
# Every output is keyed on code1: rows with code1 in {"a", "b"} use the P_*
# weights, all other rows use the Q_* weights.
#
# %in% never yields NA, so a missing code1 is treated as "not a/b" and takes
# the Q_* weights; an `==` test would give NA there and those rows would match
# neither the TRUE nor the FALSE branch. fifelse() picks the weight per row in
# one pass, so no zero-length preallocation of the Output columns is needed.
# fifelse() expects both branches to share a type (both are double in the
# sample data).
#
# As before, `Logical` and Output1..Output3 are added to df1 by reference and
# the three outputs are returned.
setDT(df1)[, Logical := code1 %in% c("a", "b")][
  , `:=`(
    Output1 = Whole * fifelse(Logical, P_1, Q_1),
    Output2 = Whole * fifelse(Logical, P_2, Q_2),
    Output3 = Whole * fifelse(Logical, P_3, Q_3)
  )
][, .(Output1, Output2, Output3)]
Output1 Output2 Output3
1: 12.8 19.2 6.4
2: 27.5 5.5 22.0
3: 28.0 14.0 49.0
4: 13.0 15.6 5.2
5: 37.6 4.7 28.2
6: 28.0 3.5 3.5
Related
I have a dataframe that looks like this:
data <- as.data.frame(cbind('01-01-2018' = c(1.2,3.1,0.7,-0.3,2.0), '02-01-2018' = c(-0.1, 2.4, 4.9,-3.3,-2.7), '03-01-2018' = c(3.4, -2.6, -1.8, 0.1, 0.3)))
01-01-2018 02-01-2018 03-01-2018
1 1.2 -0.1 3.4
2 3.1 2.4 -2.6
3 0.7 4.9 -1.8
4 -0.3 -3.3 0.1
5 2.0 -2.7 0.3
I want to count how many times per each row, a value is bigger than the average of the corresponding row.
data$mn <- apply(data, 1, mean)
01-01-2018 02-01-2018 03-01-2018 mn
1 1.2 -0.1 3.4 1.5000000
2 3.1 2.4 -2.6 0.9666667
3 0.7 4.9 -1.8 1.2666667
4 -0.3 -3.3 0.1 -1.1666667
5 2.0 -2.7 0.3 -0.1333333
My last attempt was the following:
df$events <- apply(data, 1, function(x) sum(x > data$mn))
uhi_events <- numeric(nrow(data))
for (i in 1:nrow(data)) {
uhi <- data[[6]][[i]][["values"]]
uhi_events[i] <- sum(uhi)
}
data$uhi_events <- uhi_events
Is there a more efficient option?
EDIT:
What if the condition is on another column, let's say data$c1, that is not obtained through a simple formula?
data$md <- apply(data, 1, median)
01-01-2018 02-01-2018 03-01-2018 md
1 1.2 -0.1 3.4 1.5000000
2 3.1 2.4 -2.6 0.9666667
3 0.7 4.9 -1.8 1.2666667
4 -0.3 -3.3 0.1 -1.1666667
5 2.0 -2.7 0.3 -0.1333333
Using rowMeans and rowSums:
data$cnt <- rowSums(data > rowMeans(data))
data
# 01-01-2018 02-01-2018 03-01-2018 cnt
# 1 1.2 -0.1 3.4 1
# 2 3.1 2.4 -2.6 2
# 3 0.7 4.9 -1.8 1
# 4 -0.3 -3.3 0.1 2
# 5 2.0 -2.7 0.3 2
If the column was already computed replace rowMeans with existing column data$c1:
# Indices of every column except the one named exactly "c1". An exact name
# comparison is used on purpose: grep("c1", ...) is a regex substring match
# and would also exclude columns such as "c10" or "abc1".
ix <- which(colnames(data) != "c1")
# Per row, count the values that exceed that row's c1. drop = FALSE keeps a
# data frame even when a single column remains, so rowSums() always receives
# a two-dimensional object.
data$cnt <- rowSums(data[, ix, drop = FALSE] > data$c1)
Using a user defined function to sum from a logical operation (logical vector is coerced by sum() to an integer vector such that TRUE = 1 and FALSE = 0)
# For each row, count the entries that exceed that row's own mean. Summing a
# logical vector counts its TRUE values.
data$uhi_events <- apply(
  data,
  MARGIN = 1,
  FUN = function(row_values) {
    exceeds_mean <- row_values > mean(row_values)
    sum(exceeds_mean)
  }
)
library(data.table)
setDT(data)
data[, above_mean := rowSums(.SD > rowMeans(.SD))]
# 01-01-2018 02-01-2018 03-01-2018 above_mean
# 1: 1.2 -0.1 3.4 1
# 2: 3.1 2.4 -2.6 2
# 3: 0.7 4.9 -1.8 1
# 4: -0.3 -3.3 0.1 2
# 5: 2.0 -2.7 0.3 2
edit for question in comments
compare to value in first column
data[, above_col1 := rowSums(.SD > `01-01-2018`)]
# 01-01-2018 02-01-2018 03-01-2018 above_col1
# 1: 1.2 -0.1 3.4 1
# 2: 3.1 2.4 -2.6 0
# 3: 0.7 4.9 -1.8 1
# 4: -0.3 -3.3 0.1 1
# 5: 2.0 -2.7 0.3 0
Using a dplyr approach:
library(dplyr)
data <- as.data.frame(cbind('01-01-2018' = c(1.2,3.1,0.7,-0.3,2.0), '02-01-2018' = c(-0.1, 2.4, 4.9,-3.3,-2.7), '03-01-2018' = c(3.4, -2.6, -1.8, 0.1, 0.3)))
data$mm <- apply(data,1,median)
data %>%
rowwise %>%
mutate(count = sum(c_across(1:3) > mm))
#> # A tibble: 5 × 5
#> # Rowwise:
#> `01-01-2018` `02-01-2018` `03-01-2018` mm count
#> <dbl> <dbl> <dbl> <dbl> <int>
#> 1 1.2 -0.1 3.4 1.2 1
#> 2 3.1 2.4 -2.6 2.4 1
#> 3 0.7 4.9 -1.8 0.7 1
#> 4 -0.3 -3.3 0.1 -0.3 1
#> 5 2 -2.7 0.3 0.3 1
I got a set of data just like that:
df = data.frame(A = c(0.1, 0.3, 0.7, 0.9, 0.5, 0.4, 0.3, 0.3, 0.9, 0.9),
B = c(0.5, 0.4, 0.8, 0.6, 0.8, 0.5, 0.4, 0.5, 0.6, 0.5),
D = c(0.2, 0.1, 0.5, 0.8, 0.6, 0.7, 0.1, 0.3, 0.8, 0.3))
But I need to create an index for every unique combination of A, B and D, like this:
index A B D
1 1 0.1 0.5 0.2
2 2 0.3 0.4 0.1
3 3 0.7 0.8 0.5
4 4 0.9 0.6 0.8
5 5 0.5 0.8 0.6
6 6 0.4 0.5 0.7
7 2 0.3 0.4 0.1
8 7 0.3 0.5 0.3
9 4 0.9 0.6 0.8
10 8 0.9 0.5 0.3
Note that the combination between A, B and D is the same for rows 4 and 9 and for rows 2 and 7. Therefore, they receive the same index value
You can use the following code. The numbering of the indices may differ slightly from your expected output, but the logic is the same:
library(dplyr)
# Give every distinct (A, B, D) combination its own integer id; rows with the
# same combination fall in the same group and therefore share an index.
df %>%
group_by(A, B, D) %>%
# cur_group_id() is the id of the group the current rows belong to.
mutate(index = cur_group_id()) %>%
# Remove the grouping so later verbs operate on the whole table again.
ungroup() %>%
# Sort by the new index; this reorders the rows relative to the input.
arrange(index)
# A tibble: 10 x 4
A B D index
<dbl> <dbl> <dbl> <int>
1 0.1 0.5 0.2 1
2 0.3 0.4 0.1 2
3 0.3 0.4 0.1 2
4 0.3 0.5 0.3 3
5 0.4 0.5 0.7 4
6 0.5 0.8 0.6 5
7 0.7 0.8 0.5 6
8 0.9 0.5 0.3 7
9 0.9 0.6 0.8 8
10 0.9 0.6 0.8 8
We can use match
library(dplyr)
library(stringr)
# Build one key string per row and number the keys in order of first
# appearance: match() returns, for each row, the position of its key among
# the unique keys.
#
# The separator is essential. Pasting the values with nothing between them
# lets different rows produce the same key (A = 1, B = 12 and A = 11, B = 2
# both start with "112"), which would give them the same index.
df %>%
  mutate(
    index = match(
      str_c(A, B, D, sep = "_"),
      unique(str_c(A, B, D, sep = "_"))
    )
  ) %>%
  arrange(index)
Another dplyr option
# Number the distinct rows in order of first appearance, then join those
# numbers back onto the full data so duplicated rows share an index.
df %>%
  distinct() %>%
  # row_number() is safe on an empty table, where 1:n() would yield c(1, 0).
  mutate(index = row_number()) %>%
  # The piped lookup table becomes `y`; df stays on the left so its row order
  # is kept. Join keys are spelled out rather than guessed from shared names.
  left_join(x = df, by = c("A", "B", "D"))
gives
A B D index
1 0.1 0.5 0.2 1
2 0.3 0.4 0.1 2
3 0.7 0.8 0.5 3
4 0.9 0.6 0.8 4
5 0.5 0.8 0.6 5
6 0.4 0.5 0.7 6
7 0.3 0.4 0.1 2
8 0.3 0.5 0.3 7
9 0.9 0.6 0.8 4
10 0.9 0.5 0.3 8
This may have already been answered but I could not find exactly what I wanted.
I have a data frame like:
# Example data: 3 areas x 4 scenarios, one value per year.
Area <- c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3)
# Scenario and Type are labels, so they must be quoted strings.
Scenario <- c("a", "b", "c", "d", "a", "b", "c", "d", "a", "b", "c", "d")
Type <- c("EV", "EV", "EV", "EV", "EV", "EV", "EV", "EV", "EV", "EV", "EV", "EV")
Y2020 <- c(0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6)
Y2021 <- c(0.2, 0.4, 0.5, 0.6, 0.8, 1.0, 1.0, 1.1, 1.2, 1.5, 1.3, 1.5)
# Capital Y, matching the name used in data.frame() below.
Y2022 <- c(0.3, 0.6, 0.2, 0.7, 0.5, 0.6, 0.7, 0.8, 0.9, 1.1, 1.3, 1.6)
# Type is included so the columns match the printed table.
dt <- data.frame(Area, Scenario, Type, Y2020, Y2021, Y2022)
So will look something like:
Area Scenario Type Y2020 Y2021 Y2022
1 a EV 0.5 0.2 0.3
1 b EV 0.6 0.4 0.6
1 c EV 0.7 0.5 0.8
1 d EV 0.8 0.6 0.7
2 a EV 0.9 0.8 0.5
2 b EV 1.0 1.0 0.6
2 c EV 1.1 1.0 0.7
2 d EV 1.2 1.1 0.8
3 a EV 1.3 1.2 0.9
3 b EV 1.4 1.5 1.1
3 c EV 1.5 1.3 1.3
3 d EV 1.6 1.5 1.6
I would like to get it in wide format by rotating by the Scenario column like this:
Area Type Y2020_a Y2021_a Y2022_a Y2020_b Y2021_b ...
1 EV 0.5 0.2 0.3 0.6 0.4
2 EV 0.9 0.8 0.5 1.0 1.0
3 EV 1.3 1.2 0.9 1.4 1.5
I tried to use dcast(dt, id ~ Scenario, value.var=names(dt)[4:6]) as suggested by @Arun in "Reshape multiple values at once", but it returned "Error in .subset2(x, i, exact = exact) : recursive indexing failed at level 2"
This is a condensed version of my actual data so if it could be replicated with a larger data set that would be great!
I hope someone can help! Thanks
A proposition with the function reshape():
dt <- read.table(header = TRUE, text = "
Area Scenario Type Y2020 Y2021 Y2022
1 a EV 0.5 0.2 0.3
1 b EV 0.6 0.4 0.6
1 c EV 0.7 0.5 0.8
1 d EV 0.8 0.6 0.7
2 a EV 0.9 0.8 0.5
2 b EV 1.0 1.0 0.6
2 c EV 1.1 1.0 0.7
2 d EV 1.2 1.1 0.8
3 a EV 1.3 1.2 0.9
3 b EV 1.4 1.5 1.1
3 c EV 1.5 1.3 1.3
3 d EV 1.6 1.5 1.6
")
# Spread the three year columns across Scenario: one output row per
# Area/Type pair and one column per (year, scenario) combination.
reshape(
  dt,
  direction = "wide",
  idvar = c("Area", "Type"),
  timevar = "Scenario",
  v.names = c("Y2020", "Y2021", "Y2022")
)
#> Area Type Y2020.a Y2021.a Y2022.a Y2020.b Y2021.b Y2022.b Y2020.c Y2021.c
#> 1 1 EV 0.5 0.2 0.3 0.6 0.4 0.6 0.7 0.5
#> 5 2 EV 0.9 0.8 0.5 1.0 1.0 0.6 1.1 1.0
#> 9 3 EV 1.3 1.2 0.9 1.4 1.5 1.1 1.5 1.3
#> Y2022.c Y2020.d Y2021.d Y2022.d
#> 1 0.8 0.8 0.6 0.7
#> 5 0.7 1.2 1.1 0.8
#> 9 1.3 1.6 1.5 1.6
# Created on 2021-02-01 by the reprex package (v0.3.0.9001)
Regards,
You need to convert the data to long format first and then to wide format
library(tidyverse)
Area <- c(1,1,1,1,2,2,2,2,3,3,3,3)
Scenario <- c("a", "b", "c", "d","a", "b", "c", "d","a", "b", "c", "d")
Type <- c("EV", "EV", "EV", "EV", "EV", "EV", "EV", "EV", "EV", "EV", "EV", "EV")
Y2020 <- c(0.5,0.6,0.7,0.8,0.9,1.0,1.1,1.2,1.3,1.4,1.5,1.6)
Y2021 <- c(0.2,0.4,0.5,0.6,0.8,1.0,1.0,1.1,1.2,1.5,1.3,1.5)
Y2022 <- c(0.3,0.6,0.2,0.7,0.5,0.6,0.7,0.8,0.9,1.1,1.3,1.6)
dt <- data.frame(Area,Type,Scenario, Y2020, Y2021, Y2022)
# Go long first (one row per Area/Type/Scenario/year), fold the scenario into
# the year label, then widen so each "<year>_<scenario>" gets its own column.
dt %>%
  as_tibble() %>%
  # Everything except the three identifier columns is a year column.
  pivot_longer(cols = -c(Area, Type, Scenario)) %>%
  mutate(name = paste(name, Scenario, sep = "_")) %>%
  # Scenario now lives in `name`; keeping it would block the widening.
  select(-Scenario) %>%
  pivot_wider(names_from = name, values_from = value)
#> # A tibble: 3 x 14
#> Area Type Y2020_a Y2021_a Y2022_a Y2020_b Y2021_b Y2022_b Y2020_c Y2021_c
#> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 EV 0.5 0.2 0.3 0.6 0.4 0.6 0.7 0.5
#> 2 2 EV 0.9 0.8 0.5 1 1 0.6 1.1 1
#> 3 3 EV 1.3 1.2 0.9 1.4 1.5 1.1 1.5 1.3
#> # … with 4 more variables: Y2022_c <dbl>, Y2020_d <dbl>, Y2021_d <dbl>,
#> # Y2022_d <dbl>
Created on 2021-02-01 by the reprex package (v0.3.0)
I am having trouble reshaping my data set into a panel data set. My df looks as follows:
id s1 s2 s3 s4 ct1 ct2 ret1 ret2 ret3 ret4
1 a b c d 0.5 0.5 0.6 0.7 0.8 0.5
2 c b a d 0.6 0.6 0.7 0.6 0.5 0.4
3 a c d b 0.7 0.7 0.7 0.8 0.2 0.1
I would like to reshape so it looks as follows
id s ct1 ct2 ret
1 a 0.5 0.5 0.6
1 b 0.5 0.5 0.7
1 c 0.5 0.5 0.8
1 d 0.5 0.5 0.5
2 a 0.6 0.6 0.5
2 b 0.6 0.6 0.6
2 c 0.6 0.6 0.7
2 d 0.6 0.6 0.4
3 a 0.7 0.7 0.7
3 b 0.7 0.7 0.1
3 c 0.7 0.7 0.8
3 d 0.7 0.7 0.2
I regularly reshape from wide to long but somehow my head cannot get around this problem.
1) base R
An option using reshape
# Stack the s1..s4 and ret1..ret4 columns into two long columns, `s` and
# `ret`, with one output row per id and position.
out <- reshape(
dat,
# Columns that identify a record and are repeated on every stacked row.
idvar = c("id", "ct1", "ct2"),
# outer() + paste0 generates the names s1, ret1, s2, ret2, ..., s4, ret4.
varying = c(outer(c("s", "ret"), 1:4, paste0)),
# No separator between stem and index, so "s1" is split into "s" and 1.
sep = "",
direction = "long"
)
Remove rownames and column time
rownames(out) <- out$time <- NULL
Result
out[order(out$id), ]
# id ct1 ct2 s ret
#1 1 0.5 0.5 a 0.6
#4 1 0.5 0.5 b 0.7
#7 1 0.5 0.5 c 0.8
#10 1 0.5 0.5 d 0.5
#2 2 0.6 0.6 c 0.7
#5 2 0.6 0.6 b 0.6
#8 2 0.6 0.6 a 0.5
#11 2 0.6 0.6 d 0.4
#3 3 0.7 0.7 a 0.7
#6 3 0.7 0.7 c 0.8
#9 3 0.7 0.7 d 0.2
#12 3 0.7 0.7 b 0.1
2) data.table
Using melt from data.table
library(data.table)
# Melt the two column families at once: columns matching "^s\\d" are stacked
# into `s`, columns matching "^ret\\d" into `ret`.
out <- melt(
# setDT() converts dat to a data.table by reference.
setDT(dat),
id.vars = c("id", "ct1", "ct2"),
measure.vars = patterns(c("^s\\d", "^ret\\d")),
value.name = c("s", "ret")
# melt() adds a `variable` column for the position index; it is dropped here.
)[, variable := NULL]
out
data
dat <- structure(list(id = 1:3, s1 = structure(c(1L, 2L, 1L), .Label = c("a",
"c"), class = "factor"), s2 = structure(c(1L, 1L, 2L), .Label = c("b",
"c"), class = "factor"), s3 = structure(c(2L, 1L, 3L), .Label = c("a",
"c", "d"), class = "factor"), s4 = structure(c(2L, 2L, 1L), .Label = c("b",
"d"), class = "factor"), ct1 = c(0.5, 0.6, 0.7), ct2 = c(0.5,
0.6, 0.7), ret1 = c(0.6, 0.7, 0.7), ret2 = c(0.7, 0.6, 0.8),
ret3 = c(0.8, 0.5, 0.2), ret4 = c(0.5, 0.4, 0.1)), .Names = c("id",
"s1", "s2", "s3", "s4", "ct1", "ct2", "ret1", "ret2", "ret3",
"ret4"), class = "data.frame", row.names = c(NA, -3L))
You could do it using spread and gather from the tidyr package. You will need to create a temporary id variable in order to be able to pivot the data:
library(dplyr)
library(tidyr)
# Pair each s<n> column with its ret<n> column and stack the pairs, giving one
# row per id and position n.
#
# The ".value" placeholder sends the "s"/"ret" part of each name to its own
# output column, so `s` stays text and `ret` stays numeric (stacking both into
# a single value column would coerce the numbers to character). Only dplyr
# and tidyr are needed.
df %>%
  pivot_longer(
    cols = matches("^(s|ret)\\d+$"),
    names_to = c(".value", "obs"),
    names_pattern = "^(s|ret)(\\d+)$"
  ) %>%
  # Keep the requested columns (this drops the position index `obs`) ...
  select(id, s, ct1, ct2, ret) %>%
  # ... and order the panel by id, then s, as in the desired output.
  arrange(id, s)
Here is one way that tidyr can make this a lot easier with its pivot_longer functions (available in tidyr 1.0.0 and later; at the time of writing this required the development version, installed with devtools::install_github("tidyverse/tidyr")). We make a spec indicating that the s columns should go into an s variable and similarly for the ret columns. You can remove the final obs column that indicates the number after s or ret if desired.
library(tidyverse)
tbl <- read_table2(
"id s1 s2 s3 s4 ct1 ct2 ret1 ret2 ret3 ret4
1 a b c d 0.5 0.5 0.6 0.7 0.8 0.5
2 c b a d 0.6 0.6 0.7 0.6 0.5 0.4
3 a c d b 0.7 0.7 0.7 0.8 0.2 0.1"
)
# Pivoting spec: one row per input column to be stacked.
#   .name  - the input column name (s1..s4, ret1..ret4)
#   .value - the output column it feeds (name with the trailing digit removed)
#   obs    - the digit itself, kept as an extra identifier column
spec <- tibble(
  .name = tbl %>% select(matches("^s|ret")) %>% colnames(),
  .value = str_remove(.name, "\\d$"),
  obs = str_extract(.name, "\\d")
)
# Released tidyr takes a hand-built spec through pivot_longer_spec();
# pivot_longer() itself has no `spec` argument.
tbl %>%
  pivot_longer_spec(spec)
#> # A tibble: 12 x 6
#> id ct1 ct2 obs s ret
#> <dbl> <dbl> <dbl> <chr> <chr> <dbl>
#> 1 1 0.5 0.5 1 a 0.6
#> 2 1 0.5 0.5 2 b 0.7
#> 3 1 0.5 0.5 3 c 0.8
#> 4 1 0.5 0.5 4 d 0.5
#> 5 2 0.6 0.6 1 c 0.7
#> 6 2 0.6 0.6 2 b 0.6
#> 7 2 0.6 0.6 3 a 0.5
#> 8 2 0.6 0.6 4 d 0.4
#> 9 3 0.7 0.7 1 a 0.7
#> 10 3 0.7 0.7 2 c 0.8
#> 11 3 0.7 0.7 3 d 0.2
#> 12 3 0.7 0.7 4 b 0.1
Created on 2019-07-23 by the reprex package (v0.3.0)
I have two tables, df and cf. I want to multiply each value of A in df by a coefficient from cf, selected by the values of B and C in the same row of df (B gives the row of cf, C the column).
For example
row 2 in df A= 20 B= 4 and C= 2 so the correct coefficient is 0.3,
the result is 20*0.3 = 6
Is there a simple way to do that in R?
Thanks in advance!!
df
A B C
20 4 2
30 4 5
35 2 2
24 3 3
43 2 1
cf
C
B/C 1 2 3 4 5
1 0.2 0.3 0.5 0.6 0.7
2 0.1 0.5 0.3 0.3 0.4
3 0.9 0.1 0.6 0.6 0.8
4 0.7 0.3 0.7 0.4 0.6
One solution with apply:
# Walk df row by row: the 2nd and 3rd values (B, C) pick the row and column of
# cf, and the 1st value (A) is scaled by the coefficient found there.
apply(df, 1, function(row_vals) {
  coefficient <- cf[row_vals[2], row_vals[3]]
  row_vals[1] * coefficient
})
#[1] 6.0 18.0 17.5 14.4 4.3
Try this vectorized:
# Index cf with a two-column (row, column) matrix built from df's 2nd and 3rd
# columns, so every coefficient is fetched in a single lookup, then multiply
# element-wise by the 1st column.
df[,1] * cf[as.matrix(df[,2:3])]
#[1] 6.0 18.0 17.5 14.4 4.3
A solution using dplyr and a vectorised function:
# Input values: A is the quantity to scale, B and C are the row and column of
# the coefficient to look up in cf.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned.
df <- read.table(text = "
A B C
20 4 2
30 4 5
35 2 2
24 3 3
43 2 1
", header = TRUE, stringsAsFactors = FALSE)
# Coefficient table: rows correspond to B = 1..4, columns to C = 1..5.
cf <- read.table(text = "
0.2 0.3 0.5 0.6 0.7
0.1 0.5 0.3 0.3 0.4
0.9 0.1 0.6 0.6 0.8
0.7 0.3 0.7 0.4 0.6
")
library(dplyr)
# Look up coefficients in cf.
#
# x: row indices into cf (B values); y: column indices into cf (C values).
# Returns a numeric vector with one coefficient per (x, y) pair.
#
# Indexing with a two-column (row, column) matrix fetches all pairs in one
# step, so the function is vectorised without a per-element loop. cf is
# converted with as.matrix() so this works whether cf is a data frame or
# already a matrix.
f <- function(x, y) as.matrix(cf)[cbind(x, y)]
df %>%
mutate(val = f(B,C),
result = val * A)
# A B C val result
# 1 20 4 2 0.3 6.0
# 2 30 4 5 0.6 18.0
# 3 35 2 2 0.5 17.5
# 4 24 3 3 0.6 14.4
# 5 43 2 1 0.1 4.3
The final dataset has both result and val in order to check which value from cf was used each time.