Looping linear regression output in a data frame in R

I have the dataset below, in which I want to fit a linear regression for each combination of country and Area and then cbind the predicted values onto the dataset, so the final data frame has three more columns (fit, lwr, upr).
I have done it for one country and one Area, but I want to do it for every country/Area combination and put the predicted, upper- and lower-limit values back into the data set with cbind:
data <- data.frame(country = c("US","US","US","US","US","US","US","US","US","US","UK","UK","UK","UK","UK"),
                   Area = c("G","G","G","G","G","I","I","I","I","I","A","A","A","A","A"),
                   week = c(1,2,3,4,5,1,2,3,4,5,1,2,3,4,5),
                   amount = c(12,23,34,32,12,12,34,45,65,45,45,34,23,43,43))
data_1 <- data[(data$country=="US" & data$Area=="G"),]
model <- lm(amount ~ week, data = data_1)
pre <- predict(model,newdata = data_1,interval = "prediction",level = 0.95)
pre
How can I loop this over every other combination of country and Area?
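A literal loop works too; here is a minimal sketch (equivalent in spirit to the answers below) that iterates over the unique country/Area pairs and row-binds the pieces:
combos <- unique(data[c("country", "Area")])
pieces <- vector("list", nrow(combos))
for (i in seq_len(nrow(combos))) {
  # subset one country/Area combination, fit, and attach the prediction interval
  sub <- data[data$country == combos$country[i] & data$Area == combos$Area[i], ]
  model <- lm(amount ~ week, data = sub)
  pieces[[i]] <- cbind(sub, predict(model, newdata = sub,
                                    interval = "prediction", level = 0.95))
}
result <- do.call(rbind, pieces)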

...and a Base R solution:
data <- data.frame(country = c("US","US","US","US","US","US","US","US","US","US","UK","UK","UK","UK","UK"),
                   Area = c("G","G","G","G","G","I","I","I","I","I","A","A","A","A","A"),
                   week = c(1,2,3,4,5,1,2,3,4,5,1,2,3,4,5),
                   amount = c(12,23,34,32,12,12,34,45,65,45,45,34,23,43,43))
splitVar <- paste0(data$country,"-",data$Area)
dfList <- split(data,splitVar)
result <- do.call(rbind, lapply(dfList, function(x) {
  model <- lm(amount ~ week, data = x)
  cbind(x, predict(model, newdata = x, interval = "prediction", level = 0.95))
}))
result
...the results:
country Area week amount fit lwr upr
UK-A.11 UK A 1 45 36.6 -6.0463638 79.24636
UK-A.12 UK A 2 34 37.1 -1.3409128 75.54091
UK-A.13 UK A 3 23 37.6 0.6671656 74.53283
UK-A.14 UK A 4 43 38.1 -0.3409128 76.54091
UK-A.15 UK A 5 43 38.6 -4.0463638 81.24636
US-G.1 US G 1 12 20.8 -27.6791493 69.27915
US-G.2 US G 2 23 21.7 -21.9985147 65.39851
US-G.3 US G 3 34 22.6 -19.3841749 64.58417
US-G.4 US G 4 32 23.5 -20.1985147 67.19851
US-G.5 US G 5 12 24.4 -24.0791493 72.87915
US-I.6 US I 1 12 20.8 -33.8985900 75.49859
US-I.7 US I 2 34 30.5 -18.8046427 79.80464
US-I.8 US I 3 45 40.2 -7.1703685 87.57037
US-I.9 US I 4 65 49.9 0.5953573 99.20464
US-I.10 US I 5 45 59.6 4.9014100 114.29859
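Optionally drop the "UK-A.11"-style row names that split() leaves behind:
rownames(result) <- NULL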

We can also use the function augment from package broom to get your desired information (dplyr and tidyr are needed for the nest/unnest steps):
library(dplyr)
library(tidyr)
library(purrr)
library(broom)
data %>%
  group_by(country, Area) %>%
  nest() %>%
  mutate(models = map(data, ~ lm(amount ~ week, data = .)),
         aug = map(models, ~ augment(.x, interval = "prediction"))) %>%
  unnest(aug) %>%
  select(country, Area, amount, week, .fitted, .lower, .upper)
# A tibble: 15 x 7
# Groups: country, Area [3]
country Area amount week .fitted .lower .upper
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 US G 12 1 20.8 -27.7 69.3
2 US G 23 2 21.7 -22.0 65.4
3 US G 34 3 22.6 -19.4 64.6
4 US G 32 4 23.5 -20.2 67.2
5 US G 12 5 24.4 -24.1 72.9
6 US I 12 1 20.8 -33.9 75.5
7 US I 34 2 30.5 -18.8 79.8
8 US I 45 3 40.2 -7.17 87.6
9 US I 65 4 49.9 0.595 99.2
10 US I 45 5 59.6 4.90 114.
11 UK A 45 1 36.6 -6.05 79.2
12 UK A 34 2 37.1 -1.34 75.5
13 UK A 23 3 37.6 0.667 74.5
14 UK A 43 4 38.1 -0.341 76.5
15 UK A 43 5 38.6 -4.05 81.2

Here is a tidyverse way to do this for every combination of country and Area.
library(tidyverse)
data %>%
  group_by(country, Area) %>%
  nest() %>%
  mutate(model = map(data, ~ lm(amount ~ week, data = .x)),
         result = map2(model, data,
                       ~ data.frame(predict(.x, newdata = .y,
                                            interval = "prediction", level = 0.95)))) %>%
  ungroup() %>%
  select(-model) %>%
  unnest(c(data, result))
# country Area week amount fit lwr upr
# <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 US G 1 12 20.8 -27.7 69.3
# 2 US G 2 23 21.7 -22.0 65.4
# 3 US G 3 34 22.6 -19.4 64.6
# 4 US G 4 32 23.5 -20.2 67.2
# 5 US G 5 12 24.4 -24.1 72.9
# 6 US I 1 12 20.8 -33.9 75.5
# 7 US I 2 34 30.5 -18.8 79.8
# 8 US I 3 45 40.2 -7.17 87.6
# 9 US I 4 65 49.9 0.595 99.2
#10 US I 5 45 59.6 4.90 114.
#11 UK A 1 45 36.6 -6.05 79.2
#12 UK A 2 34 37.1 -1.34 75.5
#13 UK A 3 23 37.6 0.667 74.5
#14 UK A 4 43 38.1 -0.341 76.5
#15 UK A 5 43 38.6 -4.05 81.2

And one more: split() by the combined grouping variable; fct_inorder() keeps the groups in their original order so the final cbind() lines up row for row.
library(tidyverse)
data %>%
  mutate(CountryArea = paste0(country, Area) %>% factor() %>% fct_inorder()) %>%
  split(.$CountryArea) %>%
  map(~ lm(amount ~ week, data = .)) %>%
  map(predict, interval = "prediction", level = 0.95) %>%
  reduce(rbind) %>%
  cbind(data, .)
country Area week amount fit lwr upr
1 US G 1 12 20.8 -27.6791493 69.27915
2 US G 2 23 21.7 -21.9985147 65.39851
3 US G 3 34 22.6 -19.3841749 64.58417
4 US G 4 32 23.5 -20.1985147 67.19851
5 US G 5 12 24.4 -24.0791493 72.87915
6 US I 1 12 20.8 -33.8985900 75.49859
7 US I 2 34 30.5 -18.8046427 79.80464
8 US I 3 45 40.2 -7.1703685 87.57037
9 US I 4 65 49.9 0.5953573 99.20464
10 US I 5 45 59.6 4.9014100 114.29859
11 UK A 1 45 36.6 -6.0463638 79.24636
12 UK A 2 34 37.1 -1.3409128 75.54091
13 UK A 3 23 37.6 0.6671656 74.53283
14 UK A 4 43 38.1 -0.3409128 76.54091
15 UK A 5 43 38.6 -4.0463638 81.24636

Related

Clarification on nycflights13

I tried to find the average arrival delay per carrier for the nycflights13 data set:
flights %>%
  group_by(carrier) %>%
  summarize(avg_dep_delay = mean(arr_delay))
but the result shows NA for every carrier except one.
As said in the comments, you need to set na.rm = TRUE in your mean() call. You can use the following code:
library(nycflights13)
library(tidyverse)
flights %>%
  group_by(carrier) %>%
  summarize(avg_dep_delay = mean(arr_delay, na.rm = TRUE))
Output:
# A tibble: 16 × 2
carrier avg_dep_delay
<chr> <dbl>
1 9E 7.38
2 AA 0.364
3 AS -9.93
4 B6 9.46
5 DL 1.64
6 EV 15.8
7 F9 21.9
8 FL 20.1
9 HA -6.92
10 MQ 10.8
11 OO 11.9
12 UA 3.56
13 US 2.13
14 VX 1.76
15 WN 9.65
16 YV 15.6
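As a quick check (not part of the original answer), you can count how many arrival delays are missing per carrier:
flights %>%
  group_by(carrier) %>%
  summarize(n_missing = sum(is.na(arr_delay)))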

Loop to sum weekly rolling average

I am new to coding. I have a data set of daily stream flow averages over 20 years. Following is an example:
DATE FLOW
1 10/1/2001 88.2
2 10/2/2001 77.6
3 10/3/2001 68.4
4 10/4/2001 61.5
5 10/5/2001 55.3
6 10/6/2001 52.5
7 10/7/2001 49.7
8 10/8/2001 46.7
9 10/9/2001 43.3
10 10/10/2001 41.3
11 10/11/2001 39.3
12 10/12/2001 37.7
13 10/13/2001 35.8
14 10/14/2001 34.1
15 10/15/2001 39.8
I need to create a loop that sums the previous 6 days plus the current day (a trailing weekly average) and writes it to an array for the designated water year. I have already created an aggregate function to separate the yearly average daily means into their designated water years.
# Separating dates into specific water years
wtr_yr <- function(dates, start_month = 9) {
  # Convert dates into POSIXlt
  POSIDATE <- as.POSIXlt(dates)
  # Year offset: months from start_month onward roll into the next water year
  offset <- ifelse(POSIDATE$mon >= start_month - 1, 1, 0)
  # Water year
  POSIDATE$year + 1900 + offset
}
# Aggregating the daily means by water year
mean.FLOW <- aggregate(data_set$FLOW, list(wtr_yr(data_set$DATE)), mean)
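A quick sanity check of the function above: with the September start, October 2001 falls into water year 2002.
wtr_yr(as.Date("2001-10-01"))
#> [1] 2002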
It seems that it can be done much more easily.
But first I need to prepare a bit more data.
library(tidyverse)
library(lubridate)
df = tibble(
  DATE = seq(mdy("1/1/2010"), mdy("12/31/2022"), 1),
  FLOW = rnorm(length(DATE), 40, 10)
)
output
# A tibble: 4,748 x 2
DATE FLOW
<date> <dbl>
1 2010-01-01 34.4
2 2010-01-02 37.7
3 2010-01-03 55.6
4 2010-01-04 40.7
5 2010-01-05 41.3
6 2010-01-06 57.2
7 2010-01-07 44.6
8 2010-01-08 27.3
9 2010-01-09 33.1
10 2010-01-10 35.5
# ... with 4,738 more rows
Now let's do the aggregation by year and week number
df %>%
  group_by(year(DATE), week(DATE)) %>%
  summarise(mean = mean(FLOW))
output
# A tibble: 689 x 3
# Groups: year(DATE) [13]
`year(DATE)` `week(DATE)` mean
<dbl> <dbl> <dbl>
1 2010 1 44.5
2 2010 2 39.6
3 2010 3 38.5
4 2010 4 35.3
5 2010 5 44.1
6 2010 6 39.4
7 2010 7 41.3
8 2010 8 43.9
9 2010 9 38.5
10 2010 10 42.4
# ... with 679 more rows
Note that for the function week(), the first week starts on January 1st. If you want to number the weeks according to the ISO 8601 standard, use the isoweek() function. Alternatively, you can use epiweek(), which follows the US CDC convention.
df %>%
  group_by(year(DATE), isoweek(DATE)) %>%
  summarise(mean = mean(FLOW))
output
# A tibble: 681 x 3
# Groups: year(DATE) [13]
`year(DATE)` `isoweek(DATE)` mean
<dbl> <dbl> <dbl>
1 2010 1 40.0
2 2010 2 45.5
3 2010 3 33.2
4 2010 4 38.9
5 2010 5 45.0
6 2010 6 40.7
7 2010 7 38.5
8 2010 8 42.5
9 2010 9 37.1
10 2010 10 42.4
# ... with 671 more rows
If you want to better understand how these functions differ, run the code below:
df %>%
  mutate(
    w1 = week(DATE),
    w2 = isoweek(DATE),
    w3 = epiweek(DATE)
  )
output
# A tibble: 4,748 x 5
DATE FLOW w1 w2 w3
<date> <dbl> <dbl> <dbl> <dbl>
1 2010-01-01 34.4 1 53 52
2 2010-01-02 37.7 1 53 52
3 2010-01-03 55.6 1 53 1
4 2010-01-04 40.7 1 1 1
5 2010-01-05 41.3 1 1 1
6 2010-01-06 57.2 1 1 1
7 2010-01-07 44.6 1 1 1
8 2010-01-08 27.3 2 1 1
9 2010-01-09 33.1 2 1 1
10 2010-01-10 35.5 2 1 2
# ... with 4,738 more rows
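Finally, if what you need is really a trailing 7-day rolling mean (current day plus the previous 6) rather than calendar-week bins, here is a minimal base-R sketch; stats::filter() must be namespaced because dplyr masks filter():
df %>%
  mutate(FLOW_7day = as.numeric(stats::filter(FLOW, rep(1/7, 7), sides = 1)))
The first 6 days come out as NA; use rep(1, 7) instead of rep(1/7, 7) for a rolling sum.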

Calculate Percentage by Group with multiple columns in R

I have several data frames of monthly data, and I would like to find the percentage distribution for each product within each group, for each month. My problem is handling multiple month columns; currently I can only get a percentage by group for one month.
data <- data.frame(group = rep(LETTERS[1:3], each = 4),
                   Product = letters[1:4],
                   January = sample(1:100, 12),
                   February = sample(1:100, 12))
data_new1 <- transform(data,
                       perc = ave(January,
                                  group,
                                  FUN = prop.table))
data_new1$perc <- round(data_new1$perc, 2)
> data_new1
group Product January February perc
1 A a 12 16 0.05
2 A b 73 75 0.32
3 A c 78 11 0.34
4 A d 65 35 0.29
5 B a 86 63 0.36
6 B b 33 71 0.14
7 B c 92 49 0.38
8 B d 30 60 0.12
9 C a 91 59 0.37
10 C b 31 45 0.12
11 C c 99 7 0.40
12 C d 28 50 0.11
tidyverse
library(dplyr)
data %>%
  group_by(group) %>%
  mutate(across(c("January", "February"), proportions, .names = "{.col}_perc")) %>%
  ungroup()
# A tibble: 12 x 6
group Product January February January_perc February_perc
<chr> <chr> <int> <int> <dbl> <dbl>
1 A a 49 40 0.426 0.252
2 A b 1 3 0.00870 0.0189
3 A c 19 50 0.165 0.314
4 A d 46 66 0.4 0.415
5 B a 61 82 0.218 0.285
6 B b 88 51 0.314 0.177
7 B c 32 75 0.114 0.260
8 B d 99 80 0.354 0.278
9 C a 6 31 0.0397 0.373
10 C b 8 5 0.0530 0.0602
11 C c 92 20 0.609 0.241
12 C d 45 27 0.298 0.325
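A side note: proportions() is the newer base-R name for prop.table(); if your R version predates it, the same call works with prop.table:
data %>%
  group_by(group) %>%
  mutate(across(c("January", "February"), prop.table, .names = "{.col}_perc")) %>%
  ungroup()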
base
data <- data.frame(group = rep(LETTERS[1:3], each = 4),
                   Product = letters[1:4],
                   January = sample(1:100, 12),
                   February = sample(1:100, 12))
tmp <- sapply(c("January", "February"), function (x) ave(data[[x]], data$group, FUN = prop.table))
colnames(tmp) <- paste0(colnames(tmp), "_perc")
res <- cbind(data, tmp)
res
#> group Product January February January_perc February_perc
#> 1 A a 42 73 0.18260870 0.238562092
#> 2 A b 67 92 0.29130435 0.300653595
#> 3 A c 58 90 0.25217391 0.294117647
#> 4 A d 63 51 0.27391304 0.166666667
#> 5 B a 48 15 0.21621622 0.081521739
#> 6 B b 16 82 0.07207207 0.445652174
#> 7 B c 80 75 0.36036036 0.407608696
#> 8 B d 78 12 0.35135135 0.065217391
#> 9 C a 81 16 0.32793522 0.117647059
#> 10 C b 83 81 0.33603239 0.595588235
#> 11 C c 11 1 0.04453441 0.007352941
#> 12 C d 72 38 0.29149798 0.279411765
data.table
library(data.table)
COLS <- c("January", "February")
COLS_RES <- paste0(COLS, "_perc")
setDT(data)[, (COLS_RES) := lapply(.SD, proportions), by = group, .SDcols = COLS][]
These calculations are easier if your data is structured in a tidy way. In your case, January and February should probably be a single variable called month or something similar.
Example:
Underneath, I use tidyr::pivot_longer() to combine January and February into one column. Then I use the package dplyr to group the data frame and calculate perc. I'm not using prop.table(), but I believe you just want the proportion of each observation relative to the total of that group and month.
library(dplyr)
library(tidyr)
# To make the sampling below reproducible
set.seed(1)
data <- data.frame(
  group = rep(LETTERS[1:3], each = 4),
  Product = letters[1:4],
  January = sample(1:100, 12),
  February = sample(1:100, 12)
)
data %>%
  pivot_longer(c(January, February), names_to = "month", values_to = "x") %>%
  group_by(group, month) %>%
  mutate(
    perc = round(x / sum(x), 2)
  )
I hope this is what you were looking for.
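If you prefer the wide shape afterwards, one option (a sketch using pivot_wider() with names_glue; not part of the original answer) is to pivot the value and perc columns back out:
data %>%
  pivot_longer(c(January, February), names_to = "month", values_to = "x") %>%
  group_by(group, month) %>%
  mutate(perc = round(x / sum(x), 2)) %>%
  ungroup() %>%
  pivot_wider(names_from = month, values_from = c(x, perc),
              names_glue = "{month}_{.value}")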
Another dplyr solution:
library(dplyr)
data %>%
  group_by(group) %>%
  mutate(across(c(2:5),
                ~ . / sum(.) * 100, .names = "{.col}_pct"))
# A tibble: 12 × 10
# Groups: group [3]
group Product Jan Feb Mar May Jan_pct Feb_pct Mar_pct May_pct
<chr> <chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl>
1 A a 14 14 95 50 8 18.4 44.4 20.9
2 A b 100 33 28 32 57.1 43.4 13.1 13.4
3 A c 11 16 13 95 6.29 21.1 6.07 39.7
4 A d 50 13 78 62 28.6 17.1 36.4 25.9
5 B a 29 42 72 13 22.0 33.9 20.3 7.07
6 B b 3 4 88 41 2.27 3.23 24.9 22.3
7 B c 30 68 94 86 22.7 54.8 26.6 46.7
8 B d 70 10 100 44 53.0 8.06 28.2 23.9
9 C a 4 88 45 84 3.96 43.6 24.2 30.7
10 C b 52 12 26 55 51.5 5.94 14.0 20.1
11 C c 26 20 23 57 25.7 9.90 12.4 20.8
12 C d 19 82 92 78 18.8 40.6 49.5 28.5
Data:
data <- data.frame(group = rep(LETTERS[1:3], each = 4),
                   Product = letters[1:4],
                   Jan = sample(1:100, 12),
                   Feb = sample(1:100, 12),
                   Mar = sample(1:100, 12),
                   May = sample(1:100, 12))

R:dplyr summarise data by group with nth() call with variable n calculated during aggregation

I'm aggregating data with variable bin sizes (see my previous question: R: aggregate every n rows with variable n depending on sum(n) of second column). In addition to calculating sums and means over groups of variable ranges, I need to pull out a single-value covariate at the midpoint of each group's range. When I try to do this on the fly, I only get a value for the first group and NAs for the remaining ones.
df.summary <- as.data.frame(df %>%
  mutate(rn = row_number()) %>%
  group_by(grp = (cumsum(d) - 1) %/% 100 + 1) %>%
  summarise(x = mean(x, na.rm = TRUE),
            d = sum(d, na.rm = TRUE),
            i.start = first(rn),
            i.end = last(rn),
            y = nth(y, round(first(rn) + (last(rn) - first(rn)) / 2 - 1))))
head(df.summary)
grp x d i.start i.end y
1 1 0.07458317 88.99342 1 4 19.78992
2 2 0.07594546 97.62130 5 8 NA
3 3 0.05353308 104.69683 9 12 NA
4 4 0.06498291 106.23468 13 16 NA
5 5 0.08601759 98.24939 17 20 NA
6 6 0.06262427 84.43745 21 23 NA
sample data:
structure(list(x = c(0.10000112377193, 0.110742170350877, 0.0300274304561404,
0.0575619395964912, 0.109060465438596, 0.0595491225614035, 0.0539270264912281,
0.0812452063859649, 0.0341699389122807, 0.0391744879122807, 0.0411787485614035,
0.0996091644385965, 0.0970479474912281, 0.0595715843684211, 0.0483489989122807,
0.0549631194561404, 0.0705080555964912, 0.080437472631579, 0.105883664631579,
0.0872411613684211, 0.103236660631579, 0.0381296894912281, 0.0465064491578947,
0.0936565184561403, 0.0410095752631579, 0.0311180032105263, 0.0257758157894737,
0.0354721928947368, 0.0584999394736842, 0.0241286060175439, 0.112053376666667,
0.0769823868596491, 0.0558137530526316, 0.0374491000701754, 0.0419279142631579,
0.0260257506842105, 0.0544360374561404, 0.107411071842105, 0.103873468,
0.0419322114035088, 0.0483912961052632, 0.0328373653157895, 0.0866868717719298,
0.063990467245614, 0.0799280314035088, 0.123490407070175, 0.145676836280702,
0.0292878782807018, 0.0432093036666667, 0.0203547443684211),
d = c(22.2483512600033, 22.2483529247042, 22.2483545865809,
22.2483562542823, 22.24835791863, 25.1243105415557, 25.1243148759953,
25.1243192107884, 25.1243235416981, 25.1243278750792, 27.2240858553058,
27.2240943134697, 27.2241027638674, 27.224111222031, 27.2241196741942,
24.5623431981188, 24.5623453409221, 24.5623474809012, 24.562349626705,
24.5623517696847, 28.1458125837154, 28.1458157376341, 28.1458188889053,
28.1458220452951, 28.1458251983314, 27.8293318542146, 27.8293366652115,
27.8293414829159, 27.829346292148, 27.8293511094993, 27.5271773325046,
27.5271834011289, 27.5271894694002, 27.5271955369655, 27.5272016048837,
28.0376097925214, 28.0376146410729, 28.0376194959786, 28.0376243427651,
28.0376291969647, 26.8766095768196, 26.8766122563318, 26.8766149309023,
26.8766176123562, 26.8766202925746, 27.8736950101666, 27.8736960528853,
27.8736971017815, 27.8736981446767, 27.8736991932199), y = c(19.79001,
19.789922, 19.789834, 19.789746, 19.789658, 19.78957, 19.789468,
19.789366, 19.789264, 19.789162, 19.78906, 19.78896, 19.78886,
19.78876, 19.78866, 19.78856, 19.788458, 19.788356, 19.788254,
19.788152, 19.78805, 19.787948, 19.787846, 19.787744, 19.787642,
19.78754, 19.787442, 19.787344, 19.787246, 19.787148, 19.78705,
19.786956, 19.786862, 19.786768, 19.786674, 19.78658, 19.786486,
19.786392, 19.786298, 19.786204, 19.78611, 19.786016, 19.785922,
19.785828, 19.785734, 19.78564, 19.785544, 19.785448, 19.785352,
19.785256)), row.names = c(NA, 50L), class = "data.frame")
Let's add the variables z and n in the summarise part, defined as below.
df %>%
  mutate(rn = row_number()) %>%
  group_by(grp = (cumsum(d) - 1) %/% 100 + 1) %>%
  summarise(x = mean(x, na.rm = TRUE),
            d = sum(d, na.rm = TRUE),
            i.start = first(rn),
            i.end = last(rn),
            z = round(first(rn) + (last(rn) - first(rn)) / 2 - 1),
            n = n())
grp x d i.start i.end z n
<dbl> <dbl> <dbl> <int> <int> <dbl> <int>
1 1 0.0746 89.0 1 4 2 4
2 2 0.0759 97.6 5 8 6 4
3 3 0.0535 105. 9 12 10 4
4 4 0.0650 106. 13 16 14 4
5 5 0.0860 98.2 17 20 18 4
6 6 0.0626 84.4 21 23 21 3
7 7 0.0479 112. 24 27 24 4
8 8 0.0394 83.5 28 30 28 3
9 9 0.0706 110. 31 34 32 4
10 10 0.0575 112. 35 38 36 4
11 11 0.0647 83.0 39 41 39 3
12 12 0.0659 108. 42 45 42 4
13 13 0.0854 111. 46 49 46 4
14 14 0.0204 27.9 50 50 49 1
In the data frame above, n is the sample size of each group defined by grp. However, because of group_by(grp), nth(y, z) picks the z-th value within each group.
That means that for the 5th group, which has only 4 values, you ask for the 18th value of y, so it returns NA.
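You can verify this behaviour directly: nth() falls back to its default (NA) whenever the index is beyond the group's length.
dplyr::nth(c(1, 2, 3), 18)
#> [1] NA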
The simplest way to fix this, I think, is to use n().
df %>%
  mutate(rn = row_number()) %>%
  group_by(grp = (cumsum(d) - 1) %/% 100 + 1) %>%
  summarise(x = mean(x, na.rm = TRUE),
            d = sum(d, na.rm = TRUE),
            i.start = first(rn),
            i.end = last(rn),
            y = nth(y, round(n() / 2)))
grp x d i.start i.end y
<dbl> <dbl> <dbl> <int> <int> <dbl>
1 1 0.0746 89.0 1 4 19.8
2 2 0.0759 97.6 5 8 19.8
3 3 0.0535 105. 9 12 19.8
4 4 0.0650 106. 13 16 19.8
5 5 0.0860 98.2 17 20 19.8
6 6 0.0626 84.4 21 23 19.8
7 7 0.0479 112. 24 27 19.8
8 8 0.0394 83.5 28 30 19.8
9 9 0.0706 110. 31 34 19.8
10 10 0.0575 112. 35 38 19.8
11 11 0.0647 83.0 39 41 19.8
12 12 0.0659 108. 42 45 19.8
13 13 0.0854 111. 46 49 19.8
14 14 0.0204 27.9 50 50 NA
This picks the floor(n/2)-th y, i.e. the value in the middle of each group. Note that you can also try floor(n/2) + 1. Another variant computes the midpoint from the row numbers instead:
df %>%
  mutate(rn = row_number()) %>%
  group_by(grp = (cumsum(d) - 1) %/% 100 + 1) %>%
  summarise(x = mean(x, na.rm = TRUE),
            d = sum(d, na.rm = TRUE),
            i.start = first(rn),
            i.end = last(rn),
            y = nth(y, floor(median(rn)) - i.start))

Purrr Multiply index data frame with dataframe

Thank you all for reading this problem.
What I would like to do is multiply my test data with my index file while matching columns: multiplying Dp_water with Dp_water, and iterating over all index variables (kcal, fat, prot, carbs).
In my test data I have, for 10 individuals, the consumption of 4 food groups in grams. For each individual I would like to calculate the kcal, fat, prot and carb intake, making a new variable for each:
Dp_water_kcal, Dp_coffee_kcal, Dp_soup_kcal, Dp_soda_kcal
Dp_water_fat, Dp_coffee_fat, Dp_soup_fat, Dp_soda_fat
etc.
library(tidyverse)
Sample data
Index file
index <- data.frame(Variable = c("Dp_water","Dp_coffee","Dp_soup","Dp_soda"),
                    kcal = c(0,10,20,40),
                    fat = c(0,5,10,15),
                    prot = c(2,4,6,8),
                    carbs = c(3,6,9,12))
index <- index %>%
  pivot_longer(c(kcal, fat, prot, carbs)) %>%
  pivot_wider(names_from = Variable, values_from = value)
> index
# A tibble: 4 x 5
name Dp_water Dp_coffee Dp_soup Dp_soda
<chr> <dbl> <dbl> <dbl> <dbl>
1 kcal 0 10 20 40
2 fat 0 5 10 15
3 prot 2 4 6 8
4 carbs 3 6 9 12
Below is the subject data: consumption of the 4 food groups.
test_data <- data.frame(Dp_water = c(11:20),
                        Dp_coffee = c(31:40),
                        Dp_soup = c(21:30),
                        Dp_soda = c(41:50),
                        id = 1:10)
Dp_water Dp_coffee Dp_soup Dp_soda id
1 11 31 21 41 1
2 12 32 22 42 2
3 13 33 23 43 3
4 14 34 24 44 4
5 15 35 25 45 5
6 16 36 26 46 6
7 17 37 27 47 7
8 18 38 28 48 8
9 19 39 29 49 9
10 20 40 30 50 10
If I do the following, it works, but I would like to do this for all variables, not only kcal, and I would like to keep the id column.
test_data %>%
  select(-id) %>%
  map2_dfr(., test_data[match(names(.), names(test_data))], ~ .x / 100 * .y) %>%
  set_names(paste0(names(.), "_kcal"))
# A tibble: 10 x 4
Dp_water_kcal Dp_coffee_kcal Dp_soup_kcal Dp_soda_kcal
<dbl> <dbl> <dbl> <dbl>
1 1.21 9.61 4.41 16.8
2 1.44 10.2 4.84 17.6
3 1.69 10.9 5.29 18.5
4 1.96 11.6 5.76 19.4
5 2.25 12.2 6.25 20.2
6 2.56 13.0 6.76 21.2
7 2.89 13.7 7.29 22.1
8 3.24 14.4 7.84 23.0
9 3.61 15.2 8.41 24.0
10 4 16 9 25
Thank you all for any help!
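One possible approach, sketched below under the assumption (as in your kcal example) that consumption is divided by 100 before applying the per-food factors: loop over the nutrient rows of index with purrr, matching the food columns by name, and bind the id column back on.
library(tidyverse)
result <- index$name %>%
  map_dfc(function(nutrient) {
    # one row of per-food factors for this nutrient
    factors <- index %>% filter(name == nutrient) %>% select(-name)
    test_data %>%
      select(all_of(names(factors))) %>%        # match food columns by name
      map2_dfc(factors, ~ .x / 100 * .y) %>%    # grams / 100 * factor
      rename_with(~ paste0(.x, "_", nutrient))  # e.g. Dp_water_kcal
  }) %>%
  bind_cols(test_data["id"], .)
This yields one column per food group and nutrient (Dp_water_kcal through Dp_soda_carbs) plus the id column.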
