Converting and minimizing a data.frame in an efficient way (R)

What is the fastest way to convert my data into my desired output?
m="
n min max median q1 q3 group m SD
1 30 30 55 44.5 43.25 49.75 treat 45.38524 5.593169
2 30 31 55 47.0 44.00 49.00 treat 46.11951 4.886821
3 30 40 55 48.0 45.00 51.00 treat 47.92676 4.173242
4 15 30 51 44.0 42.50 45.50 treat 43.21604 4.245150
5 15 31 54 46.0 42.50 48.50 treat 44.94723 5.759449
6 15 44 55 48.0 45.00 48.50 treat 47.66393 3.012334
7 15 39 55 49.0 44.00 52.00 treat 48.01439 5.571240
8 15 41 55 48.0 44.50 50.00 treat 47.59677 4.261415
9 15 40 55 47.0 45.00 50.00 treat 47.38081 4.200670
10 18 42 55 46.0 44.00 49.50 cont 46.91764 3.996259
11 18 40 55 44.0 43.00 47.00 cont 45.25704 3.667377
12 18 41 55 44.5 44.00 50.00 cont 46.58674 4.334604
13 9 42 49 46.0 43.00 48.00 cont 45.60879 3.357931
14 9 42 48 44.0 43.00 45.00 cont 44.29745 1.878592
15 9 41 55 44.0 44.00 45.00 cont 45.43229 2.779801
16 9 43 55 50.0 44.00 52.00 cont 48.73261 5.506545
17 9 43 55 46.0 44.00 51.00 cont 47.61981 5.069204
18 9 41 55 50.0 44.00 51.00 cont 48.19267 5.403842"
data <- read.table(text = m, header = TRUE)
Desired_Output="
nT mT sdT nC mC sdC
30 45.38524 5.593169 18 46.91764 3.996259
30 46.11951 4.886821 18 45.25704 3.667377
30 47.92676 4.173242 18 46.58674 4.334604
. . . . . .
. . . . . .
. . . . . .
15 47.38081 4.200670 9 48.19267 5.403842"

Here is a base R solution:
l2 <- split(data[c('n', 'm', 'SD')], data$group)
do.call(cbind, Map(function(x, y){names(x) <- paste0(names(x), '_', y); x}, l2, names(l2)))
# cont.n_cont cont.m_cont cont.SD_cont treat.n_treat treat.m_treat treat.SD_treat
#10 18 46.91764 3.996259 30 45.38524 5.593169
#11 18 45.25704 3.667377 30 46.11951 4.886821
#12 18 46.58674 4.334604 30 47.92676 4.173242
#13 9 45.60879 3.357931 15 43.21604 4.245150
#14 9 44.29745 1.878592 15 44.94723 5.759449
#15 9 45.43229 2.779801 15 47.66393 3.012334
#16 9 48.73261 5.506545 15 48.01439 5.571240
#17 9 47.61981 5.069204 15 47.59677 4.261415
#18 9 48.19267 5.403842 15 47.38081 4.200670
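To match the Desired_Output exactly, this result only needs renaming and reordering so that the treat columns come first. A minimal sketch (assuming split() orders the groups cont before treat, as in the printed result; the name res is just illustrative):
res <- do.call(cbind, Map(function(x, y){names(x) <- paste0(names(x), '_', y); x}, l2, names(l2)))
names(res) <- c("nC", "mC", "sdC", "nT", "mT", "sdT")
res[c("nT", "mT", "sdT", "nC", "mC", "sdC")]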

It's a little messy, but you may try using dplyr:
library(dplyr)
data <- data %>%
  group_by(group) %>%
  mutate(idx = 1:n()) %>%
  select(group, n, idx, m, SD) %>%
  ungroup()
df1 <- data %>%
  filter(group == "treat") %>%
  select(-group)
df2 <- data %>%
  filter(group == "cont") %>%
  select(-group)
df <- df1 %>%
  left_join(df2, by = "idx") %>%
  select(-idx)
names(df) <- c("nT", "mT", "sdT", "nC", "mC", "sdC")
df
nT mT sdT nC mC sdC
<int> <dbl> <dbl> <int> <dbl> <dbl>
1 30 45.4 5.59 18 46.9 4.00
2 30 46.1 4.89 18 45.3 3.67
3 30 47.9 4.17 18 46.6 4.33
4 15 43.2 4.25 9 45.6 3.36
5 15 44.9 5.76 9 44.3 1.88
6 15 47.7 3.01 9 45.4 2.78
7 15 48.0 5.57 9 48.7 5.51
8 15 47.6 4.26 9 47.6 5.07
9 15 47.4 4.20 9 48.2 5.40
Or using tidyr::pivot_wider:
library(tidyr)
data %>%
  group_by(group) %>%
  mutate(idx = 1:n()) %>%
  select(group, n, idx, m, SD) %>%
  ungroup() %>%
  pivot_wider(id_cols = idx, values_from = c(n, m, SD), names_from = group) %>%
  select(-idx)
n_treat n_cont m_treat m_cont SD_treat SD_cont
<int> <int> <dbl> <dbl> <dbl> <dbl>
1 30 18 45.4 46.9 5.59 4.00
2 30 18 46.1 45.3 4.89 3.67
3 30 18 47.9 46.6 4.17 4.33
4 15 9 43.2 45.6 4.25 3.36
5 15 9 44.9 44.3 5.76 1.88
6 15 9 47.7 45.4 3.01 2.78
7 15 9 48.0 48.7 5.57 5.51
8 15 9 47.6 47.6 4.26 5.07
9 15 9 47.4 48.2 4.20 5.40
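If the exact nT/mT/sdT names and column order from Desired_Output are wanted, a final select() can rename and rearrange the pivoted result. A small sketch (wide is just an illustrative name for the pipeline's result):
wide <- data %>%
  group_by(group) %>%
  mutate(idx = 1:n()) %>%
  select(group, n, idx, m, SD) %>%
  ungroup() %>%
  pivot_wider(id_cols = idx, values_from = c(n, m, SD), names_from = group) %>%
  select(-idx)
wide %>% select(nT = n_treat, mT = m_treat, sdT = SD_treat,
                nC = n_cont, mC = m_cont, sdC = SD_cont)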

New answer, a dplyr-only solution:
library(dplyr)
data %>%
  select(n, m, SD, group) %>%
  slice(10:18) %>%
  rename_with(~ paste0(.x, "C")) %>%
  bind_cols(data[1:9, ]) %>%
  rename_with(~ paste0(.x, "T"), c(n, m, SD)) %>%
  select(nT, mT, sdT = SDT, nC, mC, sdC = SDC)
nT mT sdT nC mC sdC
10 30 45.38524 5.593169 18 46.91764 3.996259
11 30 46.11951 4.886821 18 45.25704 3.667377
12 30 47.92676 4.173242 18 46.58674 4.334604
13 15 43.21604 4.245150 9 45.60879 3.357931
14 15 44.94723 5.759449 9 44.29745 1.878592
15 15 47.66393 3.012334 9 45.43229 2.779801
16 15 48.01439 5.571240 9 48.73261 5.506545
17 15 47.59677 4.261415 9 47.61981 5.069204
18 15 47.38081 4.200670 9 48.19267 5.403842
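The slice(10:18) and data[1:9, ] indices above are hard-coded for this 18-row input. A sketch that filters on group instead, so the same code works whenever both groups have equally many rows (an assumption bind_cols() requires anyway):
treat <- data %>% filter(group == "treat") %>% select(n, m, SD)
cont <- data %>% filter(group == "cont") %>% select(n, m, SD)
bind_cols(rename_with(treat, ~ paste0(.x, "T")),
          rename_with(cont, ~ paste0(.x, "C"))) %>%
  select(nT, mT, sdT = SDT, nC, mC, sdC = SDC)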

Related

Calculate Percentage by Group with multiple columns in R

I have several data frames with monthly data, and I would like to find the percentage distribution for each product and for each month. My problem is with the multiple month columns: currently, I can only get a percentage by group for one month at a time.
data <- data.frame(group = rep(LETTERS[1:3], each = 4),
                   Product = letters[1:4],
                   January = sample(1:100, 12),
                   February = sample(1:100, 12))
data_new1 <- transform(data,
                       perc = ave(January,
                                  group,
                                  FUN = prop.table))
data_new1$perc <- round(data_new1$perc, 2)
> data_new1
group Product January February perc
1 A a 12 16 0.05
2 A b 73 75 0.32
3 A c 78 11 0.34
4 A d 65 35 0.29
5 B a 86 63 0.36
6 B b 33 71 0.14
7 B c 92 49 0.38
8 B d 30 60 0.12
9 C a 91 59 0.37
10 C b 31 45 0.12
11 C c 99 7 0.40
12 C d 28 50 0.11
tidyverse
library(dplyr)
data %>%
  group_by(group) %>%
  mutate(across(c("January", "February"), proportions, .names = "{.col}_perc")) %>%
  ungroup()
# A tibble: 12 x 6
group Product January February January_perc February_perc
<chr> <chr> <int> <int> <dbl> <dbl>
1 A a 49 40 0.426 0.252
2 A b 1 3 0.00870 0.0189
3 A c 19 50 0.165 0.314
4 A d 46 66 0.4 0.415
5 B a 61 82 0.218 0.285
6 B b 88 51 0.314 0.177
7 B c 32 75 0.114 0.260
8 B d 99 80 0.354 0.278
9 C a 6 31 0.0397 0.373
10 C b 8 5 0.0530 0.0602
11 C c 92 20 0.609 0.241
12 C d 45 27 0.298 0.325
base
data <- data.frame(group = rep(LETTERS[1:3], each = 4),
                   Product = letters[1:4],
                   January = sample(1:100, 12),
                   February = sample(1:100, 12))
tmp <- sapply(c("January", "February"),
              function(x) ave(data[[x]], data$group, FUN = prop.table))
colnames(tmp) <- paste0(colnames(tmp), "_perc")
res <- cbind(data, tmp)
res
res
#> group Product January February January_perc February_perc
#> 1 A a 42 73 0.18260870 0.238562092
#> 2 A b 67 92 0.29130435 0.300653595
#> 3 A c 58 90 0.25217391 0.294117647
#> 4 A d 63 51 0.27391304 0.166666667
#> 5 B a 48 15 0.21621622 0.081521739
#> 6 B b 16 82 0.07207207 0.445652174
#> 7 B c 80 75 0.36036036 0.407608696
#> 8 B d 78 12 0.35135135 0.065217391
#> 9 C a 81 16 0.32793522 0.117647059
#> 10 C b 83 81 0.33603239 0.595588235
#> 11 C c 11 1 0.04453441 0.007352941
#> 12 C d 72 38 0.29149798 0.279411765
Created on 2021-12-20 by the reprex package (v2.0.1)
data.table
library(data.table)
COLS <- c("January", "February")
COLS_RES <- paste0(COLS, "_perc")
setDT(data)[, (COLS_RES) := lapply(.SD, proportions), by = group, .SDcols = COLS][]
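The := call adds the *_perc columns to data by reference; a quick check of the result (selecting by name with with = FALSE, since COLS and COLS_RES hold character vectors of column names):
head(data[, c("group", COLS, COLS_RES), with = FALSE])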
These calculations are easier if your data is structured in a tidy way. In your case, January and February should probably be one single variable called month or something.
Example:
Underneath, I use tidyr::pivot_longer() to combine January and February into one column. Then I use the package dplyr to group the dataframe and calculate perc. I'm not using prop.table(), but I believe you just want the proportion of each observation relative to the total of that group and month.
library(dplyr)
library(tidyr)
# To make the sampling underneath reproducible
set.seed(1)
data <- data.frame(
  group = rep(LETTERS[1:3], each = 4),
  Product = letters[1:4],
  January = sample(1:100, 12),
  February = sample(1:100, 12)
)
data %>%
  pivot_longer(c(January, February), names_to = "month", values_to = "x") %>%
  group_by(group, month) %>%
  mutate(
    perc = round(x / sum(x), 2)
  )
I hope this is what you were looking for.
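If the wide layout with one percentage column per month is still needed afterwards, the long result can be spread back out. A sketch continuing the pipeline above with tidyr::pivot_wider() (column names such as perc_January are what pivot_wider() generates here):
data %>%
  pivot_longer(c(January, February), names_to = "month", values_to = "x") %>%
  group_by(group, month) %>%
  mutate(perc = round(x / sum(x), 2)) %>%
  ungroup() %>%
  pivot_wider(names_from = month, values_from = c(x, perc))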
Another dplyr solution:
library(dplyr)
data %>%
  group_by(group) %>%
  mutate(across(c(2:5),
                ~ . / sum(.) * 100, .names = "{.col}_pct"))
# A tibble: 12 × 10
# Groups: group [3]
group Product Jan Feb Mar May Jan_pct Feb_pct Mar_pct May_pct
<chr> <chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl>
1 A a 14 14 95 50 8 18.4 44.4 20.9
2 A b 100 33 28 32 57.1 43.4 13.1 13.4
3 A c 11 16 13 95 6.29 21.1 6.07 39.7
4 A d 50 13 78 62 28.6 17.1 36.4 25.9
5 B a 29 42 72 13 22.0 33.9 20.3 7.07
6 B b 3 4 88 41 2.27 3.23 24.9 22.3
7 B c 30 68 94 86 22.7 54.8 26.6 46.7
8 B d 70 10 100 44 53.0 8.06 28.2 23.9
9 C a 4 88 45 84 3.96 43.6 24.2 30.7
10 C b 52 12 26 55 51.5 5.94 14.0 20.1
11 C c 26 20 23 57 25.7 9.90 12.4 20.8
12 C d 19 82 92 78 18.8 40.6 49.5 28.5
Data:
data <- data.frame(group = rep(LETTERS[1:3], each = 4),
                   Product = letters[1:4],
                   Jan = sample(1:100, 12),
                   Feb = sample(1:100, 12),
                   Mar = sample(1:100, 12),
                   May = sample(1:100, 12))

R:dplyr summarise data by group with nth() call with variable n calculated during aggregation

I'm aggregating data with variable bin sizes (see previous question here: R: aggregate every n rows with variable n depending on sum(n) of second column). In addition to calculating sums and means over groups of variable ranges, I need to pull out single-value covariates at the midpoint of each group range. When I try to do this on the fly, I only get a value for the first group and NAs for the remaining.
df.summary <- as.data.frame(df %>%
  mutate(rn = row_number()) %>%
  group_by(grp = (cumsum(d) - 1) %/% 100 + 1) %>%
  summarise(x = mean(x, na.rm = TRUE), d = sum(d, na.rm = TRUE),
            i.start = first(rn), i.end = last(rn),
            y = nth(y, round(first(rn) + (last(rn) - first(rn)) / 2 - 1))))
head(df.summary)
grp x d i.start i.end y
1 1 0.07458317 88.99342 1 4 19.78992
2 2 0.07594546 97.62130 5 8 NA
3 3 0.05353308 104.69683 9 12 NA
4 4 0.06498291 106.23468 13 16 NA
5 5 0.08601759 98.24939 17 20 NA
6 6 0.06262427 84.43745 21 23 NA
sample data:
structure(list(x = c(0.10000112377193, 0.110742170350877, 0.0300274304561404,
0.0575619395964912, 0.109060465438596, 0.0595491225614035, 0.0539270264912281,
0.0812452063859649, 0.0341699389122807, 0.0391744879122807, 0.0411787485614035,
0.0996091644385965, 0.0970479474912281, 0.0595715843684211, 0.0483489989122807,
0.0549631194561404, 0.0705080555964912, 0.080437472631579, 0.105883664631579,
0.0872411613684211, 0.103236660631579, 0.0381296894912281, 0.0465064491578947,
0.0936565184561403, 0.0410095752631579, 0.0311180032105263, 0.0257758157894737,
0.0354721928947368, 0.0584999394736842, 0.0241286060175439, 0.112053376666667,
0.0769823868596491, 0.0558137530526316, 0.0374491000701754, 0.0419279142631579,
0.0260257506842105, 0.0544360374561404, 0.107411071842105, 0.103873468,
0.0419322114035088, 0.0483912961052632, 0.0328373653157895, 0.0866868717719298,
0.063990467245614, 0.0799280314035088, 0.123490407070175, 0.145676836280702,
0.0292878782807018, 0.0432093036666667, 0.0203547443684211),
d = c(22.2483512600033, 22.2483529247042, 22.2483545865809,
22.2483562542823, 22.24835791863, 25.1243105415557, 25.1243148759953,
25.1243192107884, 25.1243235416981, 25.1243278750792, 27.2240858553058,
27.2240943134697, 27.2241027638674, 27.224111222031, 27.2241196741942,
24.5623431981188, 24.5623453409221, 24.5623474809012, 24.562349626705,
24.5623517696847, 28.1458125837154, 28.1458157376341, 28.1458188889053,
28.1458220452951, 28.1458251983314, 27.8293318542146, 27.8293366652115,
27.8293414829159, 27.829346292148, 27.8293511094993, 27.5271773325046,
27.5271834011289, 27.5271894694002, 27.5271955369655, 27.5272016048837,
28.0376097925214, 28.0376146410729, 28.0376194959786, 28.0376243427651,
28.0376291969647, 26.8766095768196, 26.8766122563318, 26.8766149309023,
26.8766176123562, 26.8766202925746, 27.8736950101666, 27.8736960528853,
27.8736971017815, 27.8736981446767, 27.8736991932199), y = c(19.79001,
19.789922, 19.789834, 19.789746, 19.789658, 19.78957, 19.789468,
19.789366, 19.789264, 19.789162, 19.78906, 19.78896, 19.78886,
19.78876, 19.78866, 19.78856, 19.788458, 19.788356, 19.788254,
19.788152, 19.78805, 19.787948, 19.787846, 19.787744, 19.787642,
19.78754, 19.787442, 19.787344, 19.787246, 19.787148, 19.78705,
19.786956, 19.786862, 19.786768, 19.786674, 19.78658, 19.786486,
19.786392, 19.786298, 19.786204, 19.78611, 19.786016, 19.785922,
19.785828, 19.785734, 19.78564, 19.785544, 19.785448, 19.785352,
19.785256)), row.names = c(NA, 50L), class = "data.frame")
Let's add the variables z and n in the summarise part. They are defined as below.
df %>%
  mutate(rn = row_number()) %>%
  group_by(grp = (cumsum(d) - 1) %/% 100 + 1) %>%
  summarise(x = mean(x, na.rm = TRUE),
            d = sum(d, na.rm = TRUE),
            i.start = first(rn),
            i.end = last(rn),
            z = round(first(rn) + (last(rn) - first(rn)) / 2 - 1),
            n = n())
grp x d i.start i.end z n
<dbl> <dbl> <dbl> <int> <int> <dbl> <int>
1 1 0.0746 89.0 1 4 2 4
2 2 0.0759 97.6 5 8 6 4
3 3 0.0535 105. 9 12 10 4
4 4 0.0650 106. 13 16 14 4
5 5 0.0860 98.2 17 20 18 4
6 6 0.0626 84.4 21 23 21 3
7 7 0.0479 112. 24 27 24 4
8 8 0.0394 83.5 28 30 28 3
9 9 0.0706 110. 31 34 32 4
10 10 0.0575 112. 35 38 36 4
11 11 0.0647 83.0 39 41 39 3
12 12 0.0659 108. 42 45 42 4
13 13 0.0854 111. 46 49 46 4
14 14 0.0204 27.9 50 50 49 1
In the dataframe above, n indicates the sample size of each group defined by grp. However, since you have grouped by grp, nth(y, z) looks up the z-th value within each group.
That means that for the 5th group, although only 4 values exist, you ask for the 18th value of y, so it returns NA.
The simplest way around this, I think, is to use n().
df %>%
  mutate(rn = row_number()) %>%
  group_by(grp = (cumsum(d) - 1) %/% 100 + 1) %>%
  summarise(x = mean(x, na.rm = TRUE),
            d = sum(d, na.rm = TRUE),
            i.start = first(rn),
            i.end = last(rn),
            y = nth(y, round(n() / 2)))
grp x d i.start i.end y
<dbl> <dbl> <dbl> <int> <int> <dbl>
1 1 0.0746 89.0 1 4 19.8
2 2 0.0759 97.6 5 8 19.8
3 3 0.0535 105. 9 12 19.8
4 4 0.0650 106. 13 16 19.8
5 5 0.0860 98.2 17 20 19.8
6 6 0.0626 84.4 21 23 19.8
7 7 0.0479 112. 24 27 19.8
8 8 0.0394 83.5 28 30 19.8
9 9 0.0706 110. 31 34 19.8
10 10 0.0575 112. 35 38 19.8
11 11 0.0647 83.0 39 41 19.8
12 12 0.0659 108. 42 45 19.8
13 13 0.0854 111. 46 49 19.8
14 14 0.0204 27.9 50 50 NA
This picks the round(n/2)-th value of y, i.e. the y located near the middle of each group. Note that you can also try floor(n/2) + 1.
Or, locating the midpoint via the median of the row numbers:
df %>%
  mutate(rn = row_number()) %>%
  group_by(grp = (cumsum(d) - 1) %/% 100 + 1) %>%
  summarise(x = mean(x, na.rm = TRUE),
            d = sum(d, na.rm = TRUE),
            i.start = first(rn),
            i.end = last(rn),
            y = nth(y, floor(median(rn)) - i.start))

Looping linear regression output in a data frame in r

I have the dataset below, in which I want to run a linear regression for each country and Area and then cbind the predicted values to the dataset.
Final data frame after adding three more columns (fit, lwr, upr):
I have done it for one country and one Area, but I want to do it for every combination of country and Area and put the predicted value and the upper and lower limits back into the dataset with cbind:
data <- data.frame(country = c("US","US","US","US","US","US","US","US","US","US","UK","UK","UK","UK","UK"),
                   Area = c("G","G","G","G","G","I","I","I","I","I","A","A","A","A","A"),
                   week = c(1,2,3,4,5,1,2,3,4,5,1,2,3,4,5),
                   amount = c(12,23,34,32,12,12,34,45,65,45,45,34,23,43,43))
data_1 <- data[(data$country=="US" & data$Area=="G"),]
model <- lm(amount ~ week, data = data_1)
pre <- predict(model,newdata = data_1,interval = "prediction",level = 0.95)
pre
How can I loop this for other combination of country and Area?
...and a Base R solution:
data <- data.frame(country = c("US","US","US","US","US","US","US","US","US","US","UK","UK","UK","UK","UK"),
                   Area = c("G","G","G","G","G","I","I","I","I","I","A","A","A","A","A"),
                   week = c(1,2,3,4,5,1,2,3,4,5,1,2,3,4,5),
                   amount = c(12,23,34,32,12,12,34,45,65,45,45,34,23,43,43))
splitVar <- paste0(data$country, "-", data$Area)
dfList <- split(data, splitVar)
result <- do.call(rbind, lapply(dfList, function(x) {
  model <- lm(amount ~ week, data = x)
  cbind(x, predict(model, newdata = x, interval = "prediction", level = 0.95))
}))
result
...the results:
country Area week amount fit lwr upr
UK-A.11 UK A 1 45 36.6 -6.0463638 79.24636
UK-A.12 UK A 2 34 37.1 -1.3409128 75.54091
UK-A.13 UK A 3 23 37.6 0.6671656 74.53283
UK-A.14 UK A 4 43 38.1 -0.3409128 76.54091
UK-A.15 UK A 5 43 38.6 -4.0463638 81.24636
US-G.1 US G 1 12 20.8 -27.6791493 69.27915
US-G.2 US G 2 23 21.7 -21.9985147 65.39851
US-G.3 US G 3 34 22.6 -19.3841749 64.58417
US-G.4 US G 4 32 23.5 -20.1985147 67.19851
US-G.5 US G 5 12 24.4 -24.0791493 72.87915
US-I.6 US I 1 12 20.8 -33.8985900 75.49859
US-I.7 US I 2 34 30.5 -18.8046427 79.80464
US-I.8 US I 3 45 40.2 -7.1703685 87.57037
US-I.9 US I 4 65 49.9 0.5953573 99.20464
US-I.10 US I 5 45 59.6 4.9014100 114.29859
We can also use the function augment from the broom package to get your desired information:
library(dplyr)
library(tidyr)
library(purrr)
library(broom)
data %>%
  group_by(country, Area) %>%
  nest() %>%
  mutate(models = map(data, ~ lm(amount ~ week, data = .)),
         aug = map(models, ~ augment(.x, interval = "prediction"))) %>%
  unnest(aug) %>%
  select(country, Area, amount, week, .fitted, .lower, .upper)
# A tibble: 15 x 7
# Groups: country, Area [3]
country Area amount week .fitted .lower .upper
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 US G 12 1 20.8 -27.7 69.3
2 US G 23 2 21.7 -22.0 65.4
3 US G 34 3 22.6 -19.4 64.6
4 US G 32 4 23.5 -20.2 67.2
5 US G 12 5 24.4 -24.1 72.9
6 US I 12 1 20.8 -33.9 75.5
7 US I 34 2 30.5 -18.8 79.8
8 US I 45 3 40.2 -7.17 87.6
9 US I 65 4 49.9 0.595 99.2
10 US I 45 5 59.6 4.90 114.
11 UK A 45 1 36.6 -6.05 79.2
12 UK A 34 2 37.1 -1.34 75.5
13 UK A 23 3 37.6 0.667 74.5
14 UK A 43 4 38.1 -0.341 76.5
15 UK A 43 5 38.6 -4.05 81.2
Here is a tidyverse way to do this for every combination of country and Area.
library(tidyverse)
data %>%
  group_by(country, Area) %>%
  nest() %>%
  mutate(model = map(data, ~ lm(amount ~ week, data = .x)),
         result = map2(model, data, ~ data.frame(predict(.x, newdata = .y,
                                                         interval = "prediction", level = 0.95)))) %>%
  ungroup() %>%
  select(-model) %>%
  unnest(c(data, result))
# country Area week amount fit lwr upr
# <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 US G 1 12 20.8 -27.7 69.3
# 2 US G 2 23 21.7 -22.0 65.4
# 3 US G 3 34 22.6 -19.4 64.6
# 4 US G 4 32 23.5 -20.2 67.2
# 5 US G 5 12 24.4 -24.1 72.9
# 6 US I 1 12 20.8 -33.9 75.5
# 7 US I 2 34 30.5 -18.8 79.8
# 8 US I 3 45 40.2 -7.17 87.6
# 9 US I 4 65 49.9 0.595 99.2
#10 US I 5 45 59.6 4.90 114.
#11 UK A 1 45 36.6 -6.05 79.2
#12 UK A 2 34 37.1 -1.34 75.5
#13 UK A 3 23 37.6 0.667 74.5
#14 UK A 4 43 38.1 -0.341 76.5
#15 UK A 5 43 38.6 -4.05 81.2
And one more:
library(tidyverse)
data %>%
  mutate(CountryArea = paste0(country, Area) %>% factor() %>% fct_inorder()) %>%
  split(.$CountryArea) %>%
  map(~ lm(amount ~ week, data = .)) %>%
  map(predict, interval = "prediction", level = 0.95) %>%
  reduce(rbind) %>%
  cbind(data, .)
country Area week amount fit lwr upr
1 US G 1 12 20.8 -27.6791493 69.27915
2 US G 2 23 21.7 -21.9985147 65.39851
3 US G 3 34 22.6 -19.3841749 64.58417
4 US G 4 32 23.5 -20.1985147 67.19851
5 US G 5 12 24.4 -24.0791493 72.87915
6 US I 1 12 20.8 -33.8985900 75.49859
7 US I 2 34 30.5 -18.8046427 79.80464
8 US I 3 45 40.2 -7.1703685 87.57037
9 US I 4 65 49.9 0.5953573 99.20464
10 US I 5 45 59.6 4.9014100 114.29859
11 UK A 1 45 36.6 -6.0463638 79.24636
12 UK A 2 34 37.1 -1.3409128 75.54091
13 UK A 3 23 37.6 0.6671656 74.53283
14 UK A 4 43 38.1 -0.3409128 76.54091
15 UK A 5 43 38.6 -4.0463638 81.24636

Looping over Dataframes and Columns in R

I have a dataframe with this structure:
df <- read.table(text="
site date v1 v2 v3 v4
a 2019-08-01 0 17 94 150
b 2019-08-01 5 25 83 148
c 2019-08-01 6 39 43 148
d 2019-08-01 10 39 144 165
a 2019-03-31 4 15 106 154
b 2019-03-31 4 21 70 151
c 2019-03-31 8 30 44 148
d 2019-03-31 9 41 144 160
a 2019-01-04 3 10 104 153
b 2019-01-04 2 16 90 150
c 2019-01-04 8 40 62 151
d 2019-01-04 9 43 142 162
a 2019-07-07 3 14 93 152
b 2019-07-07 2 23 74 147
c 2019-07-07 9 31 58 147
d 2019-07-07 9 36 123 170
a 2019-06-17 0 12 91 153
b 2019-06-17 3 25 73 147
c 2019-06-17 7 35 45 146
d 2019-06-17 8 40 134 168
a 2019-01-11 4 14 104 153
b 2019-01-11 5 18 73 151
c 2019-01-11 7 35 65 147
d 2019-01-11 11 44 134 168
a 2019-11-11 4 20 103 152
b 2019-11-11 6 22 79 152
c 2019-11-11 5 38 52 147
d 2019-11-11 10 38 144 163
a 2019-09-06 3 13 102 155
b 2019-09-06 6 17 74 149
c 2019-09-06 9 32 45 146
d 2019-09-06 11 42 138 165
", header=TRUE, stringsAsFactors=FALSE)
Now, I would like to calculate the statistics (min, max, mean, median, sd) of the variables (v1-v4) for each site, for the full year, for summer only, and for winter only.
First I subsetted the data for the summer and winter using the following code:
df_summer <- selectByDate(df, month = c(4:9))
df_winter <- selectByDate(df, month = c(1,2,3,10,11,12))
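(selectByDate() is from the openair package. If that dependency is unwanted, the same subsets can be built in base R; a sketch assuming date parses with as.Date():)
mo <- as.integer(format(as.Date(df$date), "%m"))
df_summer <- df[mo %in% 4:9, ]
df_winter <- df[mo %in% c(1, 2, 3, 10, 11, 12), ]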
Then I tried to build a loop over the seasons and then over the variables. For this I created a list of dataframes and a vector of column names:
df_list <- list(df, df_summer, df_winter)
col_names <- c("v1", "v2", "v3", "v4")
which I then tried to implement in the loop:
for (i in seq_along(df_list)){
for (j in col_names[,i]){
[j]_[i] <- describeBy([i]$[,j], [i]$site)
[j]_[i] <- data.frame(matrix(unlist([j]_[i]), nrow=length([j]_[i]), byrow=T))
[j]_[i]$site <- c("Frau2", "MW", "Sys1", "Sys4")
[j]_[i]$season <- c([i], [i], [i], [i])
[j]_[i]$type <- c([j], [j], [j], [j])
}
}
But this did not work - I get the messages:
Error: unexpected '[' in:
"for (j in col_names[,i]){
["
Error: unexpected '[' in " ["
Error: unexpected '}' in " }"
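(The [j]_[i] construct is the immediate problem: R identifiers cannot be built by pasting bracketed values together. A rough sketch of the indexing pattern the loop seems to aim for, storing results in a nested named list instead, and assuming describeBy() comes from the psych package:)
library(psych)
season_names <- c("full_year", "summer", "winter")
results <- list()
for (i in seq_along(df_list)) {
  for (j in col_names) {
    # summarise one variable by site and file it under [[season]][[variable]]
    results[[season_names[i]]][[j]] <- describeBy(df_list[[i]][[j]], df_list[[i]]$site)
  }
}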
I already used the loop-"workflow" to generate the data I wanted, but this was done with copy and paste in order to get the data quick and dirty. Now I would like to tidy up the code.
Do you have an idea how I could make this work, or what I am doing wrong?
Thank you!
Matthias
UPDATE
So I tried what ekoam suggested - thank you for that! - and the following problems occurred.
Contrary to the comments I wrote below ekoam's answer, the error occurs with both datasets (the example one provided here and the actual one I'm using - I'm not sure whether I'm allowed to publish the dataset).
This is the code I used and the error I got:
df <- read_excel("C:/###/###/###/Example_data.xlsx")
df <- subset(data_watersamples, site %in% c("a","b","c", "d"))
my_summary <-
. %>%
group_by(site) %>%
summarise_at(vars(
c(v1, v2, v3, v4),
list(min = min, max = max, mean = mean, median = median, sd = sd)
)) %>%
pivot_longer(-site, names_to = c("type", "stat"), names_sep = "_") %>%
pivot_wider(names_from = "stat")
summer <- as.integer(format.Date(df$date, "%m")) %in% 4:9
df_list <- list(full_year = df, summer = df[summer, ], winter = df[!summer, ])
lapply(df_list, my_summary)
and get this error:
Error: Must subset columns with a valid subscript vector.
x Subscript has the wrong type `list`.
i It must be numeric or character.
Run `rlang::last_error()` to see where the error occurred.
> rlang::last_error()
Error in `*tmp*`[[id - n]] :
attempt to select more than one element in integerOneIndex
(Note: in the code above, the closing parenthesis of vars() appears misplaced, so the list() of summary functions ends up inside vars(); that would produce exactly this "wrong type list" subscript error. summarise_at(vars(v1, v2, v3, v4), list(...)) matches the intended call.)
Thanks for your help!
Matthias
As you want things to be tidy, how about this tidyverse approach to your problem?
library(dplyr)
library(tidyr)
my_summary <-
  . %>%
  group_by(site) %>%
  summarise(across(
    c(v1, v2, v3, v4),
    list(min = min, max = max, mean = mean, median = median, sd = sd)
  )) %>%
  pivot_longer(-site, names_to = c("type", "stat"), names_sep = "_") %>%
  pivot_wider(names_from = "stat")
summer <- as.integer(format.Date(df$date, "%m")) %in% 4:9
df_list <- list(full_year = df, summer = df[summer, ], winter = df[!summer, ])
lapply(df_list, my_summary)
Output
`summarise()` ungrouping output (override with `.groups` argument)
`summarise()` ungrouping output (override with `.groups` argument)
`summarise()` ungrouping output (override with `.groups` argument)
$full_year
# A tibble: 16 x 7
site type min max mean median sd
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 a v1 0 4 2.62 3 1.69
2 a v2 10 20 14.4 14 3.07
3 a v3 91 106 99.6 102. 5.93
4 a v4 150 155 153. 153 1.49
5 b v1 2 6 4.12 4.5 1.64
6 b v2 16 25 20.9 21.5 3.52
7 b v3 70 90 77 74 6.63
8 b v4 147 152 149. 150. 1.92
9 c v1 5 9 7.38 7.5 1.41
10 c v2 30 40 35 35 3.78
11 c v3 43 65 51.8 48.5 8.84
12 c v4 146 151 148. 147 1.60
13 d v1 8 11 9.62 9.5 1.06
14 d v2 36 44 40.4 40.5 2.67
15 d v3 123 144 138. 140 7.38
16 d v4 160 170 165. 165 3.40
$summer
# A tibble: 16 x 7
site type min max mean median sd
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 a v1 0 3 1.5 1.5 1.73
2 a v2 12 17 14 13.5 2.16
3 a v3 91 102 95 93.5 4.83
4 a v4 150 155 152. 152. 2.08
5 b v1 2 6 4 4 1.83
6 b v2 17 25 22.5 24 3.79
7 b v3 73 83 76 74 4.69
8 b v4 147 149 148. 148. 0.957
9 c v1 6 9 7.75 8 1.5
10 c v2 31 39 34.2 33.5 3.59
11 c v3 43 58 47.8 45 6.90
12 c v4 146 148 147. 146. 0.957
13 d v1 8 11 9.5 9.5 1.29
14 d v2 36 42 39.2 39.5 2.5
15 d v3 123 144 135. 136 8.85
16 d v4 165 170 167 166. 2.45
$winter
# A tibble: 16 x 7
site type min max mean median sd
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 a v1 3 4 3.75 4 0.5
2 a v2 10 20 14.8 14.5 4.11
3 a v3 103 106 104. 104 1.26
4 a v4 152 154 153 153 0.816
5 b v1 2 6 4.25 4.5 1.71
6 b v2 16 22 19.2 19.5 2.75
7 b v3 70 90 78 76 8.83
8 b v4 150 152 151 151 0.816
9 c v1 5 8 7 7.5 1.41
10 c v2 30 40 35.8 36.5 4.35
11 c v3 44 65 55.8 57 9.60
12 c v4 147 151 148. 148. 1.89
13 d v1 9 11 9.75 9.5 0.957
14 d v2 38 44 41.5 42 2.65
15 d v3 134 144 141 143 4.76
16 d v4 160 168 163. 162. 3.40
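If one combined table is preferred over a list of three tibbles, the per-season summaries can be row-bound with an identifying column. A small follow-up sketch using dplyr::bind_rows():
bind_rows(lapply(df_list, my_summary), .id = "season")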

Find first previous lower value for each value in dataframe column

Given the following dataframe:
set.seed(1)
library(dplyr)    # for the %>% pipe
library(stringr)  # provides the `words` vector
my_df = data.frame(x = rep(words[1:5], 50) %>% sort(),
                   y = 1:250,
                   z = sample(seq(from = 30, to = 90, by = 0.1), size = 250, replace = TRUE))
my_df %>% head(30)
x y z
1 a 1 45.9
2 a 2 52.3
3 a 3 64.4
4 a 4 84.5
5 a 5 42.1
6 a 6 83.9
7 a 7 86.7
8 a 8 69.7
9 a 9 67.8
10 a 10 33.7
11 a 11 42.3
12 a 12 40.6
13 a 13 71.2
14 a 14 53.0
15 a 15 76.2
16 a 16 59.9
17 a 17 73.1
18 a 18 89.6
19 a 19 52.8
20 a 20 76.7
21 a 21 86.1
22 a 22 42.7
23 a 23 69.1
24 a 24 37.5
25 a 25 46.0
26 a 26 53.2
27 a 27 30.8
28 a 28 52.9
29 a 29 82.2
30 a 30 50.4
I would like to create the following column using dplyr mutate:
for each value in column z, show the row index of the closest previous value in z which is lower.
For example:
for row 8 show 5
for row 22 show 12
I'm not sure how to do this using dplyr, but here is a data.table attempt using a self non-equi join
library(data.table)
setDT(my_df) %>% #convert to data.table
# Run a self non-equi join and find the closest lower value
.[., .N - which.max(rev(z < i.z)) + 1L, on = .(y <= y), by = .EACHI] %>%
# filter the cases where there are no such values
.[y != V1] %>%
# join the result back to the original data
my_df[., on = .(y), res := V1]
head(my_df, 22)
# x y z res
# 1: a 1 45.9 NA
# 2: a 2 52.3 1
# 3: a 3 64.4 2
# 4: a 4 84.5 3
# 5: a 5 42.1 NA
# 6: a 6 83.9 5
# 7: a 7 86.7 6
# 8: a 8 69.7 5
# 9: a 9 67.8 5
# 10: a 10 33.7 NA
# 11: a 11 42.3 10
# 12: a 12 40.6 10
# 13: a 13 71.2 12
# 14: a 14 53.0 12
# 15: a 15 76.2 14
# 16: a 16 59.9 14
# 17: a 17 73.1 16
# 18: a 18 89.6 17
# 19: a 19 52.8 12
# 20: a 20 76.7 19
# 21: a 21 86.1 20
# 22: a 22 42.7 12
I have managed to find a dplyr solution inspired
by a solution given to one of my other questions using rollapply
in this link.
library(dplyr)
library(stringr)  # provides `words`
library(zoo)      # provides rollapply()
set.seed(1)
my_df = data.frame(x = rep(words[1:5], 50) %>% sort(),
                   y = 1:250,
                   z = sample(seq(from = 30, to = 90, by = 0.1), size = 250, replace = TRUE))
my_df %>%
  mutate(First_Lower_z_Backwards = row_number() - rollapply(z,
                                                            width = list(0:(-n())),
                                                            FUN = function(x) which(x < x[1])[1] - 1,
                                                            fill = NA,
                                                            partial = TRUE)) %>%
  head(22)
x y z First_Lower_z_Backwards
1 a 1 45.9 NA
2 a 2 52.3 1
3 a 3 64.4 2
4 a 4 84.5 3
5 a 5 42.1 NA
6 a 6 83.9 5
7 a 7 86.7 6
8 a 8 69.7 5
9 a 9 67.8 5
10 a 10 33.7 NA
11 a 11 42.3 10
12 a 12 40.6 10
13 a 13 71.2 12
14 a 14 53.0 12
15 a 15 76.2 14
16 a 16 59.9 14
17 a 17 73.1 16
18 a 18 89.6 17
19 a 19 52.8 12
20 a 20 76.7 19
21 a 21 86.1 20
22 a 22 42.7 12
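For comparison, a plain base R version of the same backward scan; it loops once per row, so it is quadratic, but it is easy to check against the output above (a sketch):
my_df$First_Lower_z_Backwards <- vapply(seq_len(nrow(my_df)), function(i) {
  smaller <- which(my_df$z[seq_len(i - 1)] < my_df$z[i])  # earlier rows with a lower z
  if (length(smaller)) max(smaller) else NA_integer_      # closest previous one, or NA
}, integer(1))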
