fill sequence of scaled numbers in r - r

I'm trying to complete a data.frame with scaled scores.
First I have a set of scores that relate to a grade, and a universal score that has been calculated.
library(dplyr)
df <- tibble(grade = c("X", "E", "D", "C", "B", "A", "Max"),
score = c(0,17,25,33,41,48,60),
universal = c(0,22,44,65,87,108,108))
I expand the frame to include all integer values of score
df %>% complete(score = full_seq(score, period = 1)) %>%
fill(grade, .direction = "down")
I now want to complete the universal score that relates to each integer score based on the relative steps between the previously defined universal scores for each grade.
This is based on a conversion/scaling factor:
(universal boundary for grade above - universal boundary below)/(score boundary grade above - score boundary grade below)
For the grade U this would be (22-0)/(17-0) = 1.29. Each previous score is summed with this factor to find the corresponding next universal score.
So the first part of the result should look like this:
score grade universal
0 U 0
1 U 1.29
2 U 2.59
3 U 3.88
4 U 5.18
5 U 6.47
6 U 7.76
7 U 9.06
8 U 10.35
9 U 11.65
10 U 12.94
11 U 14.24
12 U 15.53
13 U 16.82
14 U 18.12
15 U 19.41
16 U 20.71
17 N 22.00
I'm trying to achieve this with Tidy principles and various combinations of group_by(), complete(), seq(), etc., but haven't been able to achieve it in a neat way. I think my problem is that my max value is outside the grouping variable.
Any help will be much appreciated.

Base R has the approx function to do this linear interpolation. You can use it in a tidyverse context like this:
df %>%
complete(score = full_seq(score, period = 1)) %>%
fill(grade, .direction = "down") %>%
mutate(universal = approx(x=score,y=universal,xout=score)$y)
# A tibble: 61 × 3
score grade universal
<dbl> <chr> <dbl>
1 0 X 0
2 1 X 1.29
3 2 X 2.59
4 3 X 3.88
5 4 X 5.18
6 5 X 6.47
7 6 X 7.76
8 7 X 9.06
9 8 X 10.4
10 9 X 11.6

df %>% mutate(
inc = c(diff(universal) / diff(score), NA)
) %>%
complete(score = full_seq(score, period = 1)) %>%
fill(grade, inc, .direction = "down") %>%
group_by(grade) %>%
mutate(universal = first(universal) + (row_number() - 1) * inc) %>%
ungroup() %>%
print(n = 30)
# # A tibble: 61 × 4
# score grade universal inc
# <dbl> <chr> <dbl> <dbl>
# 1 0 X 0 1.29
# 2 1 X 1.29 1.29
# 3 2 X 2.59 1.29
# 4 3 X 3.88 1.29
# 5 4 X 5.18 1.29
# 6 5 X 6.47 1.29
# 7 6 X 7.76 1.29
# 8 7 X 9.06 1.29
# 9 8 X 10.4 1.29
# 10 9 X 11.6 1.29
# 11 10 X 12.9 1.29
# 12 11 X 14.2 1.29
# 13 12 X 15.5 1.29
# 14 13 X 16.8 1.29
# 15 14 X 18.1 1.29
# 16 15 X 19.4 1.29
# 17 16 X 20.7 1.29
# 18 17 E 22 2.75
# 19 18 E 24.8 2.75
# 20 19 E 27.5 2.75
# 21 20 E 30.2 2.75
# 22 21 E 33 2.75
# 23 22 E 35.8 2.75
# 24 23 E 38.5 2.75
# 25 24 E 41.2 2.75
# 26 25 D 44 2.62
# 27 26 D 46.6 2.62
# 28 27 D 49.2 2.62
# 29 28 D 51.9 2.62
# 30 29 D 54.5 2.62
# # … with 31 more rows
# # ℹ Use `print(n = ...)` to see more rows

Related

Particular ratio using dplyr and tidyr

I'd like to create a new velocity variable. In my data set:
library(dplyr)
library(tidyr)
day <- c(0,47,76,118,160,193,227,262,306,355,396,450)
AT <- c(0.14,0.48,0.83,0.83,0.94,0.94,0.94,0.94,0.94,11.93,12.81,29.36)
ClassType <- c("Class_0_1","Class_0_1","Class_0_1","Class_0_1","Class_0_1","Class_0_1",
"Class_0_1","Class_0_1","Class_0_1","Class_9_25","Class_9_25","Class_25_50")
ClassMax <-c(1,1,1,1,1,1,1,1,1,25,25,50)
my.ds <- data.frame(day,AT,ClassType,ClassMax)
my.ds
# day AT ClassType ClassMax
# 1 0 0.14 Class_0_1 1
# 2 47 0.48 Class_0_1 1
# 3 76 0.83 Class_0_1 1
# 4 118 0.83 Class_0_1 1
# 5 160 0.94 Class_0_1 1
# 6 193 0.94 Class_0_1 1
# 7 227 0.94 Class_0_1 1
# 8 262 0.94 Class_0_1 1
# 9 306 0.94 Class_0_1 1
# 10 355 11.93 Class_9_25 25
# 11 396 12.81 Class_9_25 25
# 12 450 29.36 Class_25_50 50
If ClassType changes, take the next AT value minus actual ClassType values and divide by the difference between the two correspondent dates. In my case:
(11.93 0.94) / (355-306)
#[1] 0.2242857
(12.81-11.93) / (396-355)
#[1] 0.02146341
(29.36-12.81) / (450-396)
#[1] 0.3064815
But if AT is in a new ClassType but do not change based in ClassMax then ignore it.
I have a min to max custom ordination complte.cases <- c("Class_0_1","Class_1_3","Class_3_9", "Class_9_25","Class_25_50","Class_50").
I'd like to repeat the last velocity value inside the intermediate absent ClassType.
I try to do without success:
my.ds$velocity <- c(0,diff(my.ds$AT))/c(0,diff(my.ds$day))
final.ds <- %>%
group_by(nest,ClassType)%>%
summarize(velocity=mean(velocity)) %>%
complete(ClassType, tidyr:fill = list(velocity = NA)) %>%
fill(velocity, .direction = "downup")
}
My desirable output must to be:
final.ds
# ClassType velocity
# Class_ 0_1 0.224285714
# Class_ 1_3 0.224285714
# Class_ 3_9 0.224285714
# Class_ 9_25 0.224285714
# Class_ 9_25 0.021463415
# Class_ 9_25 0.306481481
Please, any help with it?
How about this:
my.ds %>%
group_by(ClassType) %>%
mutate(velocity = c(NA, diff(AT) / diff(day))) %>%
ungroup()
# # A tibble: 12 x 5
# day AT ClassType ClassMax velocity
# <dbl> <dbl> <chr> <dbl> <dbl>
# 1 0 0.14 Class_0_1 1 NA
# 2 47 0.48 Class_0_1 1 0.00723
# 3 76 0.83 Class_0_1 1 0.0121
# 4 118 0.83 Class_0_1 1 0
# 5 160 0.94 Class_0_1 1 0.00262
# 6 193 0.94 Class_0_1 1 0
# 7 227 0.94 Class_0_1 1 0
# 8 262 0.94 Class_0_1 1 0
# 9 306 0.94 Class_0_1 1 0
# 10 355 11.9 Class_9_25 25 NA
# 11 396 12.8 Class_9_25 25 0.0215
# 12 450 29.4 Class_25_50 50 NA
complete.cases <- c("Class_0_1","Class_1_3","Class_3_9", "Class_9_25","Class_25_50")
my.ds %>% group_by(ClassType = factor(ClassType, levels = complete.cases), grp = lag(match(ClassType, unique(ClassType)), default = 1)) %>% slice_tail(n = 1) %>%
ungroup %>%summarise(ClassType, velocity = c(NA, diff(AT))/c(NA, diff(day))) %>%
complete(ClassType) %>%
fill(velocity, .direction = "updown")
# ClassType velocity
# <fct> <dbl>
# 1 Class_0_1 0.224
# 2 Class_1_3 0.224
# 3 Class_3_9 0.224
# 4 Class_9_25 0.224
# 5 Class_9_25 0.0215
# 6 Class_25_50 0.306

clarification in nycflight2013

Tried to find out avg delay in arrival for the nycflight2013 data set
flights %>%
group_by(carrier) %>%
summarize(avg_dep_delay=mean(arr_delay))
but result showing NA except one
As said in the comments, you need to set na.rm = TRUE in your mean function. You can use the following code:
library(nycflights13)
library(tidyverse)
flights %>%
group_by(carrier) %>%
summarize(avg_dep_delay=mean(arr_delay, na.rm = TRUE))
Output:
# A tibble: 16 × 2
carrier avg_dep_delay
<chr> <dbl>
1 9E 7.38
2 AA 0.364
3 AS -9.93
4 B6 9.46
5 DL 1.64
6 EV 15.8
7 F9 21.9
8 FL 20.1
9 HA -6.92
10 MQ 10.8
11 OO 11.9
12 UA 3.56
13 US 2.13
14 VX 1.76
15 WN 9.65
16 YV 15.6

R:dplyr summarise data by group with nth() call with variable n calculated during aggregation

I'm aggregating data with variable bin sizes (see previous question here: R: aggregate every n rows with variable n depending on sum(n) of second column). In addition to calculating sums and means over groups of variable ranges, I need to pull out single-value covariates at the midpoint of each group range. When I try to do this on the fly, I only get a value for the first group and NAs for the remaining.
df.summary<-as.data.frame(df %>%
mutate(rn = row_number()) %>%
group_by(grp = (cumsum(d)-1)%/% 100 + 1) %>%
summarise(x=mean(x, na.rm = TRUE), d=sum(d, na.rm=T), ,i.start=first(rn), i.end=last(rn), y=nth(y, round(first(rn)+(last(rn)-first(rn))/2-1))))
head(df.summary)
grp x d i.start i.end y
1 1 0.07458317 88.99342 1 4 19.78992
2 2 0.07594546 97.62130 5 8 NA
3 3 0.05353308 104.69683 9 12 NA
4 4 0.06498291 106.23468 13 16 NA
5 5 0.08601759 98.24939 17 20 NA
6 6 0.06262427 84.43745 21 23 NA
sample data:
structure(list(x = c(0.10000112377193, 0.110742170350877, 0.0300274304561404,
0.0575619395964912, 0.109060465438596, 0.0595491225614035, 0.0539270264912281,
0.0812452063859649, 0.0341699389122807, 0.0391744879122807, 0.0411787485614035,
0.0996091644385965, 0.0970479474912281, 0.0595715843684211, 0.0483489989122807,
0.0549631194561404, 0.0705080555964912, 0.080437472631579, 0.105883664631579,
0.0872411613684211, 0.103236660631579, 0.0381296894912281, 0.0465064491578947,
0.0936565184561403, 0.0410095752631579, 0.0311180032105263, 0.0257758157894737,
0.0354721928947368, 0.0584999394736842, 0.0241286060175439, 0.112053376666667,
0.0769823868596491, 0.0558137530526316, 0.0374491000701754, 0.0419279142631579,
0.0260257506842105, 0.0544360374561404, 0.107411071842105, 0.103873468,
0.0419322114035088, 0.0483912961052632, 0.0328373653157895, 0.0866868717719298,
0.063990467245614, 0.0799280314035088, 0.123490407070175, 0.145676836280702,
0.0292878782807018, 0.0432093036666667, 0.0203547443684211),
d = c(22.2483512600033, 22.2483529247042, 22.2483545865809,
22.2483562542823, 22.24835791863, 25.1243105415557, 25.1243148759953,
25.1243192107884, 25.1243235416981, 25.1243278750792, 27.2240858553058,
27.2240943134697, 27.2241027638674, 27.224111222031, 27.2241196741942,
24.5623431981188, 24.5623453409221, 24.5623474809012, 24.562349626705,
24.5623517696847, 28.1458125837154, 28.1458157376341, 28.1458188889053,
28.1458220452951, 28.1458251983314, 27.8293318542146, 27.8293366652115,
27.8293414829159, 27.829346292148, 27.8293511094993, 27.5271773325046,
27.5271834011289, 27.5271894694002, 27.5271955369655, 27.5272016048837,
28.0376097925214, 28.0376146410729, 28.0376194959786, 28.0376243427651,
28.0376291969647, 26.8766095768196, 26.8766122563318, 26.8766149309023,
26.8766176123562, 26.8766202925746, 27.8736950101666, 27.8736960528853,
27.8736971017815, 27.8736981446767, 27.8736991932199), y = c(19.79001,
19.789922, 19.789834, 19.789746, 19.789658, 19.78957, 19.789468,
19.789366, 19.789264, 19.789162, 19.78906, 19.78896, 19.78886,
19.78876, 19.78866, 19.78856, 19.788458, 19.788356, 19.788254,
19.788152, 19.78805, 19.787948, 19.787846, 19.787744, 19.787642,
19.78754, 19.787442, 19.787344, 19.787246, 19.787148, 19.78705,
19.786956, 19.786862, 19.786768, 19.786674, 19.78658, 19.786486,
19.786392, 19.786298, 19.786204, 19.78611, 19.786016, 19.785922,
19.785828, 19.785734, 19.78564, 19.785544, 19.785448, 19.785352,
19.785256)), row.names = c(NA, 50L), class = "data.frame")
Let's add variable z and n in summarise part. Those variables are defined as below.
df %>%
mutate(rn = row_number()) %>%
group_by(grp = (cumsum(d)-1)%/% 100 + 1) %>%
summarise(x=mean(x, na.rm = TRUE),
d=sum(d, na.rm=T), ,i.start=first(rn),
i.end=last(rn),
z = round(first(rn)+(last(rn)-first(rn))/2-1),
n = n())
grp x d i.start i.end z n
<dbl> <dbl> <dbl> <int> <int> <dbl> <int>
1 1 0.0746 89.0 1 4 2 4
2 2 0.0759 97.6 5 8 6 4
3 3 0.0535 105. 9 12 10 4
4 4 0.0650 106. 13 16 14 4
5 5 0.0860 98.2 17 20 18 4
6 6 0.0626 84.4 21 23 21 3
7 7 0.0479 112. 24 27 24 4
8 8 0.0394 83.5 28 30 28 3
9 9 0.0706 110. 31 34 32 4
10 10 0.0575 112. 35 38 36 4
11 11 0.0647 83.0 39 41 39 3
12 12 0.0659 108. 42 45 42 4
13 13 0.0854 111. 46 49 46 4
14 14 0.0204 27.9 50 50 49 1
In dataframe above, n indicates sample size of each groups separated by grp. However, as you state group_by(grp), when you call nth(y, z), YOU WILL CALL Z-TH VALUE BY GROUP.
It means that for 5th group, although there exists only 4 values, you call 18th value of y. So it prints NA.
To get this easy, the most simple way I think is use n().
df %>%
mutate(rn = row_number()) %>%
group_by(grp = (cumsum(d)-1)%/% 100 + 1) %>%
summarise(x=mean(x, na.rm = TRUE),
d=sum(d, na.rm=T), ,i.start=first(rn),
i.end=last(rn),
y=nth(y, round(n()/2)))
grp x d i.start i.end y
<dbl> <dbl> <dbl> <int> <int> <dbl>
1 1 0.0746 89.0 1 4 19.8
2 2 0.0759 97.6 5 8 19.8
3 3 0.0535 105. 9 12 19.8
4 4 0.0650 106. 13 16 19.8
5 5 0.0860 98.2 17 20 19.8
6 6 0.0626 84.4 21 23 19.8
7 7 0.0479 112. 24 27 19.8
8 8 0.0394 83.5 28 30 19.8
9 9 0.0706 110. 31 34 19.8
10 10 0.0575 112. 35 38 19.8
11 11 0.0647 83.0 39 41 19.8
12 12 0.0659 108. 42 45 19.8
13 13 0.0854 111. 46 49 19.8
14 14 0.0204 27.9 50 50 NA
You'll call floor(n/2)th y, which means y that locates middle of each group. Note that you can also try floor(n/2)+1.
df %>%
mutate(rn = row_number()) %>%
group_by(grp = (cumsum(d)-1)%/% 100 + 1) %>%
summarise(x=mean(x, na.rm = TRUE),
d = sum(d, na.rm=T),
i.start=first(rn),
i.end=last(rn),
y = nth(y, floor(median(rn)) - i.start))

Purrr Multiply index data frame with dataframe

Thank you all for reading this problem.
What i would like to do is multiply my testdata with my index file while matching columns.
So multiplying Dp_water with Dp_water and iterating over all index vars kcal, fat, prot, carbs.
In my test data i have for 10 individuals data on consumption of 4 food groups in grams.
for each individual i would like to calculate the kcal fat prot carb intake.
For each individual i would like to make a new variable
Dp_water_kcal, Dp_coffee_kcal, Dp_soup_kcal , Dp_soda_kcal
Dp_water_fat, Dp_coffee_fat, Dp_soup_fat , Dp_soda_fat
ect...
library(tidyverse)
Sample data
Index file
index <- data.frame(Variable=c("Dp_water","Dp_coffee","Dp_soup","Dp_soda"),
kcal=c(0,10,20,40),
fat=c(0,5,10,15),
prot=c(2,4,6,8),
carbs=c(3,6,9,12))
index <- index %>%
pivot_longer(c(kcal,fat,prot,carbs)) %>%
pivot_wider(names_from = Variable, values_from = value)
> index
# A tibble: 4 x 5
name Dp_water Dp_coffee Dp_soup Dp_soda
<chr> <dbl> <dbl> <dbl> <dbl>
1 kcal 0 10 20 40
2 fat 0 5 10 15
3 prot 2 4 6 8
4 carbs 3 6 9 12
Below subject data consumption of 4 foodgroups.
test_data <- data.frame(Dp_water=c(11:20),
Dp_coffee=c(31:40),
Dp_soup=c(21:30),
Dp_soda=c(41:50),
id=1:10)
Dp_water Dp_coffee Dp_soup Dp_soda id
1 11 31 21 41 1
2 12 32 22 42 2
3 13 33 23 43 3
4 14 34 24 44 4
5 15 35 25 45 5
6 16 36 26 46 6
7 17 37 27 47 7
8 18 38 28 48 8
9 19 39 29 49 9
10 20 40 30 50 10
If i do the following it works. But i would like to do this for all variables and not only kcal. And i would like to be able to keep the id column.
test_data %>%
select(-id) %>%
map2_dfr(., test_data[match(names(.), names(test_data))], ~.x/100 * .y) %>%
set_names(paste0(names(.), "_kcal"))
# A tibble: 10 x 4
Dp_water_kcal Dp_coffee_kcal Dp_soup_kcal Dp_soda_kcal
<dbl> <dbl> <dbl> <dbl>
1 1.21 9.61 4.41 16.8
2 1.44 10.2 4.84 17.6
3 1.69 10.9 5.29 18.5
4 1.96 11.6 5.76 19.4
5 2.25 12.2 6.25 20.2
6 2.56 13.0 6.76 21.2
7 2.89 13.7 7.29 22.1
8 3.24 14.4 7.84 23.0
9 3.61 15.2 8.41 24.0
10 4 16 9 25
Thank you all for any help!

Using "first" in mutate

My dataframe looks something like the first four columns of the following:
ID Obs Seconds Mean Ratio
<chr> <dbl> <dbl> <dbl> <dbl>
1 1815522 1 1 NA 1/10.6
2 1815522 2 26 NA 26/10.6
3 1815522 3 4.68 10.6 4.68/10.6
4 1815522 4 0 10.2 0/10.6
5 1815522 5 1.5 2.06 1.5/10.6
6 1815522 6 2.22 1.24 2.22/10.6
7 1815676 1 12 NA 12/9.67
8 1815676 2 6 NA 6/9.67
9 1815676 3 11 9.67 11/9.67
10 1815676 4 1 6 1/9.67
11 1815676 5 30 14 30/9.67
12 1815676 6 29 20 29/9.67
13 1815676 7 23 27.3 23/9.67
14 1815676 8 51 34.3 51/9.67
I am trying to add a fifth column "Ratio", containing the ratio of each row's value for Seconds, and the ID-group's first not-NA value of Mean. How do I do that?
I've tried several things:
temp %>%
group_by(ID) %>%
mutate(Ratio = case_when(all(is.na(Mean)) ~ NA_real_,
!all(is.na(Mean)) ~ Seconds/(first(Mean[!is.na(Mean)]))))
This gives me the following error:
Error in mutate_impl(.data, dots) :
Column `Ratio` must be length 2 (the group size) or one, not 0
I also tried
temp %>%
group_by(ID) %>%
mutate(Ratio = ifelse(!all(is.na(Mean)), Seconds/(first(Mean[!is.na(Mean)])), NA_real_))
But in this case, it will create a column that looks like this:
Ratio
<dbl>
1 0.0947
2 0.0947
3 0.0947
4 0.0947
5 0.0947
6 0.0947
7 1.24
8 1.24
9 1.24
10 1.24
11 1.24
12 1.24
13 1.24
14 1.24
I really don't know what else to try. Please help! :)
An idea is to use fill with .direction = 'up' since you are interested in the first value, to fill your NAs and simply divide with the first value. No need for case_when to capture all NAs since it will by default give NA as an answer, i.e.
library(tidyverse)
df %>%
group_by(ID) %>%
fill(Mean, .direction = 'up') %>%
mutate(ratio = Seconds / first(Mean))
which gives,
# A tibble: 14 x 5
# Groups: ID [2]
ID Obs Seconds Mean ratio
<int> <int> <dbl> <dbl> <dbl>
1 1815522 1 1 10.6 0.0943
2 1815522 2 26 10.6 2.45
3 1815522 3 4.68 10.6 0.442
4 1815522 4 0 10.2 0
5 1815522 5 1.5 2.06 0.142
6 1815522 6 2.22 1.24 0.209
7 1815676 1 12 9.67 1.24
8 1815676 2 6 9.67 0.620
9 1815676 3 11 9.67 1.14
10 1815676 4 1 6 0.103
11 1815676 5 30 14 3.10
12 1815676 6 29 20 3.00
13 1815676 7 23 27.3 2.38
14 1815676 8 51 34.3 5.27
Try this:
library(tidyverse)
df %>%
group_by(ID) %>%
mutate(
isNA = mean(is.na(Mean)),
Ratio = if_else(isNA == 1, NA_real_, Seconds / first(Mean[!is.na(Mean)]))
)

Resources