I have a small dataset, weekly_data, of projects we're working on, with the anticipated hours per week and the duration in weeks for each of the two milestones, labeled CD and CA:
# A tibble: 17 x 5
dsk_proj_number hrs_per_week_cd cd_dur_weeks hrs_per_week_ca ca_dur_weeks
<fct> <dbl> <dbl> <dbl> <dbl>
1 17061 0 0 2.43 28
2 18009 0 0 1.83 12
3 18029 0 0 2.83 24
4 19029 1.5 16 2.43 28
5 19050 0 0 2.8 20
6 20012 0 0 3.4 20
7 21016 3 8 2.43 28
8 21022 0 0 4.25 16
9 21050 0 0 3.4 20
10 21061a 17.5 24 15.8 52
11 21061b 1.5 4 7.5 8
12 21061c 7.67 12 5 12
13 21061d 0 0 0 0
14 21061e 8 1 3 1
15 21094 0 0 3 8
16 22027 0 0 0.75 8
17 22068 2.92 12 2.38 8
I want to get this into a format where, based on the cd_dur_weeks and ca_dur_weeks durations indicated, I have the estimated number of hours by week, for all the weeks, like this:
> sched %>% head(15)
# A tibble: 15 x 17
`18009` `22068` `17061` `21050` `19029` `21016` `21022` `19050` `18029` `22027` `20012` `21094` `21061a` `21061b` `21061c` `21061d` `21061e`
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1.83 2.92 2.43 3.4 1.5 3 4.25 2.8 2.83 0.75 3.4 3 17.5 1.5 7.67 0 8
2 1.83 2.92 2.43 3.4 1.5 3 4.25 2.8 2.83 0.75 3.4 3 17.5 1.5 7.67 0 3
3 1.83 2.92 2.43 3.4 1.5 3 4.25 2.8 2.83 0.75 3.4 3 17.5 1.5 7.67 0 0
4 1.83 2.92 2.43 3.4 1.5 3 4.25 2.8 2.83 0.75 3.4 3 17.5 1.5 7.67 0 0
5 1.83 2.92 2.43 3.4 1.5 3 4.25 2.8 2.83 0.75 3.4 3 17.5 7.5 7.67 0 0
6 1.83 2.92 2.43 3.4 1.5 3 4.25 2.8 2.83 0.75 3.4 3 17.5 7.5 7.67 0 0
7 1.83 2.92 2.43 3.4 1.5 3 4.25 2.8 2.83 0.75 3.4 3 17.5 7.5 7.67 0 0
8 1.83 2.92 2.43 3.4 1.5 3 4.25 2.8 2.83 0.75 3.4 3 17.5 7.5 7.67 0 0
9 1.83 2.92 2.43 3.4 1.5 2.43 4.25 2.8 2.83 0 3.4 0 17.5 7.5 7.67 0 0
10 1.83 2.92 2.43 3.4 1.5 2.43 4.25 2.8 2.83 0 3.4 0 17.5 7.5 7.67 0 0
11 1.83 2.92 2.43 3.4 1.5 2.43 4.25 2.8 2.83 0 3.4 0 17.5 7.5 7.67 0 0
12 1.83 2.92 2.43 3.4 1.5 2.43 4.25 2.8 2.83 0 3.4 0 17.5 7.5 7.67 0 0
13 0 2.38 2.43 3.4 1.5 2.43 4.25 2.8 2.83 0 3.4 0 17.5 0 5 0 0
14 0 2.38 2.43 3.4 1.5 2.43 4.25 2.8 2.83 0 3.4 0 17.5 0 5 0 0
15 0 2.38 2.43 3.4 1.5 2.43 4.25 2.8 2.83 0 3.4 0 17.5 0 5 0 0
I was able to use pivot_wider() to make the project numbers the variable names and each row an individual week, but I was forced to use for()'s and if()'s. It seems like there should be an easier way to get this done.
Here's the code I used:
sched <- data.frame(dsk_proj_number = weekly_data$dsk_proj_number)
sched$weeks <- NA
sched <- sched %>% pivot_wider(names_from = dsk_proj_number, values_from = weeks)

for(proj_num in weekly_data$dsk_proj_number){
  duration_cd = weekly_data[which(weekly_data$dsk_proj_number == proj_num), "cd_dur_weeks"] %>% as.numeric
  duration_ca = weekly_data[which(weekly_data$dsk_proj_number == proj_num), "ca_dur_weeks"] %>% as.numeric
  if(duration_cd > 0) {
    sched[1:duration_cd, proj_num] = weekly_data[which(weekly_data$dsk_proj_number == proj_num), "hrs_per_week_cd"]
  }
  if(duration_ca > 0) {
    sched[duration_cd + 1:duration_ca, proj_num] = weekly_data[which(weekly_data$dsk_proj_number == proj_num), "hrs_per_week_ca"]
  }
}
sched <- sched %>% mutate_all(coalesce, 0)
You can use rep() to repeat elements a certain number of times, and then use c() to concatenate them into a long sequence. I use rowwise() from dplyr to conveniently do this row by row.
Then you can unnest the lists of vectors.
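To illustrate with one project's numbers from your data (project 19029: 1.5 hrs/week for 16 CD weeks, then 2.43 hrs/week for 28 CA weeks), the per-week vector built this way is just:

c(rep(1.5, 16), rep(2.43, 28))

Applied to every row (and padded with zeros up to the longest schedule), that becomes: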
library(tidyverse)

sched <- weekly_data %>%
  mutate(max_weeks = max(cd_dur_weeks + ca_dur_weeks)) %>%
  rowwise() %>%
  mutate(week = list(c(rep(hrs_per_week_cd, cd_dur_weeks),
                       rep(hrs_per_week_ca, ca_dur_weeks),
                       rep(0, max_weeks - cd_dur_weeks - ca_dur_weeks)))) %>%
  ungroup() %>%
  select(dsk_proj_number, week) %>%
  pivot_wider(names_from = "dsk_proj_number", values_from = week) %>%
  unnest(everything())
Another option is to build a long, week-level table for each milestone, stack the two, and then pivot wider (here df stands for the weekly_data tibble):

df %>%
  select(1:3) %>%
  slice(rep(1:nrow(.), cd_dur_weeks)) %>%
  select(-3) %>%
  mutate(milestone = 1) %>%
  rename(hrs_per_week = hrs_per_week_cd) -> df1

df %>%
  select(c(1, 4, 5)) %>%
  slice(rep(1:nrow(.), ca_dur_weeks)) %>%
  select(-3) %>%
  mutate(milestone = 2) %>%
  rename(hrs_per_week = hrs_per_week_ca) -> df2

rbind(df1, df2) %>%
  arrange(dsk_proj_number, milestone) %>%
  group_by(dsk_proj_number) %>%
  mutate(week = seq_along(dsk_proj_number)) %>%
  pivot_wider(id_cols = week, names_from = dsk_proj_number, values_from = hrs_per_week) %>%
  replace(is.na(.), 0)
Suppose we have the following data:
tib <- tibble::tibble(x = 1:10)
Then, suppose we want to make a function that takes a column as input and returns a tibble with several added columns such as:
library(dplyr)
generate_transformations <- function(data, column){
  transform <- sym(column)
  data %>%
    mutate(
      sqrt  = sqrt(!!transform),
      recip = 1 / !!transform,
      log   = log(!!transform)
    )
}
# Usage is great:
tib %>%
generate_transformations('x')
# A tibble: 10 x 4
x sqrt recip log
<int> <dbl> <dbl> <dbl>
1 1 1 1 0
2 2 1.41 0.5 0.693
3 3 1.73 0.333 1.10
4 4 2 0.25 1.39
5 5 2.24 0.2 1.61
6 6 2.45 0.167 1.79
7 7 2.65 0.143 1.95
8 8 2.83 0.125 2.08
9 9 3 0.111 2.20
10 10 3.16 0.1 2.30
Now my question is, is there a way to avoid unquoting (!!) transform repeatedly?
Yes, I could, e.g., temporarily rename column and then rename it back after I am done, but that is not my interest in this question.
I am interested in whether there is a way to produce a variable that does not need the !!.
While it does not work, I was looking for something like this:
generate_transformations <- function(data, column){
  transform <- !!sym(column) # cannot unquote here :(
  data %>%
    mutate(
      sqrt  = sqrt(transform),
      recip = 1 / transform,
      log   = log(transform)
    )
}
Convert the column name to a string, subset that column from the data, and use the resulting vector as transform (!! only has meaning inside data-masking verbs such as mutate(), which is why the unquoting at the top of the function fails):
generate_transformations <- function(data, column){
  transform <- data[[rlang::as_string(ensym(column))]]
  data %>%
    mutate(
      sqrt  = sqrt(transform),
      recip = 1 / transform,
      log   = log(transform)
    )
}
Testing:
tib %>%
generate_transformations('x')
# A tibble: 10 × 4
x sqrt recip log
<int> <dbl> <dbl> <dbl>
1 1 1 1 0
2 2 1.41 0.5 0.693
3 3 1.73 0.333 1.10
4 4 2 0.25 1.39
5 5 2.24 0.2 1.61
6 6 2.45 0.167 1.79
7 7 2.65 0.143 1.95
8 8 2.83 0.125 2.08
9 9 3 0.111 2.20
10 10 3.16 0.1 2.30
Or create a temporary column and remove it later
generate_transformations <- function(data, column){
  data %>%
    mutate(transform = !!rlang::ensym(column),
           sqrt      = sqrt(transform),
           recip     = 1 / transform,
           log       = log(transform),
           transform = NULL
    )
}
Testing:
tib %>%
generate_transformations('x')
# A tibble: 10 × 4
x sqrt recip log
<int> <dbl> <dbl> <dbl>
1 1 1 1 0
2 2 1.41 0.5 0.693
3 3 1.73 0.333 1.10
4 4 2 0.25 1.39
5 5 2.24 0.2 1.61
6 6 2.45 0.167 1.79
7 7 2.65 0.143 1.95
8 8 2.83 0.125 2.08
9 9 3 0.111 2.20
10 10 3.16 0.1 2.30
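Side note: because ensym() is used, both versions also accept a bare column name, so this works just as well as passing the string:

tib %>%
  generate_transformations(x)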
You can do it in one step if you swap !! for {{ }} and use across():
data_transformations <- function(d, col, funs = list(sqrt = sqrt, log = log, recip = ~ 1 / .)) {
  d %>% mutate(across({{ col }}, .fns = funs))
}

tib %>% data_transformations(x)
# A tibble: 10 × 4
x x_sqrt x_log x_recip
<int> <dbl> <dbl> <dbl>
1 1 1 0 1
2 2 1.41 0.693 0.5
3 3 1.73 1.10 0.333
4 4 2 1.39 0.25
5 5 2.24 1.61 0.2
6 6 2.45 1.79 0.167
7 7 2.65 1.95 0.143
8 8 2.83 2.08 0.125
9 9 3 2.20 0.111
10 10 3.16 2.30 0.1
To get just the function names (sqrt, log, recip) as in your original output, use
data_transformations <- function(d, col, funs = list(sqrt = sqrt, log = log, recip = ~ 1 / .)) {
  d %>% mutate(across({{ col }}, .fns = funs, .names = "{.fn}"))
}

tib %>% data_transformations(x)
# A tibble: 10 × 4
x sqrt log recip
<int> <dbl> <dbl> <dbl>
1 1 1 0 1
2 2 1.41 0.693 0.5
3 3 1.73 1.10 0.333
4 4 2 1.39 0.25
5 5 2.24 1.61 0.2
6 6 2.45 1.79 0.167
7 7 2.65 1.95 0.143
8 8 2.83 2.08 0.125
9 9 3 2.20 0.111
10 10 3.16 2.30 0.1
To handle multiple columns:
data_transformations <- function(d, cols, funs = list(sqrt = sqrt, log = log, recip = ~ 1 / .)) {
  d %>% mutate(across({{ cols }}, .fns = funs))
}
d1 <- tibble(x=1:10, y=seq(2, 20, 2))
d1 %>% data_transformations(c(x, y), list(sqrt=sqrt, log=log))
# A tibble: 10 × 6
x y x_sqrt x_log y_sqrt y_log
<int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 2 1 0 1.41 0.693
2 2 4 1.41 0.693 2 1.39
3 3 6 1.73 1.10 2.45 1.79
4 4 8 2 1.39 2.83 2.08
5 5 10 2.24 1.61 3.16 2.30
6 6 12 2.45 1.79 3.46 2.48
7 7 14 2.65 1.95 3.74 2.64
8 8 16 2.83 2.08 4 2.77
9 9 18 3 2.20 4.24 2.89
10 10 20 3.16 2.30 4.47 3.00
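As a side note, .names takes a glue specification (the default when .fns is a named list is "{.col}_{.fn}", which is where names like x_sqrt come from), so you can shape the output names however you like. A small sketch that produces a column named sqrt_of_x:

d1 %>% mutate(across(x, .fns = list(sqrt = sqrt), .names = "{.fn}_of_{.col}"))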
I may have a strange question... I have a data frame as below:
Station Mean_length Diff
1 AMEL 28.1 -2.91
2 AMRU 21.1 -9.90
3 BALG 31.0 0
4 BORK 30.1 -0.921
5 BUSU 22.6 -8.38
6 CADZ 28.5 2.46
7 DOLL 27.9 -3.07
8 EGMO 28.3 -2.69
9 EIER 30.8 0.233
10 FANO 23.1 -7.89
Now from the column "Diff" I want to derive a new column, adding -0.5 to values below 0 and +0.5 to values above 0.
So I get a new data frame like this:
Station Mean_length Diff Diff05
1 AMEL 28.1 -2.91 -3.41 (-0.5)
2 AMRU 21.1 -9.90 -13.8 (-0.5)
3 BALG 31.0 0 0.5 (+0.5)
4 BORK 30.1 -0.921 -1.421 (-0.5)
5 BUSU 22.6 -8.38 -8.88 (-0.5)
6 CADZ 28.5 2.46 2.96 (+0.5)
7 DOLL 27.9 -3.07 -3.57 (-0.5)
8 EGMO 28.3 -2.69 -3.19 (-0.5)
9 EIER 30.8 0.233 0.733 (+0.5)
10 FANO 23.1 -7.89 -8.39 (-0.5)
How can I tackle this? Is this possible in dplyr, perhaps with the 'ifelse' function, recognizing values that have a '-' in front of them?
Thank you in advance!
Another way (note that sign(0) is 0, so a Diff of exactly 0 is left unchanged here rather than bumped to 0.5):
df$Diff05 <- df$Diff + 0.5 * sign(df$Diff)
Station Mean_length Diff Diff05
1 AMEL 28.1 -2.910 -3.410
2 AMRU 21.1 -9.900 -10.400
3 BALG 31.0 0.000 0.000
4 BORK 30.1 -0.921 -1.421
5 BUSU 22.6 -8.380 -8.880
6 CADZ 28.5 2.460 2.960
7 DOLL 27.9 -3.070 -3.570
8 EGMO 28.3 -2.690 -3.190
9 EIER 30.8 0.233 0.733
10 FANO 23.1 -7.890 -8.390
You could also use df$Diff + (df$Diff > 0) - 0.5: the logical comparison contributes 1 for positive values and 0 otherwise, giving +0.5 or -0.5 respectively (a Diff of exactly 0 ends up at -0.5 with this one).
Does this work?
library(dplyr)
df %>% mutate(Diff05 = if_else(Diff < 0, Diff - 0.5, Diff + 0.5))
# A tibble: 10 x 4
station Mean_length Diff Diff05
<chr> <dbl> <dbl> <dbl>
1 AMEL 28.1 -2.91 -3.41
2 AMRU 21.1 -9.9 -10.4
3 BALG 31 0 0.5
4 BORK 30.1 -0.921 -1.42
5 BUSU 22.6 -8.38 -8.88
6 CADZ 28.5 2.46 2.96
7 DOLL 27.9 -3.07 -3.57
8 EGMO 28.3 -2.69 -3.19
9 EIER 30.8 0.233 0.733
10 FANO 23.1 -7.89 -8.39
The same logic in base R with ifelse():
df$Diff05 <- ifelse(test = df$Diff < 0, yes = df$Diff - 0.5, no = df$Diff + 0.5)
I have this data:
> acic2
# A tibble: 21 x 9
PCC V1.1 V2.2 V3.3 V4.4 V5.5 V6.6 V7.7 Vtotal
<fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 8.33 5.33 6 6.5 7.67 7 5.33 6.60
2 A 8.67 4.33 6.25 7 7.5 7 5.67 6.63
3 B 9 4.33 7 7.25 7.83 6.8 6.17 6.91
4 C 5.17 3.33 5.25 2.75 3.17 4 4.5 4.02
5 C 8 6 6.25 3.5 6.17 5.6 6 5.93
6 D 6.5 5.67 7.25 5.75 5.33 6.4 4 5.84
7 D 6 4.67 6 5.25 3.67 4.6 5 5.03
8 E 6.5 7 6 7 4.33 5.4 5.67 5.99
9 E 9 5.67 6.5 8 6.17 3.6 5 6.28
10 F 9.17 8 6.5 6.25 7 6.4 6.67 7.14
# ... with 11 more rows
I want to create a separate data set called acic3 by averaging the rows that share the same letter in PCC. So I'll get one row for every letter, containing the average score for each column.
You can group_by() and then summarize(across(...)) in dplyr:
acic3 <- acic2 %>%
  group_by(PCC) %>%
  summarize(across(starts_with("V"), mean))
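If you prefer not to rely on the "V" prefix, a minimal sketch of the same idea averages every numeric column instead (this assumes all columns other than PCC are numeric, as in your data):

acic3 <- acic2 %>%
  group_by(PCC) %>%
  summarize(across(where(is.numeric), mean), .groups = "drop")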
I am struggling with some data munging. To get to the table below I have used group_by and summarise_at to find the means of Q1-Q10 by cid and time (I started with multiple values for each cid at each time point), then filtered down to just the cids that appear at both time 1 and time 2. Using this (or going back to my raw data if there is a cleaner way), I want to count for each cid how many of the means of Q1-Q10 increased at time 2, then, for each GROUP, find the mean number of increases.
GROUP cid time Q1 Q2 Q3 Q4 Q5 Q6 Q7 Q8 Q9 Q10
A 169 1 4.45 4.09 3.91 3.73 3.82 4.27 3.55 4 4.55 3.91
A 169 2 4.56 4.15 4.06 3.94 4.09 4.53 3.91 3.97 4.12 4.21
A 184 1 4.64 4.18 3.45 3.64 3.82 4.55 3.91 4.27 4 3.55
A 184 2 3.9 3.6 3 3.6 3.4 3.9 3 3.5 3.2 3.1
B 277 1 4.43 4.21 3.64 4.36 4.36 4.57 4.36 4.29 4.07 4.07
B 277 2 4.11 4 3.56 3.44 3.67 4 3.89 3.78 3.44 3.89
...
I have seen examples using spread on the iris data, but those were for the difference in a single variable. Any help is appreciated.
Try this. It gives you, for each GROUP and Q, the proportion of cids whose mean increased at time 2:
df <- read.table(text = "GROUP cid time Q1 Q2 Q3 Q4 Q5 Q6 Q7 Q8 Q9 Q10
A 169 1 4.45 4.09 3.91 3.73 3.82 4.27 3.55 4 4.55 3.91
A 169 2 4.56 4.15 4.06 3.94 4.09 4.53 3.91 3.97 4.12 4.21
A 184 1 4.64 4.18 3.45 3.64 3.82 4.55 3.91 4.27 4 3.55
A 184 2 3.9 3.6 3 3.6 3.4 3.9 3 3.5 3.2 3.1
B 277 1 4.43 4.21 3.64 4.36 4.36 4.57 4.36 4.29 4.07 4.07
B 277 2 4.11 4 3.56 3.44 3.67 4 3.89 3.78 3.44 3.89", header = TRUE)
library(dplyr)
library(tidyr)
df %>%
  # Convert to long format
  pivot_longer(-c(GROUP, cid, time), names_to = "Q") %>%
  # Group by GROUP, cid, Q
  group_by(GROUP, cid, Q) %>%
  # Just in case: sort by time
  arrange(time) %>%
  # Increased at time 2, using lag
  mutate(is_increase = value > lag(value)) %>%
  # Mean increase by GROUP and Q
  group_by(GROUP, Q) %>%
  summarise(mean_inc = mean(is_increase, na.rm = TRUE))
#> # A tibble: 20 x 3
#> # Groups: GROUP [2]
#> GROUP Q mean_inc
#> <fct> <chr> <dbl>
#> 1 A Q1 0.5
#> 2 A Q10 0.5
#> 3 A Q2 0.5
#> 4 A Q3 0.5
#> 5 A Q4 0.5
#> 6 A Q5 0.5
#> 7 A Q6 0.5
#> 8 A Q7 0.5
#> 9 A Q8 0
#> 10 A Q9 0
#> 11 B Q1 0
#> 12 B Q10 0
#> 13 B Q2 0
#> 14 B Q3 0
#> 15 B Q4 0
#> 16 B Q5 0
#> 17 B Q6 0
#> 18 B Q7 0
#> 19 B Q8 0
#> 20 B Q9 0
Created on 2020-04-12 by the reprex package (v0.3.0)
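If you want what the question literally asks for -- the number of Qs that increased per cid, and then the mean of that count per GROUP -- the same long format works. A sketch building on the code above:

df %>%
  pivot_longer(-c(GROUP, cid, time), names_to = "Q") %>%
  group_by(GROUP, cid, Q) %>%
  arrange(time) %>%
  mutate(is_increase = value > lag(value)) %>%
  # Number of Qs whose mean increased, per cid
  group_by(GROUP, cid) %>%
  summarise(n_increased = sum(is_increase, na.rm = TRUE), .groups = "drop_last") %>%
  # Mean number of increases per GROUP
  summarise(mean_increases = mean(n_increased))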
I have 2 data frames, the sampling ("samp") and the coordinates ("coor").
The "samp" data frame:
Plot X Y H L
1 6.4 0.6 3.654 0.023
1 19.1 9.3 4.998 0.023
1 2.4 4.2 5.568 0.024
1 16.1 16.7 5.32 0.074
1 10.8 15.8 6.58 0.026
1 1 16 4.968 0.023
1 9.4 12.4 6.804 0.078
2 3.6 0.4 4.3 0.038
3 12.2 19.9 7.29 0.028
3 2 18.2 7.752 0.028
3 6.5 19.9 7.2 0.028
3 3.7 13.8 5.88 0.042
3 4.9 10.3 9.234 0.061
3 3.7 13.8 5.88 0.042
3 4.9 10.3 9.234 0.061
4 16.3 2.4 5.18 0.02
4 15.7 9.8 10.92 0.096
4 6 12.6 6.96 0.16
5 19.4 16.4 8.2 0.092
10 4.8 5.16 7.38 1.08
11 14.7 16.2 16.44 0.89
11 19 19 10.2 0.047
12 10.8 2.7 19.227 1.2
14 0.6 6.4 12.792 0.108
14 4.6 1.9 12.3 0.122
15 12.2 18 9.6 0.034
16 13 18.3 4.55 0.021
The "coor" data frame:
Plot X Y
1 356154.007 501363.546
2 356154.797 501345.977
3 356174.697 501336.114
4 356226.469 501336.816
5 356255.24 501352.714
10 356529.313 501292.4
11 356334.895 501320.725
12 356593.271 501255.297
14 356350.029 501314.385
15 356358.81 501285.955
16 356637.29 501227.297
17 356652.157 501263.238
18 356691.68 501262.403
19 356755.386 501242.501
20 356813.735 501210.59
22 356980.118 501178.974
23 357044.996 501168.859
24 357133.365 501158.418
25 357146.781 501158.866
26 357172.485 501161.646
I wish to use a for loop to register the "samp" data frame with the GPS coordinates from the "coor" data frame -- e.g., the "new_x" variable is the sum of "X" from "samp" and "X" from "coor" for the matching "Plot" ID.
This is what I tried, but it is not working:
for (i in 1:nrow(samp)){
  if (samp$Plot[i] == coor$Plot[i]){
    samp$new_x[i] <- coor$X[i] + samp$X[i]
  } else {
    samp$new_x[i] <- samp$X[i]
  }
}
The final output I wish to have is the "samp" data frame with a proper coordinate variable ("new_x") added. It should look like this:
Plot X Y H L new_x
1 6.4 0.6 3.654 0.023 356160.407
1 19.1 9.3 4.998 0.023 356173.107
1 2.4 4.2 5.568 0.024 356156.407
1 16.1 16.7 5.32 0.074 356170.107
1 10.8 15.8 6.58 0.026 356164.807
1 1 16 4.968 0.023 356155.007
1 9.4 12.4 6.804 0.078 356163.407
2 3.6 0.4 4.3 0.038 356158.397
3 12.2 19.9 7.29 0.028 356186.897
3 2 18.2 7.752 0.028 356176.697
3 6.5 19.9 7.2 0.028 356181.197
3 3.7 13.8 5.88 0.042 356178.397
3 4.9 10.3 9.234 0.061 356179.597
3 3.7 13.8 5.88 0.042 356178.397
3 4.9 10.3 9.234 0.061 356179.597
4 16.3 2.4 5.18 0.02 356242.769
4 15.7 9.8 10.92 0.096 356242.169
4 6 12.6 6.96 0.16 356232.469
5 19.4 16.4 8.2 0.092 356274.64
10 4.8 5.16 7.38 1.08 356534.113
11 14.7 16.2 16.44 0.89 356349.595
11 19 19 10.2 0.047 356353.895
Any suggestions will be appreciated. Thanks.
You could merge the two datasets and create a new column by summing the X.x and X.y variables.
res <- transform(merge(samp, coor, by='Plot'), new_x=X.x+X.y)[,-c(6:7)]
colnames(res) <- colnames(out)  # `out` is the expected result shown in the question
all.equal(res[1:22,], out, check.attributes=FALSE)
#[1] TRUE
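For completeness, a tidyverse sketch of the same join-and-add idea (the "_plot" suffix for the duplicated coordinate columns is just an illustrative choice):

library(dplyr)

samp %>%
  left_join(coor, by = "Plot", suffix = c("", "_plot")) %>%
  # Add the plot origin to the within-plot X offset
  mutate(new_x = X + X_plot) %>%
  select(-X_plot, -Y_plot)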