Calculate a Weighted Rolling Average by rows by group in r? - r

I have a dataframe games_h. This is just a snippet of the table but it has many teams and is sorted by date, team, game number. I am trying to create a weighted rolling average grouped by the team. I would like the most recent game to be weighted more than two games ago. So the weights would be (Game_1 * 1+ Game_2 *2)/3 or weights equal to 1 with same ratio so weights = c(1-.667, .667).
dput(games_h)
structure(list(GameId = c(16, 16, 37, 37, 57, 57), GameDate = structure(c(17905,
17905, 17916, 17916, 17926, 17926), class = "Date"), NeutralSite = c(0,
0, 0, 0, 0, 0), AwayTeam = c("Virginia Cavaliers", "Virginia Cavaliers",
"Florida State Seminoles", "Florida State Seminoles", "Syracuse Orange",
"Syracuse Orange"), HomeTeam = c("Boston College Eagles", "Boston College Eagles",
"Boston College Eagles", "Boston College Eagles", "Boston College Eagles",
"Boston College Eagles"), Team = c("Virginia Cavaliers", "Boston College Eagles",
"Florida State Seminoles", "Boston College Eagles", "Syracuse Orange",
"Boston College Eagles"), Home = c(0, 1, 0, 1, 0, 1), Score = c(83,
56, 82, 87, 77, 71), AST = c(17, 6, 12, 16, 11, 13), TOV = c(10,
8, 9, 13, 11, 11), STL = c(5, 4, 4, 6, 6, 5), BLK = c(6, 0, 4,
4, 1, 0), Rebounds = c(38, 18, 36, 33, 23, 23), ORB = c(7, 4,
16, 10, 7, 6), DRB = c(31, 14, 20, 23, 16, 17), FGA = c(55, 57,
67, 55, 52, 45), FGM = c(33, 22, 28, 27, 29, 21), X3FGM = c(8,
7, 8, 13, 11, 9), X3FGA = c(19, 25, 25, 21, 26, 22), FTA = c(14,
9, 24, 28, 15, 23), FTM = c(9, 5, 18, 20, 8, 20), Fouls = c(16,
12, 25, 20, 19, 19), Game_Number = 1:6, Count = c(1, 1, 1, 1,
1, 1)), class = c("grouped_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -6L), groups = structure(list(HomeTeam = "Boston College Eagles",
.rows = structure(list(1:6), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -1L), .drop = TRUE))
Here is an example output of the score column.
Here is my failed attempt. The function work correctly but I cannot apply it to the entire dataframe by group.
weighted_avg<-function(x, wt1, wt2) {
rs1 = rollsum(x,1,align = "right")
rs2 = rollsum(x,2,align = "right")
rs1=rs1[-1]
rs3 = rs2 - rs1
weighted_avg= ((rs3 * wt2)+ (rs1*wt1))/(wt1+wt2)
return(weighted_avg)
}
weighted_avg(csum$Score_Y, 2, 1)
apply(csum$Score_Y , 2, weighted_avg, wt1 = 2, wt2=1)
test<-csum %>%
group_by(Team)%>%
group_map(across(c(Score:Fouls), weighted_avg(.x$Team, 2, 1) ))
test<-csum %>%
group_by(Team)%>%
group_walk(across(c(Score:Fouls),weighted_avg(.~,2,1) ))

Here are some notes about the code:
I used slider::slide_dbl function. First we specify the vector for which we would like to compute the moving average Score.
As we need a sliding window of length 2, I used .before argument in slide_dbl to use the previous value and a current value to be used for calculating moving average.
Also I set .complete argument to TRUE to makes sure to only calculate moving average when we have a previous value. In other word we don't have any moveing average in first row.
For more info check the documentation for slider package.
library(tidyverse)
library(slider)
df %>%
group_by(HomeTeam) %>%
summarise(Example = c(NA, slide_dbl(Score, .before = 1, .complete = TRUE,
.f = ~ (.x[1] * 1 + .x[2] * 2) / 3)))
`summarise()` has grouped output by 'HomeTeam'. You can override using the `.groups` argument.
# A tibble: 7 × 2
# Groups: HomeTeam [1]
HomeTeam Example
<chr> <dbl>
1 Boston College Eagles NA
2 Boston College Eagles NA
3 Boston College Eagles 65
4 Boston College Eagles 73.3
5 Boston College Eagles 85.3
6 Boston College Eagles 80.3
7 Boston College Eagles 73
If it is going to calculate moving average for all numeric columns you could try:
df %>%
group_by(HomeTeam) %>%
summarise(across(where(is.numeric), ~ c(NA, slide_dbl(., .before = 1, .complete = TRUE,
.f = ~ (.x[1] * 1 + .x[2] * 2) / 3)))) %>%
ungroup()
`summarise()` has grouped output by 'HomeTeam'. You can override using the `.groups` argument.
# A tibble: 7 × 21
HomeTeam GameId NeutralSite Home Score AST TOV STL BLK Rebounds ORB DRB FGA FGM
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Boston C… NA NA NA NA NA NA NA NA NA NA NA NA NA
2 Boston C… NA NA NA NA NA NA NA NA NA NA NA NA NA
3 Boston C… 16 0 0.667 65 9.67 8.67 4.33 2 24.7 5 19.7 56.3 25.7
4 Boston C… 30 0 0.333 73.3 10 8.67 4 2.67 30 12 18 63.7 26
5 Boston C… 37 0 0.667 85.3 14.7 11.7 5.33 4 34 12 22 59 27.3
6 Boston C… 50.3 0 0.333 80.3 12.7 11.7 6 2 26.3 8 18.3 53 28.3
7 Boston C… 57 0 0.667 73 12.3 11 5.33 0.333 23 6.33 16.7 47.3 23.7
# … with 7 more variables: X3FGM <dbl>, X3FGA <dbl>, FTA <dbl>, FTM <dbl>, Fouls <dbl>,
# Game_Number <dbl>, Count <dbl>

Related

How to find specific multi-variable thresholds for AUC cutpoint?

I want to figure out how to find the variable cutoff points for a glm. Cutpointr does this really well for single variable but with more complex models how would I go about extracting this info? Say I did ~ glucose + age + mass and wanted at this cutoff values x, y and z are present or is this not possible? would it be more like this number is the summed cutoff of the variables? Also sorry for no images it wouldn't let me add them in!
#for reprex
library(reprex)
library(mlbench)
library(tidymodels)
library(cutpointr)
#>
#> Attaching package: 'cutpointr'
#> The following objects are masked from 'package:yardstick':
#>
#> accuracy, npv, ppv, precision, recall, sensitivity, specificity
library(ggplot2)
library(tidyverse)
data(PimaIndiansDiabetes2)
head(PimaIndiansDiabetes2)
#> pregnant glucose pressure triceps insulin mass pedigree age diabetes
#> 1 6 148 72 35 NA 33.6 0.627 50 pos
#> 2 1 85 66 29 NA 26.6 0.351 31 neg
#> 3 8 183 64 NA NA 23.3 0.672 32 pos
#> 4 1 89 66 23 94 28.1 0.167 21 neg
#> 5 0 137 40 35 168 43.1 2.288 33 pos
#> 6 5 116 74 NA NA 25.6 0.201 30 neg
Diabetes <- na.omit(PimaIndiansDiabetes2)
glimpse(PimaIndiansDiabetes2)
#> Rows: 768
#> Columns: 9
#> $ pregnant <dbl> 6, 1, 8, 1, 0, 5, 3, 10, 2, 8, 4, 10, 10, 1, 5, 7, 0, 7, 1, 1…
#> $ glucose <dbl> 148, 85, 183, 89, 137, 116, 78, 115, 197, 125, 110, 168, 139,…
#> $ pressure <dbl> 72, 66, 64, 66, 40, 74, 50, NA, 70, 96, 92, 74, 80, 60, 72, N…
#> $ triceps <dbl> 35, 29, NA, 23, 35, NA, 32, NA, 45, NA, NA, NA, NA, 23, 19, N…
#> $ insulin <dbl> NA, NA, NA, 94, 168, NA, 88, NA, 543, NA, NA, NA, NA, 846, 17…
#> $ mass <dbl> 33.6, 26.6, 23.3, 28.1, 43.1, 25.6, 31.0, 35.3, 30.5, NA, 37.…
#> $ pedigree <dbl> 0.627, 0.351, 0.672, 0.167, 2.288, 0.201, 0.248, 0.134, 0.158…
#> $ age <dbl> 50, 31, 32, 21, 33, 30, 26, 29, 53, 54, 30, 34, 57, 59, 51, 3…
#> $ diabetes <fct> pos, neg, pos, neg, pos, neg, pos, neg, pos, pos, neg, pos, n…
cp <- cutpointr(Diabetes, glucose, diabetes,
method = maximize_metric, metric = sum_sens_spec)
#> Assuming the positive class is pos
#> Assuming the positive class has higher x values
plot_cutpointr(cp)
summary(cp)
#> Method: maximize_metric
#> Predictor: glucose
#> Outcome: diabetes
#> Direction: >=
#>
#> AUC n n_pos n_neg
#> 0.8058 392 130 262
#>
#> optimal_cutpoint sum_sens_spec acc sensitivity specificity tp fn fp tn
#> 128 1.5055 0.7628 0.7231 0.7824 94 36 57 205
#>
#> Predictor summary:
#> Data Min. 5% 1st Qu. Median Mean 3rd Qu. 95% Max. SD NAs
#> Overall 56 81.0 99.00 119.0 122.6276 143.00 181.00 198 30.86078 0
#> neg 56 79.0 94.00 107.5 111.4313 126.00 154.00 197 24.64213 0
#> pos 78 95.9 124.25 144.5 145.1923 171.75 188.55 198 29.83939 0
res_unnested <- cp %>%
unnest(cols = roc_curve)
annotation <- paste0("AUC: ", round(cp$AUC, 2), "\n",
"Cutpoint: ", round(cp$optimal_cutpoint, 2))
ggplot(res_unnested, aes(x = 1 - tnr, y = tpr)) +
xlab("1 - Specificity") +
ylab("Sensitivity") +
theme_bw() +
theme(aspect.ratio = 1) +
geom_line(color = "red") +
geom_vline(xintercept = 1 - cp$specificity, linetype = "dotted") +
geom_hline(yintercept = cp$sensitivity, linetype = "dotted") +
annotate("text", x = 0.85, y = 0.05, label = annotation) +
ggtitle("ROC curve", "Using glucose mg/dL as a predictive logistic variable for diabetes") +
geom_abline(intercept = 0, slope = 1, linetype = 2)
ROC(form = diabetes ~ glucose + age + mass, data=Diabetes, plot = "ROC", MX = T)
I have tried to add more parameters to cutpointr which was unsuccessful. I have also tried to run with Epi and saw a better AUC with age and mass included. I have also run a glm but I am just not sure how to properly analyze the glm for this type of information. Looking on the tidymodels website for help while waiting for suggestions, thanks!

How can I create a new column in R with a count of observations in another column in the dataset?

I have a dataset in R with information about individuals and diagnoses. The variables are group, age, weight, id and diagnosis. So an individual can have one row with X in diagnosis (meaning no diagnosis) or one or more rows with diagnoses. Now I want to make a new variable with the number of diagnoses each individual got so that each individual has one row in the dataset with the variables group, age, weight, id and number of diagnoses. In this new column with diagnosis I want individuals with no diagnosis to get the number 0, with one diagnosis the number 1, with two diagnoses the number 2 and etcetera. Can anyone help me?
I am using R. I tried to use group_by and count but I can not get the number 0 for individuals with no diagnosis (X in the diagnosis column) and I can not see the other variables like group, age and weight.
Here is the data:
pr <- read_csv("~/Desktop/Data.csv")
head(pr)
Data
dput(pr)
structure(list(GROUP = c(1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3,
3, 3, 3, 3, 4, 4, 4), AGE = c(23, 34, 61, 23, 45, 34, 34, 55,
56, 43, 56, 49, 61, 49, 74, 49, 51, 46, 75), WEIGHT = c(56, 72,
70, 56, 101, 72, 72, 62, 60, 78, 60, 55, 79, 55, 89, 55, 67,
60, 105), ID = c(4, 1, 2, 4, 3, 1, 1, 5, 7, 6, 7, 8, 9, 8, 10,
8, 11, 12, 13), DIAGNOSIS = c("J01", "J01", "X", "J01", "J01",
"J01", "J01", "J01", "J01", "J01", "J01", "J01", "X", "J01",
"J01", "J01", "X", "J01", "J01")), class = c("spec_tbl_df", "tbl_df",
"tbl", "data.frame"), row.names = c(NA, -19L), spec = structure(list(
cols = list(GROUP = structure(list(), class = c("collector_double",
"collector")), AGE = structure(list(), class = c("collector_double",
"collector")), WEIGHT = structure(list(), class = c("collector_double",
"collector")), ID = structure(list(), class = c("collector_double",
"collector")), DIAGNOSIS = structure(list(), class = c("collector_character",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1), class = "col_spec"))
Picture of the desired output:
Desired output
One way to approach this is to group_by multiple columns, if the information is repeated for a given individual (which it does in this example). You will get these columns in your results in the end. Also, you can summarise where the DIAGNOSIS is not "X" - instead of count, so that you will get zero for cases where DIAGNOSIS is "X".
library(dplyr)
pr %>%
group_by(GROUP, ID, AGE, WEIGHT) %>%
summarise(NUMBER = sum(DIAGNOSIS != "X"))
Output
GROUP ID AGE WEIGHT NUMBER
<dbl> <dbl> <dbl> <dbl> <int>
1 1 1 34 72 3
2 1 2 61 70 0
3 1 3 45 101 1
4 1 4 23 56 2
5 2 5 55 62 1
6 2 6 43 78 1
7 2 7 56 60 2
8 3 8 49 55 3
9 3 9 61 79 0
10 3 10 74 89 1
11 4 11 51 67 0
12 4 12 46 60 1
13 4 13 75 105 1

How to create before and after scores in two different columns based on date?

I have two tables first table has stress score recorded at various time points and second table has date of treatment. I want to get the stress scores before and after treatment for each participant who has received the treatment. Also I want a column that gives information on when was the stress score recorded before and after treatment. I do not understand from where do I begin,and what should my code look like.
score.dt = data.table(
participant.index = c(1, 1, 1, 3, 4, 4, 13, 21, 21, 25, 37, 40, 41, 41, 41, 43, 43, 43, 44),
repeat.instance = c(2, 3, 6, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 2, 3, 1, 2, 3, 1),
date.recorded = c(
'2017-07-13',
'2017-06-26',
'2018-09-17',
'2016-04-14',
'2014-03-24',
'2016-05-30',
'2018-06-20',
'2014-08-03',
'2015-07-06',
'2014-12-17',
'2014-09-05',
'2013-06-10',
'2015-10-04',
'2016-11-04',
'2016-04-18',
'2014-02-13',
'2013-05-24',
'2014-09-10',
'2014-11-25'
),
subscale = c(
"stress",
"stress",
"stress",
"stress",
"stress",
"stress",
"stress",
"stress",
"stress",
"stress",
"stress",
"stress",
"stress",
"stress",
"stress",
"stress",
"stress",
"stress",
"stress"
),
score = c(18, 10, 18, 36, 16, 30, 28, 10, 12, 40, 16, 12, 10, 14, 6, 32, 42, 26, 18)
)
date.treatment.dt = data.table (
participant.index = c(1, 4, 5, 6, 8, 10, 11, 12, 14, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26),
date.treatment = c(
'2018 - 06 - 27',
'2001 - 07 - 16',
'2009 - 12 - 09',
'2009 - 05 - 20',
'2009 - 07 - 22',
'2008-07 - 02',
'2009 - 11 - 25',
'2009 - 09 - 16',
'1991 - 07 - 30',
'2016 - 05 - 25',
'2012 - 07 - 25',
'2007 - 03 - 19',
'2012 - 01 - 25',
'2011 - 09 - 21',
'2000 - 03 - 06',
'2001 - 09 - 25',
'1999 - 12 - 20',
'1997 -07 - 28',
'2002 - 03 - 12',
'2008 - 01 - 23'
))
Desired output columns: is something like this
score.date.dt = c("candidate.index.x", "repeat.instance", "subscale", "score", "date.treatment", "date.recorded", "score.before.treatment", "score.after.treatment", "months.before.treatment", "months.after.treatment")
Here the columns months.before.treatment indicates how many months before treatment the stress score was measured and month.after.treatment indicates how many months after treatment the stress score was measured.
In your example set, you only have four individuals with stress scores that have any rows in the treatment table (participants 1,4,21,and 25). Only one of these, participant 1, has both a pre-treatment stress measures and post-treatment stress measure...
Here is one way to produce the information you need:
inner_join(score.dt,date.treatment.dt, by="participant.index") %>%
group_by(participant.index, date.treatment) %>%
summarize(pre_treatment = min(date.recorded[date.recorded<=date.treatment]),
post_treatment = max(date.recorded[date.recorded>=date.treatment])) %>%
pivot_longer(cols = -(participant.index:date.treatment), names_to = "period", values_to = "date.recorded") %>%
left_join(score.dt, by=c("participant.index", "date.recorded" )) %>%
mutate(period=str_extract(period,".*(?=_)"),
months = abs(as.numeric(date.treatment-date.recorded))/(365.25/12)) %>%
pivot_wider(id_cols = participant.index:date.treatment, names_from = period, values_from=c(date.recorded, subscale, months,score))
Output:
participant.index date.treatment date.recorded_pre date.recorded_post subscale_pre subscale_post months_pre months_post score_pre score_post
<dbl> <date> <date> <date> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 1 2018-06-27 2017-06-26 2018-09-17 stress stress 12.0 2.69 10 18
2 4 2001-07-16 NA 2016-05-30 NA stress Inf 178. NA 30
3 21 2000-03-06 NA 2015-07-06 NA stress Inf 184. NA 12
4 25 2002-03-12 NA 2014-12-17 NA stress Inf 153. NA 40
Note: you will have to fix the date inputs to the two source files, like this:
# first correct, your date.treatment column, and convert to date
date.treatment.dt[, date.treatment := as.Date(str_replace_all(date.treatment," ",""), "%Y-%m-%d")]
# second, similarly fix the date column in your stress score table
score.dt[,date.recorded := as.Date(date.recorded,"%Y-%m-%d")]
It seems like there are a few parts to what you're asking. First, you need to merge the two tables together. Here I use dplyr::inner_join() which automatically detects that the candidate.index is the only column in common and merges on that while discarding records found in only one of the tables. Second, we convert to a date format for both dates to enable the calculation of elapsed months.
library(tidyverse)
library(data.table)
library(lubridate)
score.dt <- structure(list(participant.index = c(1, 1, 1, 3, 4, 4, 13, 21, 21, 25, 37, 40, 41, 41, 41, 43, 43, 43, 44), repeat.instance = c(2, 3, 6, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 2, 3, 1, 2, 3, 1), date.recorded = c("2017-07-13", "2017-06-26", "2018-09-17", "2016-04-14", "2014-03-24", "2016-05-30", "2018-06-20", "2014-08-03", "2015-07-06", "2014-12-17", "2014-09-05", "2013-06-10", "2015-10-04", "2016-11-04", "2016-04-18", "2014-02-13", "2013-05-24", "2014-09-10", "2014-11-25"), subscale = c("stress", "stress", "stress", "stress", "stress", "stress", "stress", "stress", "stress", "stress", "stress", "stress", "stress", "stress", "stress", "stress", "stress", "stress", "stress"), score = c(18, 10, 18, 36, 16, 30, 28, 10, 12, 40, 16, 12, 10, 14, 6, 32, 42, 26, 18)), row.names = c(NA, -19L), class = c("data.table", "data.frame"))
date.treatment.dt <- structure(list(participant.index = c(1, 4, 5, 6, 8, 10, 11, 12, 14, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26), date.treatment = c("2018 - 06 - 27", "2001 - 07 - 16", "2009 - 12 - 09", "2009 - 05 - 20", "2009 - 07 - 22", "2008-07 - 02", "2009 - 11 - 25", "2009 - 09 - 16", "1991 - 07 - 30", "2016 - 05 - 25", "2012 - 07 - 25", "2007 - 03 - 19", "2012 - 01 - 25", "2011 - 09 - 21", "2000 - 03 - 06", "2001 - 09 - 25", "1999 - 12 - 20", "1997 -07 - 28", "2002 - 03 - 12", "2008 - 01 - 23")), row.names = c(NA, -20L), class = c("data.table", "data.frame"))
inner_join(date.treatment.dt, score.dt) %>%
mutate(across(contains("date"), as_date)) %>%
mutate(months.after = interval(date.treatment, date.recorded) %/% months(1)) %>%
mutate(months.before = 0 - months.after)
#> Joining, by = "participant.index"
#> participant.index date.treatment repeat.instance date.recorded subscale
#> 1: 1 2018-06-27 2 2017-07-13 stress
#> 2: 1 2018-06-27 3 2017-06-26 stress
#> 3: 1 2018-06-27 6 2018-09-17 stress
#> 4: 4 2001-07-16 1 2014-03-24 stress
#> 5: 4 2001-07-16 2 2016-05-30 stress
#> 6: 21 2000-03-06 1 2014-08-03 stress
#> 7: 21 2000-03-06 2 2015-07-06 stress
#> 8: 25 2002-03-12 1 2014-12-17 stress
#> score months.after months.before
#> 1: 18 -11 11
#> 2: 10 -12 12
#> 3: 18 2 -2
#> 4: 16 152 -152
#> 5: 30 178 -178
#> 6: 10 172 -172
#> 7: 12 184 -184
#> 8: 40 153 -153
Created on 2022-04-05 by the reprex package (v2.0.1)

Data cleaning, from cross-sectional (multiple files) to panel in RStudio: merge/gather?

I have yearly observations for individuals on different variables from 2008-2020. I have data on family (25 variables), income (15 variables), and schooling (22 variables).
Right now, have 'cleaned' every single dataset so that every column of every category has the same column name. For context, this is what my R looks like now.
The thing is, I would like to have one big dataset with all of the individuals and years in one dataframe. I know that I should/could use the innerjoin or merge function first of all sorting by 'Householdmember', and that I could use the gather function, but I am truly struggling in what order I should do this and where I should start. I've been trying a lot of things, but considering the number of dataframes, it's hard to keep track of what I'm doing. I also created lists of every category for every year because this was recommended in one method, but that did not work out...
I want to end up with a dataframe that looks similar to this:
Individual
Year
Var1
Var2
1
2008
value
value
1
2009
value
value
1
2010
value
value
2
2008
value
value
2
2009
value
value
2
2010
value
value
What I should do as first step... If I merge the dataframes, I don't think R knows which values correspond to which year...
> head(fam08)
# A tibble: 6 x 25
HouseholdMember RandomChild YearBirthRandom Gender Age FatherBirth FatherAlive MotherBirth MotherAlive Divorce SeeFather SeeMother
<dbl> <dbl+lbl> <dbl> <dbl+l> <dbl> <dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl+l> <dbl+lbl> <dbl+lbl>
1 800033 16 [not ap… NA 1 [mal… 16 1952 1 [yes] 1961 1 [yes] 1 [yes] 7 [ever… 7 [ever…
2 800042 16 [not ap… NA 2 [fem… 32 1946 1 [yes] 1948 1 [yes] 2 [no] 4 [at l… 4 [at l…
3 800045 16 [not ap… NA 1 [mal… 65 1913 2 [no] 1915 2 [no] 2 [no] NA NA
4 800057 16 [not ap… NA 1 [mal… 33 1939 1 [yes] 1945 1 [yes] 1 [yes] 4 [at l… 4 [at l…
5 800076 16 [not ap… NA 2 [fem… 22 1955 1 [yes] 1955 1 [yes] 1 [yes] 5 [at l… 3 [a fe…
6 800119 16 [not ap… NA 2 [fem… 57 1908 2 [no] 1918 2 [no] 2 [no] NA NA
# … with 13 more variables: Married <dbl+lbl>, Child <dbl+lbl>, NumChild <dbl>, SchoolCH1 <dbl+lbl>, SchoolCH2 <dbl+lbl>,
# SchoolCH3 <dbl+lbl>, SchoolCH4 <dbl+lbl>, BirthCH1 <dbl>, BirthCH2 <dbl>, BirthCH3 <dbl>, BirthCH4 <dbl>, FamSatisfaction <dbl+lbl>,
# Year <dbl>
> head(fam09)
# A tibble: 6 x 25
HouseholdMember RandomChild YearBirthRandom Gender Age FatherBirth FatherAlive MotherBirth MotherAlive Divorce SeeFather SeeMother
<dbl> <dbl+lbl> <dbl> <dbl+l> <dbl> <dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl+l> <dbl+lbl> <dbl+lbl>
1 800033 16 [not ap… NA 1 [mal… 17 1952 1 [yes] 1961 1 [yes] NA 5 [at l… 7 [ever…
2 800042 16 [not ap… NA 2 [fem… 33 1946 1 [yes] 1948 1 [yes] NA 4 [at l… 4 [at l…
3 800057 16 [not ap… NA 1 [mal… 34 1939 1 [yes] 1945 1 [yes] NA 3 [a fe… 3 [a fe…
4 800076 16 [not ap… NA 2 [fem… 23 1955 1 [yes] 1955 1 [yes] NA 5 [at l… 3 [a fe…
5 800119 16 [not ap… NA 2 [fem… 58 NA NA NA NA NA NA NA
6 800125 16 [not ap… NA 2 [fem… 50 NA NA 1928 1 [yes] NA NA 1 [neve…
# … with 13 more variables: Married <dbl+lbl>, Child <dbl+lbl>, NumChild <dbl>, SchoolCH1 <dbl+lbl>, SchoolCH2 <dbl+lbl>,
# SchoolCH3 <dbl+lbl>, SchoolCH4 <dbl+lbl>, BirthCH1 <dbl>, BirthCH2 <dbl>, BirthCH3 <dbl>, BirthCH4 <dbl>, FamSatisfaction <dbl+lbl>,
# Year <dbl>
dput(head(fam09,10))
structure(list(HouseholdMember = c(800033, 800042, 800057, 800076,
800119, 800125, 800170, 800186, 800201, 800204), RandomChild = structure(c(16,
16, 16, 16, 16, 16, 3, 16, 16, 16), label = "Randomly chosen child", labels = c(`child 1` = 1,
`child 2` = 2, `child 3` = 3, `child 4` = 4, `child 5` = 5, `child 6` = 6,
`child 7` = 7, `child 8` = 8, `child 9` = 9, `child 10` = 10,
`child 11` = 11, `child 12` = 12, `child 13` = 13, `child 14` = 14,
`child 15` = 15, `not applicable` = 16), class = "haven_labelled"),
YearBirthRandom = c(NA, NA, NA, NA, NA, NA, 1999, NA, NA,
NA), Gender = structure(c(1, 2, 1, 2, 2, 2, 2, 2, 1, 1), label = "Gender respondent", labels = c(male = 1,
female = 2), class = "haven_labelled"), Age = c(17, 33, 34,
23, 58, 50, 50, 69, 35, 67), FatherBirth = structure(c(1952,
1946, 1939, 1955, NA, NA, 1926, NA, 1948, NA), label = "What is the year of birth of your father?", labels = c(`I don't know` = 99999), class = "haven_labelled"),
FatherAlive = structure(c(1, 1, 1, 1, NA, NA, 1, NA, 1, NA
), label = "Is your father still alive?", labels = c(yes = 1,
no = 2, `I don't know` = 99), class = "haven_labelled"),
MotherBirth = structure(c(1961, 1948, 1945, 1955, NA, 1928,
1931, NA, 1950, NA), label = "What is the year of birth of your mother?", labels = c(`I don't know` = 99999), class = "haven_labelled"),
MotherAlive = structure(c(1, 1, 1, 1, NA, 1, 1, NA, 1, NA
), label = "Is your mother still alive?", labels = c(yes = 1,
no = 2, `I don't know` = 99), class = "haven_labelled"),
Divorce = structure(c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_
), label = "Did your own parents ever divorce?", labels = c(yes = 1,
no = 2, `my parents never had a relationship` = 3, `I don't know` = 99
), class = "haven_labelled"), SeeFather = structure(c(5,
4, 3, 5, NA, NA, 6, NA, 3, NA), label = "How often did you see your father over the past 12 months?", labels = c(never = 1,
once = 2, `a few times` = 3, `at least every month` = 4,
`at least every week` = 5, `a few times per week` = 6, `every day` = 7
), class = "haven_labelled"), SeeMother = structure(c(7,
4, 3, 3, NA, 1, 6, NA, 3, NA), label = "How often did you see your mother over the past 12 months?", labels = c(never = 1,
once = 2, `a few times` = 3, `at least every month` = 4,
`at least every week` = 5, `a few times per week` = 6, `every day` = 7
), class = "haven_labelled"), Married = structure(c(NA, 1,
2, 2, 1, 2, 1, 1, 1, 1), label = "Are you married to this partner?", labels = c(yes = 1,
no = 2), class = "haven_labelled"), Child = structure(c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_), label = "Have you had any children?", labels = c(yes = 1,
no = 2), class = "haven_labelled"), NumChild = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_), SchoolCH1 = structure(c(NA,
NA, NA, NA, NA, NA, 4, NA, NA, NA), label = "What school does child 1 (born in the years 1991 through 2004) attend?", labels = c(`primary school` = 1,
`school for special primary education` = 2, `secondary school` = 3,
other = 4), class = "haven_labelled"), SchoolCH2 = structure(c(NA,
NA, NA, NA, NA, NA, 3, NA, NA, NA), label = "What school does child 2 (born in the years 1991 through 2004) attend?", labels = c(`primary school` = 1,
`school for special primary education` = 2, `secondary school` = 3,
other = 4), class = "haven_labelled"), SchoolCH3 = structure(c(NA,
NA, NA, NA, NA, NA, 1, NA, NA, NA), label = "What school does child 3 (born in the years 1991 through 2004) attend?", labels = c(`primary school` = 1,
`school for special primary education` = 2, `secondary school` = 3,
other = 4), class = "haven_labelled"), SchoolCH4 = structure(c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_), label = "What school does child 4 (born in the years 1991 through 2004) attend?", labels = c(`primary school` = 1,
`school for special primary education` = 2, `secondary school` = 3,
other = 4), class = "haven_labelled"), BirthCH1 = c(NA, 2005,
2007, NA, 1983, NA, 1991, 1964, NA, 1974), BirthCH2 = c(NA,
2007, NA, NA, 1985, NA, 1994, 1966, NA, 1976), BirthCH3 = c(NA,
NA, NA, NA, NA, NA, 1999, 1970, NA, NA), BirthCH4 = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_), FamSatisfaction = structure(c(NA,
8, 9, NA, 8, NA, 8, NA, NA, NA), label = "How satisfied are you with your family life?", labels = c(`entirely dissatisfied` = 0,
`entirely satisfied` = 10, `I don’t know` = 999), class = "haven_labelled"),
Year = c(2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009,
2009, 2009)), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame"))
I believe you could do something along these lines:
fam = bind_rows(fam_list)
inc = bind_rows(inc_list)
ws = bind_rows(ws_list)
result = fam %>%
left_join(inc, by=c("HouseholdMember", "Year")) %>%
left_join(ws, by=c("HouseholdMember", "Year"))
Output:
HouseholdMember Year fam_v1 fam_v2 fam_v3 inc_v1 inc_v2 inc_v3 ws_v1 ws_v2 ws_v3
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 8001 2008 0.609 -0.253 -1.30 0.0147 0.719 -0.765 0.120 0.974 -0.764
2 8002 2008 0.395 1.73 -0.503 0.119 -3.33 -0.798 0.325 0.664 1.65
3 8003 2008 0.562 0.157 0.243 -1.18 -0.260 0.105 1.09 0.855 1.19
4 8004 2008 1.32 0.737 -1.18 0.725 -1.82 0.356 0.362 2.04 1.76
5 8005 2008 -0.497 -0.444 -0.632 -0.534 1.63 0.984 1.29 0.614 0.576
6 8006 2008 -1.70 -0.989 -1.32 0.868 0.0979 0.468 -0.0146 1.11 0.957
7 8007 2008 -2.19 -0.419 1.69 1.34 -0.404 -1.43 -0.156 0.648 -0.186
8 8008 2008 1.48 0.350 -0.595 0.785 -0.609 1.28 -1.01 1.04 0.845
9 8009 2008 -0.315 -0.530 0.419 0.390 -0.0951 -0.755 0.135 0.696 -1.97
10 8010 2008 -0.882 1.38 2.06 -0.0757 1.53 -0.494 -1.03 1.14 1.87
Note:
I manufactured the data for this example by creating a lists of tibbles; I believe the fam_list, inc_list, and ws_list are similar to the list objects in your image. These are list of data frames / tibbles. I then use bind_rows to bind these similar structure tibbles together so that I have a three large tibbles.
I then use left_join twice to join inc and ws to fam
Input Data:
library(tidyverse)
fam_list = lapply(8:20, function(x) {
tibble(HouseholdMember = c(8000+seq(1:100)),
Year=2000+x,
fam_v1=rnorm(100),
fam_v2=rnorm(100),
fam_v3=rnorm(100)
)
})
names(fam_list) = paste0("fam_20", 8:20)
inc_list = lapply(8:20, function(x) {
tibble(HouseholdMember = c(8000+seq(1:100)),
Year=2000+x,
inc_v1=rnorm(100),
inc_v2=rnorm(100),
inc_v3=rnorm(100)
)
})
names(inc_list) = paste0("inc_20", 8:20)
ws_list = lapply(8:20, function(x) {
tibble(HouseholdMember = c(8000+seq(1:100)),
Year=2000+x,
ws_v1=rnorm(100),
ws_v2=rnorm(100),
ws_v3=rnorm(100)
)
})
names(ws_list) = paste0("ws_20", 8:20)
Input

Multiple Variables (in columns), multiple Years(in columns) to reshape to flatfile in R

I have data in the following format, with Variables, data by years and where A, B, C, D are the row id's.
Variable 1 blank column Variable 2
2008 2009 2010 2011 2008 2009 2010 2011
A 1 5 9 13 5 10 15 20
B 2 6 10 14 25 30 35 40
C 3 7 11 15 45 50 55 60
D 4 8 12 16 65 70 75 80
I would like to get it in this format:
Variable Year Data
A Variable1 2008 1
A Variable1 2009 5
.....
.....
D Variable2 2010 75
D Variable2 2011 80
I thought of using gather from library(tidyr) but I cant figure out how to do it. Sorry do not have a reproducible example.
structure(list(X1 = c(NA, "A", "B", "C", "D"), Variable1 = c(2008,
1, 2, 3, 4), X3 = c(2009, 5, 6, 7, 8), X4 = c(2010, 9, 10, 11,
12), X5 = c(2011, 13, 14, 15, 16), Variable1 = c(2008, 5, 25,
45, 65), X7 = c(2009, 10, 30, 50, 70), X8 = c(2010, 15, 35, 55,
75), X9 = c(2011, 20, 40, 60, 80)), .Names = c("X1", "Variable1",
"X3", "X4", "X5", "Variable1", "X7", "X8", "X9"), row.names = c(NA,
5L), class = "data.frame")
library(tidyverse)
names(df) <- c("row_name",
paste(c(t(replicate(4, names(df)[1 + seq(1, length.out=floor(length(names(df))/4), by=4)]))),
df[1,-1],
sep="_"))
df[-1,] %>%
gather(Variable_Year, Data, -row_name) %>%
separate(Variable_Year, into=c("Variable", "Year"), sep="_") %>%
arrange(row_name, Variable, Year)
Note that you can't have non-unique values as "row names" of a dataframe so you may need to think of an alternative way to handle below row_name column.
Output is:
row_name Variable Year Data
1 A Variable1 2008 1
2 A Variable1 2009 5
...
31 D Variable2 2010 75
32 D Variable2 2011 80
Sample data:
df -> structure(list(row_name = c(NA, "A", "B", "C", "D"), Variable1_2008 = c(2008,
1, 2, 3, 4), Variable1_2009 = c(2009, 5, 6, 7, 8), Variable1_2010 = c(2010,
9, 10, 11, 12), Variable1_2011 = c(2011, 13, 14, 15, 16), Variable2_2008 = c(2008,
5, 25, 45, 65), Variable2_2009 = c(2009, 10, 30, 50, 70), Variable2_2010 = c(2010,
15, 35, 55, 75), Variable2_2011 = c(2011, 20, 40, 60, 80)), .Names = c("row_name",
"Variable1_2008", "Variable1_2009", "Variable1_2010", "Variable1_2011",
"Variable2_2008", "Variable2_2009", "Variable2_2010", "Variable2_2011"
), row.names = c(NA, 5L), class = "data.frame")

Resources