Impute missing values with average of previous 13 values - r

I have a dataset with few missing observations. My objective is to impute the missing value in each variable with the average of previous 13 values. In case there is a missing value before the 13th observation, the average of whatever there before should be used for imputing that variable. I am not sure how to do it.
Please use the below to replicate my dataset. Your help is much appreciated.
df1 <- structure(list(V1 = c(276.12, 53.4, 20.64, 181.8, 216.96, 10.44,
69, 144.24, 10.32, 239.76, 79.32, 257.64, 28.56, 117, 244.92,
234.48, NA, 337.68, 83.04, 176.76, 262.08, 284.88, 15.84, NA,
74.76, 315.48, 171.48, 288.12, 298.56, 84.72, 351.48, 135.48,
NA, 318.72, 114.84, 348.84, 320.28, 89.64, 51.72, 273.6, 243,
212.4, 352.32, 248.28, NA, 210.12, 107.64, 287.88, 272.64, 80.28,
239.76, 120.48, 259.68, 219.12, 315.24, 238.68, 8.76, 163.44,
252.96), V2 = c(45.36, 47.16, 55.08, 49.56, 12.96, 58.68, 39.36,
NA, 2.52, 3.12, 6.96, 28.8, NA, 9.12, 39.48, 57.24, 43.92, 47.52,
24.6, 28.68, 33.24, 6.12, 19.08, 20.28, 15.12, 4.2, 35.16, NA,
32.52, 19.2, 33.96, 20.88, 1.8, 24, 1.68, NA, 52.56, 59.28, 32.04,
45.24, 26.76, 40.08, 33.24, 10.08, 30.84, 27, 11.88, 49.8, 18.96,
14.04, 3.72, 11.52, 50.04, 55.44, 34.56, NA, 33.72, 23.04, 59.52
)), class = "data.frame", row.names = c(NA, -59L))

You can use zoo::rollapply to compute the mean over the 13 values:
mean13 = zoo::rollapply(
df1$V1,
13,
function(x) {
mean(na.omit(x))
},
align = "right",
fill = NA,
partial = TRUE
)
df1$V1_prev_mean = c(df1$V1[1], head(mean13, -1))
df1$V1 = ifelse(is.na(df1$V1), df1$V1_prev_mean, df1$V1)
Output:
V1 V2 V1_prev_mean
1 276.1200 45.36 276.1200
2 53.4000 47.16 276.1200
3 20.6400 55.08 164.7600
4 181.8000 49.56 116.7200
5 216.9600 12.96 132.9900
6 10.4400 58.68 149.7840
7 69.0000 39.36 126.5600
8 144.2400 NA 118.3371
9 10.3200 2.52 121.5750
10 239.7600 3.12 109.2133
11 79.3200 6.96 122.2680
12 257.6400 28.80 118.3636
13 28.5600 NA 129.9700
14 117.0000 9.12 122.1692
15 244.9200 39.48 109.9292
16 234.4800 57.24 124.6615
17 141.1108 43.92 141.1108 # <- this row filled
18 337.6800 47.52 137.7200
19 83.0400 24.60 147.7800
20 176.7600 28.68 153.8300

Related

Avoid hard-coding with pivot longer to pivot multiple columns at once

I want to pivot_long() multiple columns of the dataset below avoiding hard-coding. I've seen some similar questions, but I still cannot do it.
Wide data:
> head(data)
ID IND_TEST_SCORE ARG_G1_ABC NARR_G1_ABC ARG_G1_EF NARR_G1_EF ARG_G2_ABC NARR_G2_ABC
1 PART_1 100 68.53 71.32 4.94 3.42 64.90 64.25
2 PART_2 36 65.90 NA 6.55 NA 63.80 59.00
3 PART_3 32 69.78 NA 2.44 NA 71.73 NA
4 PART_4 96 68.29 67.83 3.00 3.17 67.67 67.88
5 PART_5 11 NaN NaN NaN NaN NA 67.08
6 PART_6 12 69.50 71.60 3.25 2.50 NA NA
ARG_G2_EF NARR_G2_EF
1 7.10 5.08
2 7.40 7.00
3 1.09 NA
4 3.67 1.76
5 NA 3.00
6 NA NA
Desired output:
ID IND_TEST_SCORE ABC EF GROUP TYPE
1 PART_1 100 G1 ARG
1 PART_1 100 G1 NARR
1 PART_1 100 G2 ARG
1 PART_1 100 G2 NARR
2 PART_2 36 G1 ARG
2 PART_2 36 G1 NARR
2 PART_2 36 G2 ARG
2 PART_2 36 G2 NARR
so on...
Questions: how can I:
Create a new column called "GROUP" with 'G1' and G2' values
Create a new column called "TYPE" with 'ARG' and NARR' values
Create 2 new columns, one for "ABC" values and another for "EF" values
without hard-coding it? I'd like to work with patterns...Thanks in advance!
My attempt so far:
# create a single "my_names" columns and work on it:
dataLong <- data %>%
pivot_longer(cols = c(-ID, -IND_TEST_SCORE),
names_to = "my_names",
values_to = "my_values") %>%
mutate(GROUP = case_when(my_names == "ARG_G1_ABC" ~ "G1",
my_names == "ARG_G1_ABC" ~ "G2",
my_names == "ARG_G1_EF" ~ "G1",
my_names == "ARG_G2_EF" ~ "G2",
my_names == "NARR_G1_ABC" ~ "G1",
my_names == "NARR_G1_ABC" ~ "G2",
my_names == "NARR_G1_EF" ~ "G1",
my_names == "NARR_G2_EF" ~ "G2")) %>%
mutate(TYPE = case_when(my_names == "ARG_G1_ABC" ~ "ARG",
my_names == "ARG_G2_ABC" ~ "ARG",
my_names == "ARG_G1_EF" ~ "ARG",
my_names == "ARG_G2_EF" ~ "ARG",
my_names == "NARR_G1_ABC" ~ "NARR",
my_names == "NARR_G2_ABC" ~ "NARR",
my_names == "NARR_G1_EF" ~ "NARR",
my_names == "NARR_G2_EF" ~ "NARR"))
Dataset:
> dput(data)
structure(list(ID = structure(c("PART_1", "PART_2", "PART_3",
"PART_4", "PART_5", "PART_6", "PART_7", "PART_8", "PART_9", "PART_10",
"PART_11", "PART_12", "PART_13", "PART_14", "PART_15", "PART_16",
"PART_17", "PART_18", "PART_19", "PART_20", "PART_21", "PART_22",
"PART_23", "PART_24", "PART_25", "PART_26", "PART_27", "PART_28",
"PART_29", "PART_30", "PART_31", "PART_32", "PART_33", "PART_34",
"PART_35", "PART_36", "PART_37", "PART_38", "PART_39", "PART_40",
"PART_41", "PART_42", "PART_43", "PART_44", "PART_45", "PART_46",
"PART_47", "PART_48", "PART_49", "PART_50", "PART_51", "PART_52",
"PART_53", "PART_54", "PART_55", "PART_56", "PART_57", "PART_58",
"PART_59", "PART_60", "PART_61", "PART_62", "PART_63", "PART_64",
"PART_65", "PART_66", "PART_67", "PART_68", "PART_69", "PART_70",
"PART_71"), class = c("glue", "character")), IND_TEST_SCORE = c(100,
36, 32, 96, 11, 12, 32, 72, 100, 64, 2, 19, 99, 86, 60, 108,
95, 35, 60, 9, 78, 61, 61, 67, 105, 99, 51, 21, 65, 30, 0.9,
77, 54, 14, 103, 48, 0.7, 2, 39, 94, 80, 8, 30, 103, 113, 91,
59, 56, 86, 99, 72, 34, 32, 6, 44, 99, 65, 98, 110, 102, 87,
50, 89, 36, 93, 8, 11, 78, 48, 77, 4), ARG_G1_ABC = c(68.53,
65.9, 69.78, 68.29, NaN, 69.5, 67.05, 73.74, 73.59, 72.57, 64.33,
67.79, 72.94, 63.75, 71.56, 75.5, 68.16, NA, 65.64, 68.36, 69.75,
72.73, 67.67, 66.19, 62.94, 72.48, 72.19, 62.44, 72.5, 71.06,
70.4, 69.14, NA, 67.59, 69.1, 74.05, NA, 68.6, 68.27, 59.12,
NA, NA, 63.7, 67.18, NA, 68.38, 63.44, 72.56, 66.06, 66.53, 73.19,
NA, NA, NA, 73.44, 67.45, 72.91, 65.81, 73.96, 75, 75.89, 72,
NA, 68.2, 67.29, 69.91, NaN, 69.67, 68.39, 69.2, 67.55), NARR_G1_ABC = c(71.32,
NA, NA, 67.83, NaN, 71.6, 64.2, 71.68, 73.29, 70.53, 73.35, 59.31,
71.08, 74.06, 68.7, 74, 69.08, NA, 68.52, 63.47, 68.33, NA, 65.64,
62.11, 63.9, 70.41, 60.36, 65.88, 68.81, 69.62, 70.68, 67.5,
NA, 68.45, 67.16, 74.39, 60.6, 65.89, 71.94, 68.75, NA, NA, 67,
66.85, NA, NA, 62.56, 73.33, 69.81, 67.68, 73.06, 65.8, 63.85,
NA, 67.64, 71.6, 68.47, 69.39, 71.16, 72.33, NA, 66.68, NA, 66.22,
67, 61.27, NaN, 72.33, 68.29, 71.33, 65.57), ARG_G1_EF = c(4.94,
6.55, 2.44, 3, NaN, 3.25, 4.71, 2.84, 1.07, 2, 5.33, 5.43, 1.72,
10.55, 3, 1.17, 5.8, NA, 10.55, 4.21, 2.94, 3.55, 6.33, 8.25,
5.88, 2, 3.44, 9.22, 1.69, 4.18, 2.5, 4.71, NA, 4.41, 5.9, 2.21,
NA, 6.67, 3.33, 7, NA, NA, 8, 4.76, NA, 4.44, 2.68, 3.16, 4.94,
5.42, 2.81, NA, NA, NA, 1.78, 6.09, 2.52, 6.56, 1.96, 1.12, 0.67,
3.78, NA, 3.5, 3.65, 5.27, NaN, 4.33, 6.78, 3.6, 4.35), NARR_G1_EF = c(3.42,
NA, NA, 3.17, NaN, 2.5, 3.29, 1.64, 1.07, 6, 1.41, 9.25, 3.25,
2.69, 3.8, 1.32, 3.04, NA, 2.38, 5.18, 2.38, NA, 6.18, 6.11,
6.4, 1.85, 7.45, 3.69, 1.89, 3.25, 1.6, 4.8, NA, 2.8, 4.32, 2.3,
6.6, 7.42, 2.83, 4.75, NA, NA, 5, 4.75, NA, NA, 8, 1.71, 2.67,
2.05, 1.47, 4.8, 7.96, NA, 4.43, 3.8, 4.47, 4.91, 1.68, 2.78,
NA, 6.58, NA, 6.67, 6, 5.18, NaN, 1.67, 4.86, 2.08, 4.38), ARG_G2_ABC = c(64.9,
63.8, 71.73, 67.67, NA, NA, 52.5, 72.35, 65.28, 57.22, NA, NaN,
69, 66.67, NaN, 66.58, 69, 60.55, 56.29, 67.45, 68.4, 64.25,
NaN, 50.86, 67.83, 65.96, 57, 53.07, 66.89, NaN, NA, 59, 61.5,
NA, 65.9, 64.07, NA, NA, 57.91, 67.89, 68.75, 68.5, NaN, 63.24,
66.19, 60.59, 59.24, 54.33, 64.39, 65.83, 65.71, 63, 63.78, 63.62,
64, 65.08, NA, 67.61, 67.57, 72.71, 65.46, 61.71, NA, 57.62,
NA, NA, NA, 64, 61.33, 62.64, NA), NARR_G2_ABC = c(64.25, 59,
NA, 67.88, 67.08, NA, 60.75, 64.42, 71.17, 58.42, NA, 49.8, 63.36,
65.2, NaN, 70.2, 62.85, NaN, 61.6, 53.92, 62.63, NA, NaN, 50.46,
65.14, 60.58, 63.29, NA, 64.33, NaN, NA, 68.57, NA, NA, 66.3,
NA, 57.29, NA, 53.5, 63.48, NA, 57.07, NaN, 61.82, NA, 68.61,
57.1, 62.84, 63, 61.91, 58.38, NaN, 61.56, NA, NaN, 65.55, 63.8,
65, 63.14, 67.31, 67.75, 57.62, 63.31, 54.83, 66.43, NA, NA,
64.67, 57.92, 59, NA), ARG_G2_EF = c(7.1, 7.4, 1.09, 3.67, NA,
NA, 12.75, 1.24, 3.28, 9.78, NA, NaN, 1.71, 1.93, NaN, 6.21,
2.76, 7.91, 8.65, 3.55, 3.4, 5, NaN, 16.05, 3.39, 4.52, 13, 11.6,
5.05, NaN, NA, 9.5, 9.67, NA, 7.03, 3.87, NA, NA, 8, 3.33, 2.19,
3, NaN, 8.53, 3.37, 5.47, 7.35, 13.48, 5.33, 3.83, 3.65, 5.82,
4, 6.17, 6, 6.42, NA, 3.83, 2.71, 2.19, 4.58, 5.18, NA, 9.75,
NA, NA, NA, 5, 6.44, 5.36, NA), NARR_G2_EF = c(5.08, 7, NA, 1.76,
3, NA, 8.88, 4.26, 2.92, 7.08, NA, 10.6, 5.5, 4.16, NaN, 2.87,
4.7, NaN, 7, 9.5, 4.68, NA, NaN, 12.75, 4.77, 9.15, 5, NA, 5.44,
NaN, NA, 4.57, NA, NA, 1.7, NA, 11.29, NA, 13.33, 5.95, NA, 10.79,
NaN, 5.18, NA, 5.22, 7.1, 3.53, 5.75, 6.77, 6.31, NaN, 7.88,
NA, NaN, 3, 4.88, 4.69, 6.19, 10.31, 3.62, 9.75, 5.46, 6.83,
4.43, NA, NA, 3.67, 8.67, 8.53, NA)), row.names = c(NA, -71L), class = "data.frame")
We may use pivot_longer - specify the columns with matches that match the column names substring _ABC or _EF at the end ($) of the string and split the column names at _ by specifying names_sep as _ as well as specify the corresponding column names in names_to (.value will return the value of the columns where as TYPE or GROUP gets the first and second substring from column names
library(tidyr)
pivot_longer(data, cols = matches('_(ABC|EF)$'),
names_to = c("TYPE", "GROUP", ".value"),
names_sep = "_", values_drop_na = TRUE)
-output
# A tibble: 217 × 6
ID IND_TEST_SCORE TYPE GROUP ABC EF
<glue> <dbl> <chr> <chr> <dbl> <dbl>
1 PART_1 100 ARG G1 68.5 4.94
2 PART_1 100 NARR G1 71.3 3.42
3 PART_1 100 ARG G2 64.9 7.1
4 PART_1 100 NARR G2 64.2 5.08
5 PART_2 36 ARG G1 65.9 6.55
6 PART_2 36 ARG G2 63.8 7.4
7 PART_2 36 NARR G2 59 7
8 PART_3 32 ARG G1 69.8 2.44
9 PART_3 32 ARG G2 71.7 1.09
10 PART_4 96 ARG G1 68.3 3
# … with 207 more rows

Get position indices after min / max aggregation on matrix

Sequel to my first problem here (How to aggregate hourly values into 24h-average means without timestamp).
Now I want to calculate the max (and min) from my timeseries of each 12-hour interval.
I have got my hourly data measurements (data_measure). Now I changed it into a time series of half-days.
t_measure <- ts(data = data_measure, frequency = 12)
then I used the aggregate function from {stats}
data_measure_daily_max <- aggregate(t_measure, 1, max)
data_measure <- structure(c(8.29, 7.96, 8.14, 7.27, 7.37, 7.3, 7.23, 7.53,
7.98, 10.2, 12.39, 14.34, 14.87, 14.39, 12.54, 11.84, 10.3, 10.62,
10.65, 10.56, 10.43, 10.35, 9.85, 9.12, 8.95, 8.82, 8.92, 9.33,
9.44, 9.3, 9.15, 9.37, 9.54, 10.24, 12.13, 12.43, 12.65, 13,
13.18, 13.58, 13.64, 13.75, 13.85, 13.94, 13.79, 13.84, 13.94,
14.26, 24.93, 24.64, 23.67, 21.46, 21.33, 20.83, 21.12, 21.1,
23.75, 25.39, 30.72, 30.71, 30.81, 30.92, 32.61, 32.37, 32.49,
30.68, 30.23, 30.45, 28.1, 26.9, 25.09, 25.07, 24.59, 24.22,
23.05, 22.21, 22.07, 21.6, 21.24, 21.22, 21.85, 24.87, 28.85,
29.42, 30.82, 30.97, 31.32, 30.81, 30.83, 29.9, 30.01, 30.31,
30, 27.91, 25.78, 25.88, 8.78, 8.47, 8.49, 7.65, 8.63, 9.02,
9.02, 8.11, 7.63, 9.19, 11.25, 12.24, 13.62, 12.09, 10.6, 11.1,
10.16, 10.44, 9.58, 10.04, 10.01, 10.23, 9.51, 9.2, 9.34, 9.6,
9.4, 9.45, 9.36, 9.26, 9.3, 9.46, 9.58, 9.89, 10.6, 11.04, 12.1,
12.61, 13.12, 13.47, 13.55, 13.51, 13.63, 13.84, 13.93, 14.17,
13.97, 13.86), .Dim = c(48L, 3L), .Dimnames = list(NULL, c("station1",
"station2", "station3")))
So actually I need an index/vector which tells me where my max and min of these time intervals are, so later on I can extract exactly these for an other data sets to make a comparison.
My first trial:
max_index <- which(aggregate(t_measure, 1, max)) # argument to 'which' is not logical
Use which.max and which.min with aggregate
a1 <- aggregate(t_measure, 1, which.min)
a2 <- aggregate(t_measure, 1, which.max)
a1
#Time Series:
#Start = 1
#End = 4
#Frequency = 1
# station1 station2 station3
#1 7 6 9
#2 12 12 12
#3 2 8 6
#4 1 11 1
a2
#Time Series:
#Start = 1
#End = 4
#Frequency = 1
# station1 station2 station3
#1 12 11 12
#2 1 3 1
#3 12 12 12
#4 12 3 10
If you want index for min with reference to original data_measure dataframe we can do
vals <- nrow(t_measure)/12
index_min <- a1 + (12 * (seq_len(vals) - 1))
index_min
#Time Series:
#Start = 1
#End = 4
#Frequency = 1
# station1 station2 station3
#1 7 6 9
#2 24 24 24
#3 26 32 30
#4 37 47 37
This can be read as for station1 in 1st 12 hour interval max value is present in 7th row of data_measure, for next 12 hour interval it is present in 24th row and same for other stations.

How to make a graph to represent mean, standard derivation, maximum and minimum values?

I want to make a graph with monthly data: mean, standard deviation and maximum and minimum values.
X axis would be my months and I would like to represent my mean with dots, squares and cross, for exemple and my standard deviation represented by delimited vertical lines in the mean symbols.
And still, represent my maximum and minimum values by an area.
I would like to plot 3 different periods: 1961-1990, 1990-2010 and 1961-2010.
Is it possible?
Some data:
Mês;Mean1;Std1;Min1;Max1;Mean2;Std2;Min2;Max2;Mean3;Std3;Min3;Max3
Jan;25.45;2.04;13.05;27.50;25.83;1.94;14.01;27.85;25.54;2.03;13.24;27.58
Feb;25.74;2.09;13.02;27.85;26.16;2.01;13.95;28.16;25.92;2.04;13.58;27.99
Mar;25.01;2.13;12.12;27.27;25.35;2.14;12.41;27.67;25.16;2.07;12.68;27.45
Apr;23.16;2.19;9.89;25.48;23.81;2.35;9.62;26.35;23.51;2.17;10.46;25.90
May;21.17;2.21;7.99;23.59;21.31;2.29;7.54;23.88;21.18;2.23;7.84;23.67
Jun;19.88;2.26;6.37;22.34;20.15;2.25;6.65;22.65;20.00;2.26;6.42;22.47
Jul;19.41;2.27;5.78;21.79;19.96;2.10;7.34;22.25;19.60;2.22;6.24;22.02
Aug;20.39;2.10;7.73;22.64;20.75;2.03;8.56;23.00;20.53;2.09;7.93;22.80
Sep;21.08;1.96;9.26;23.29;21.66;1.58;12.21;23.53;21.33;1.91;9.84;23.53
Oct;22.19;1.81;11.33;24.32;23.17;1.62;13.40;25.00;22.60;1.79;11.92;24.76
Nov;23.42;1.90;11.94;25.52;23.89;1.64;13.96;25.68;23.60;1.82;12.63;25.67
Dec;24.39;1.98;12.39;26.39;25.17;1.99;13.07;27.54;24.67;1.94;12.93;26.73
Short answer:
Yes that is indeed possible.
Long answer:
Here is how you could do it:
Assuming that the data you posted is in some data.frame called df:
head(df)
Mês Mean1 Std1 Min1 Max1 Mean2 Std2 Min2 Max2 Mean3 Std3 Min3 Max3
1 Jan 25.45 2.04 13.05 27.50 25.83 1.94 14.01 27.85 25.54 2.03 13.24 27.58
2 Feb 25.74 2.09 13.02 27.85 26.16 2.01 13.95 28.16 25.92 2.04 13.58 27.99
3 Mar 25.01 2.13 12.12 27.27 25.35 2.14 12.41 27.67 25.16 2.07 12.68 27.45
4 Apr 23.16 2.19 9.89 25.48 23.81 2.35 9.62 26.35 23.51 2.17 10.46 25.90
5 May 21.17 2.21 7.99 23.59 21.31 2.29 7.54 23.88 21.18 2.23 7.84 23.67
6 Jun 19.88 2.26 6.37 22.34 20.15 2.25 6.65 22.65 20.00 2.26 6.42 22.47
You first want to convert it from a wide format to a long format, meaning that we want each observation to have its own row. Perhaps someone with more tidyverse experience can do this in a more elegant way, but this is how I would do that:
# First we melt the dataframe
df2 <- reshape2::melt(df, id.vars = "Mês")
# Then we get a grouping variable from the column "variable"
df2$variable <- as.character(df2$variable)
df2$group <- substr(df2$variable, nchar(df2$variable), nchar(df2$variable))
# And we remove the trailing number from the variable
df2$variable <- substr(df2$variable, 1, nchar(df2$variable) - 1)
This is what the data will look like at this point:
head(df2)
Mês variable value group
1 Jan Mean 25.45 1
2 Feb Mean 25.74 1
3 Mar Mean 25.01 1
4 Apr Mean 23.16 1
5 May Mean 21.17 1
6 Jun Mean 19.88 1
We still need the means, standard deviations, minima and maxima to be on the same row, so we are going to un-melt (cast) the data by each group:
# First we split by group
df2 <- split(df2, df2$group)
# Then, we loop over the data and cast the data
df2 <- lapply(seq(df2), function(i){
dat <- df2[[i]]
cbind(reshape2::dcast(dat, Mês ~ variable), group = i)
})
# And finally combine the data.frame back together
df2 <- do.call(rbind, df2)
Now the data should look like this:
head(df2)
Mês Max Mean Min Std group
1 Jan 27.50 25.45 13.05 2.04 1
2 Feb 27.85 25.74 13.02 2.09 1
3 Mar 27.27 25.01 12.12 2.13 1
4 Apr 25.48 23.16 9.89 2.19 1
5 May 23.59 21.17 7.99 2.21 1
6 Jun 22.34 19.88 6.37 2.26 1
Data in this format is the easiest to plot with. We'll do that as follows:
# First we define all shared aesthetics in the main 'ggplot'-call:
ggplot(df2, aes(x = Mês,
group = as.factor(group),
colour = as.factor(group))) +
# Then as lowest layer, we want that area spanning 'Min' to 'Max'
geom_ribbon(aes(ymin = Min,
ymax = Max,
fill = as.factor(group)), alpha = 0.1) +
# Then we want our means displayed as points
geom_point(aes(y = Mean, shape = as.factor(group))) +
# The standard deviation as line segments with an arrowhead
geom_segment(aes(xend = Mês,
y = Mean - Std,
yend = Mean + Std),
arrow = arrow(angle = 90, ends = "both", length = unit(2, "mm"))) +
# Finally we tell that our point shapes should be dots, squares and crosses
scale_shape_manual(values = c(16, 15, 4))
And in my hands this yielded the following:
Now, as a last tip: if you want more people to help you or get help quicker, it is easiest to give them some data to play around with that they can copy-paste directly in R:
dput(df)
structure(list(Mês = structure(1:12, .Label = c("Jan", "Feb",
"Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov",
"Dec"), class = "factor"), Mean1 = c(25.45, 25.74, 25.01, 23.16,
21.17, 19.88, 19.41, 20.39, 21.08, 22.19, 23.42, 24.39), Std1 = c(2.04,
2.09, 2.13, 2.19, 2.21, 2.26, 2.27, 2.1, 1.96, 1.81, 1.9, 1.98
), Min1 = c(13.05, 13.02, 12.12, 9.89, 7.99, 6.37, 5.78, 7.73,
9.26, 11.33, 11.94, 12.39), Max1 = c(27.5, 27.85, 27.27, 25.48,
23.59, 22.34, 21.79, 22.64, 23.29, 24.32, 25.52, 26.39), Mean2 = c(25.83,
26.16, 25.35, 23.81, 21.31, 20.15, 19.96, 20.75, 21.66, 23.17,
23.89, 25.17), Std2 = c(1.94, 2.01, 2.14, 2.35, 2.29, 2.25, 2.1,
2.03, 1.58, 1.62, 1.64, 1.99), Min2 = c(14.01, 13.95, 12.41,
9.62, 7.54, 6.65, 7.34, 8.56, 12.21, 13.4, 13.96, 13.07), Max2 = c(27.85,
28.16, 27.67, 26.35, 23.88, 22.65, 22.25, 23, 23.53, 25, 25.68,
27.54), Mean3 = c(25.54, 25.92, 25.16, 23.51, 21.18, 20, 19.6,
20.53, 21.33, 22.6, 23.6, 24.67), Std3 = c(2.03, 2.04, 2.07,
2.17, 2.23, 2.26, 2.22, 2.09, 1.91, 1.79, 1.82, 1.94), Min3 = c(13.24,
13.58, 12.68, 10.46, 7.84, 6.42, 6.24, 7.93, 9.84, 11.92, 12.63,
12.93), Max3 = c(27.58, 27.99, 27.45, 25.9, 23.67, 22.47, 22.02,
22.8, 23.53, 24.76, 25.67, 26.73)), row.names = c(NA, -12L), class = "data.frame")

Splitting two messy vectors in a data frame into one common column

Sample of dataset:
library(dplyr)
sample <- structure(list(Rank = c(15, 17, 20, 2, 16, 8, 21, 5, 13, 31, 22, 18, 2, 19, 11, 11, 8, 7, 12, 9, 5, 23, 17, 16, 15, 14, 4, 20, 13, 2), Athlete = c("François Gourmet(BEL)", "Agustín Félix(ESP)", "Keisuke Ushiro", "Michael Schrader", "Pieter Braun", "Laurent Hernu(FRA)", "Dmitriy Karpov", "Laurent Hernu(FRA)", "Thomas van der Plaetsen", "Attila Szabó", "Nadir El Fassi", "Eduard Mikhan", "Leonel Suárez", "Janek Õiglane", "Hans van Alphen(BEL)", "Roman Šebrle", "André Niklaus(GER)", "Pascal Behrenbruch", "Pieter Braun", "Oleksandr Yurkov(UKR)", "Eelco Sintnicolaas", "Brent Newdick", "Kim Kun-woo", "Akihiko Nakamura", "Bastien Auzeil", "Frédéric Xhonneux", "Janek Õiglane", "Keisuke Ushiro", "Roman Šebrle", "Rico Freimuth"), Total = c(7974, 7749, 7498, 8670, 7890, 8280, 7550, 8218, 8069, 7610, 7922, 7968, 8640, 7581, 8034, 8266, 8020, 8211, 8114, 8264, 8298, 7915, 7860, 7745, 7922, 7616, 8371, 7532, 8069, 8564), `100m` = c(10.67, 11.17, 11.53, 10.73, 11.22, 10.97, 11.24, 11.2, 11.2, 11.15, 11.12, 10.97, 11.13, 11.51, 11.11, 11.16, 11.19, 11.08, 11.11, 10.93, 10.76, 11.11, 11.11, 10.86, 11.35, 11.28, 11.08, 11.51, 11.25, 10.53), LJ = c(7.15, 7.12, 6.64, 7.85, 7.17, 7.31, 6.86, 7.22, 7.79, 7.09, 7.26, 7.42, 7.24, 6.78, 7.35, 7.8, 7.21, 6.8, 7.29, 7.37, 7.29, 7.42, 7.24, 7.26, 6.87, 7.21, 7.33, 6.73, 7.3, 7.48), SP = c(13.74, 13.29, 13.43, 14.56, 14.48, 14.43, 15.69, 13.99, 12.76, 13.92, 13.62, 14.15, 15.2, 14.43, 14.67, 14.98, 13.87, 16.01, 13.9, 15.15, 14.13, 14.35, 12.96, 11.67, 15.23, 12.92, 15.13, 14.93, 15.2, 14.85), HJ = c(1.85, 2.03, 1.96, 1.99, 1.93, 2.03, 1.93, 2.03, 2.17, 1.84, 1.99, 1.96, 2.11, 1.92, 1.88, 2.11, 1.97, 1.93, 2.04, 1.97, 1.93, 1.99, 1.96, 1.95, 1.96, 2.03, 2.05, 1.89, 2.05, 1.99), `400m` = c(47.98, 52.08, 51.43, 47.66, 48.54, 49.31, 52.01, 48.95, 49.46, 49.79, 51.35, 48.8, 48, 50.95, 48.52, 50.42, 49.95, 49.9, 48.24, 49.45, 48.35, 50.1, 49.24, 47.81, 50.36, 49.04, 49.58, 50.85, 51.18, 48.41), `110mh` = c(15.02, 14.75, 15.35, 14.29, 14.67, 14.01, 14.64, 14.15, 14.79, 14.65, 14.9, 14.82, 14.45, 15.33, 14.77, 14.44, 14.5, 14.33, 14.37, 14.41, 14.42, 14.82, 14.95, 14.72, 14.59, 15.75, 14.56, 15.43, 14.75, 13.68), DT = c(39.87, 43.67, 47.64, 46.44, 42.59, 43.93, 47.1, 46.13, 37.2, 43.75, 42.25, 48, 44.71, 40.94, 44.3, 46.3, 42.68, 48.56, 42.09, 48.1, 42.23, 43.6, 39.53, 33.48, 46.86, 38.62, 42.11, 46.85, 46.93, 51.17), PV = c(5, 5, 4.6, 5, 4.7, 5.1, 4.8, 4.9, 5.1, 4.4, 4.8, 4.6, 5, 4.6, 4.3, 4.6, 5.1, 4.9, 4.9, 5, 5.2, 4.8, 4.9, 4.7, 4.8, 4.7, 5.1, 4.7, 4.8, 4.8), JT = c(57.73, 56.69, 63.28, 65.67, 59.26, 59.9, 46.91, 59.63, 58.91, 59.56, 57.65, 50.74, 75.19, 68.51, 65.71, 65.61, 57.55, 66.5, 56.95, 58.63, 61.07, 51.52, 53.33, 53.57, 60.8, 50.18, 71.73, 56.52, 67.28, 62.34), `1500m` = c(265.51, 288.27, 291.9, 265.38, 278.4, 277.41, 298.41, 268.4, 285.86, 285.64, 256.51, 273.71, 267.25, 283.06, 262.5, 290.33, 268.8, 276.64, 272.46, 278.43, 265.4, 270.57, 255.63, 256.36, 279.8, 262.71, 279.24, 283.51, 296.5, 281.57), Year = structure(c(4L, 4L, 9L, 7L, 9L, 1L, 6L, 2L, 6L, 5L, 5L, 7L, 5L, 8L, 4L, 5L, 2L, 6L, 8L, 1L, 6L, 5L, 6L, 8L, 9L, 3L, 9L, 8L, 6L, 9L), .Label = c("2001", "2003", "2005", "2007", "2009", "2011", "2013", "2015", "2017"), class = "factor"), Nationality = c(NA, NA, "Japan(JPN)", "Germany(GER)", "Netherlands(NED)", NA, "Kazakhstan(KAZ)", NA, "Belgium(BEL)", "Hungary", "France", "Belarus(BLR)", "Cuba", "Estonia(EST)", NA, "Czech Republic", NA, "Germany(GER)", "Netherlands(NED)", NA, "Netherlands(NED)", "New Zealand", "South Korea(KOR)", "Japan(JPN)", "France(FRA)", NA, "Estonia(EST)", "Japan(JPN)", "Czech Republic(CZE)", "Germany(GER)"), Notes = c(NA, NA, NA, "PB", NA, NA, NA, NA, NA, NA, "SB", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "PB", "NR", NA, "SB", NA, "PB", NA, NA, NA)), .Names = c("Rank", "Athlete", "Total", "100m", "LJ", "SP", "HJ", "400m", "110mh", "DT", "PV", "JT", "1500m", "Year", "Nationality", "Notes"), row.names = c(NA, -30L), class = c("tbl_df", "tbl", "data.frame"))
# A tibble: 30 x 16
Rank Athlete Total `100m` LJ SP HJ `400m` `110mh` DT PV JT `1500m` Year Nationality Notes
<dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fctr> <chr> <chr>
1 15 François Gourmet(BEL) 7974 10.67 7.15 13.74 1.85 47.98 15.02 39.87 5.0 57.73 265.51 2007 <NA> <NA>
2 17 Agustín Félix(ESP) 7749 11.17 7.12 13.29 2.03 52.08 14.75 43.67 5.0 56.69 288.27 2007 <NA> <NA>
3 20 Keisuke Ushiro 7498 11.53 6.64 13.43 1.96 51.43 15.35 47.64 4.6 63.28 291.90 2017 Japan(JPN) <NA>
4 2 Michael Schrader 8670 10.73 7.85 14.56 1.99 47.66 14.29 46.44 5.0 65.67 265.38 2013 Germany(GER) PB
5 16 Pieter Braun 7890 11.22 7.17 14.48 1.93 48.54 14.67 42.59 4.7 59.26 278.40 2017 Netherlands(NED) <NA>
6 8 Laurent Hernu(FRA) 8280 10.97 7.31 14.43 2.03 49.31 14.01 43.93 5.1 59.90 277.41 2001 <NA> <NA>
7 21 Dmitriy Karpov 7550 11.24 6.86 15.69 1.93 52.01 14.64 47.10 4.8 46.91 298.41 2011 Kazakhstan(KAZ) <NA>
8 5 Laurent Hernu(FRA) 8218 11.20 7.22 13.99 2.03 48.95 14.15 46.13 4.9 59.63 268.40 2003 <NA> <NA>
9 13 Thomas van der Plaetsen 8069 11.20 7.79 12.76 2.17 49.46 14.79 37.20 5.1 58.91 285.86 2011 Belgium(BEL) <NA>
10 31 Attila Szabó 7610 11.15 7.09 13.92 1.84 49.79 14.65 43.75 4.4 59.56 285.64 2009 Hungary <NA>
# ... with 20 more rows
I have two character vectors, "Athlete and "Nationality", in my dataset where some entries have country codes in brackets attached at the end. I want to be able to split only the country codes from these two vectors into a new variable, say "countrycode", while getting rid of the brackets at the same time. I'm not sure what the best way or syntax to go about splitting would be though - dplyr::separate possibly? Though I'm uncertain how to incorporate the combinations of characters in the country codes within the brackets during the split, and the fact that some entries don't need splitting.
I would then do something like this after to remove the brackets from the new variable.
sample$countrycode<- gsub(pattern="\\(",replacement="",x=sample$countrycode)
sample$countrycode<- gsub(pattern="\\)",replacement="",x=sample$countrycode)
Thanks
Hope this works for you:
library(dplyr)
res <- sample %>% mutate(
countrycode = case_when(
is.na(Nationality) & grepl('\\(', Athlete) ~ gsub('.*?\\((.*)\\)', '\\1', Athlete),
grepl('\\(', Nationality) ~ gsub('.*?\\((.*)\\)', '\\1', Nationality),
TRUE ~ Nationality
)
)
sample output:
res %>% select(Athlete, Nationality, countrycode)
# # A tibble: 30 x 3
# Athlete Nationality countrycode
# <chr> <chr> <chr>
# 1 François Gourmet(BEL) NA BEL
# 2 Agustín Félix(ESP) NA ESP
# 3 Keisuke Ushiro Japan(JPN) JPN
# 4 Michael Schrader Germany(GER) GER
# 5 Pieter Braun Netherlands(NED) NED
# 6 Laurent Hernu(FRA) NA FRA
# 7 Dmitriy Karpov Kazakhstan(KAZ) KAZ
# 8 Laurent Hernu(FRA) NA FRA
# 9 Thomas van der Plaetsen Belgium(BEL) BEL
# 10 Attila Szabó Hungary Hungary
# # ... with 20 more rows
Remove the TRUE ~ Nationality to extract only country code as commented by Frank:
sample %>% mutate(
countrycode = case_when(
is.na(Nationality) & grepl('\\(', Athlete) ~ gsub('.*?\\((.*)\\)', '\\1', Athlete),
grepl('\\(', Nationality) ~ gsub('.*?\\((.*)\\)', '\\1', Nationality)
))
An ugly approach would be to use sub:
library(data.table)
DT = data.table(sample)
patt = "^.*\\((.{3})\\).*$"; rp = "\\1"
DT[Athlete %like% patt, cc := sub(patt, rp, Athlete)]
DT[Nationality %like% patt, cc := sub(patt, rp, Nationality)]
Something like str_extract from the stringr package would probably be cleaner if you're already working with tidyverse packages. Also, for the dplyr analogue to the code above, maybe look at the case_when function. (I am not familiar enough with these tools to know the exact syntax.)
The result looks like...
> DT[, .(Athlete, Nationality, cc)]
Athlete Nationality cc
1: François Gourmet(BEL) NA BEL
2: Agustín Félix(ESP) NA ESP
3: Keisuke Ushiro Japan(JPN) JPN
4: Michael Schrader Germany(GER) GER
5: Pieter Braun Netherlands(NED) NED
6: Laurent Hernu(FRA) NA FRA
7: Dmitriy Karpov Kazakhstan(KAZ) KAZ
8: Laurent Hernu(FRA) NA FRA
9: Thomas van der Plaetsen Belgium(BEL) BEL
10: Attila Szabó Hungary NA
11: Nadir El Fassi France NA
12: Eduard Mikhan Belarus(BLR) BLR
13: Leonel Suárez Cuba NA
14: Janek Õiglane Estonia(EST) EST
15: Hans van Alphen(BEL) NA BEL
16: Roman Šebrle Czech Republic NA
17: André Niklaus(GER) NA GER
18: Pascal Behrenbruch Germany(GER) GER
19: Pieter Braun Netherlands(NED) NED
20: Oleksandr Yurkov(UKR) NA UKR
21: Eelco Sintnicolaas Netherlands(NED) NED
22: Brent Newdick New Zealand NA
23: Kim Kun-woo South Korea(KOR) KOR
24: Akihiko Nakamura Japan(JPN) JPN
25: Bastien Auzeil France(FRA) FRA
26: Frédéric Xhonneux NA NA
27: Janek Õiglane Estonia(EST) EST
28: Keisuke Ushiro Japan(JPN) JPN
29: Roman Šebrle Czech Republic(CZE) CZE
30: Rico Freimuth Germany(GER) GER
Athlete Nationality cc
This simple solution works too.
library(stringr)
data1$country_code <- sapply(data1$Nationality, function(x) unlist(stri_extract_all(str = x, regex = '([A-Z]+)'))[2])
Nationality country_code
1: NA NA
2: NA NA
3: Japan(JPN) JPN
4: Germany(GER) GER
5: Netherlands(NED) NED
6: NA NA

Inserting value in NAs repetitively in order [duplicate]

This question already has answers here:
Replacing NAs with latest non-NA value
(21 answers)
Closed 5 years ago.
I have a problem of making my data complete. Below is my data
> head(DF1)
# A tibble: 6 x 4
Date Coalprice Gasprice Co2emissionprice
<date> <dbl> <dbl> <dbl>
1 2015-12-31 47.45 14.40 8.22
2 2015-12-30 47.45 14.30 8.22
3 2015-12-29 47.40 15.40 8.27
4 2015-12-28 47.00 14.42 8.32
5 2015-12-25 47.00 14.20 8.22
6 2015-12-24 47.00 14.20 8.22
So data goes down all the way down to 2011-01-01 from 2015-12-31. But now, if you look carefully, my data has regular missing values. Every weekend's value is missing. So I want to put the prices for weekends as well to fill up NA. What I want to do is fill up every weekend (Sat and Sun) with the same prices on a day before every weekend, so Friday.
So in this example, 2015-12-25' prices 47 14.20 8.22 will go to Sat and Sun as well. Then next weekend's prices will be the same as Friday in that week.
Can you guys help me out with syntax?
Thank you very much for your advice.
dput info is below:
> dput(head(DF1, 30))
structure(list(Date = structure(c(16800, 16799, 16798, 16797,
16794, 16793, 16792, 16791, 16790, 16787, 16786, 16785, 16784,
16783, 16780, 16779, 16778, 16777, 16776, 16773, 16772, 16771,
16770, 16769, 16766, 16765, 16764, 16763, 16762, 16759), class = "Date"),
Coalprice = c(47.45, 47.45, 47.4, 47, 47, 47, 47, 47.6, 47.6,
47.8, 47.75, 47.75, 47.7, 47.65, 47.35, 47.4, 47.45, 47.4,
47.75, 48.55, 48.95, 49.1, 49.7, 49.95, 50.3, 53.85, 53.95,
53.95, 54, 54.35), Gasprice = c(14.4, 14.3, 15.4, 14.42,
14.2, 14.2, 13.93, 13.85, 14.35, 14.9, 15.5, 15.25, 15.95,
16.08, 16.23, 16.5, 16.65, 16.75, 16.78, 17.15, 17.15, 17.85,
17.95, 18.2, 17.7, 17.7, 17.88, 17.7, 17.6, 17.5), Co2emissionprice = c(8.22,
8.22, 8.27, 8.32, 8.22, 8.22, 8.22, 8.25, 8.18, 8.07, 8.07,
8.12, 8.19, 8.09, 8.07, 8.36, 8.4, 8.42, 8.42, 8.52, 8.58,
8.49, 8.55, 8.58, 8.56, 8.58, 8.62, 8.65, 8.56, 8.51)), .Names = c("Date",
"Coalprice", "Gasprice", "Co2emissionprice"), row.names = c(NA,
-30L), class = c("tbl_df", "tbl", "data.frame"))
You can use tidyr packages to do this in a single line.
library(tidyr)
df <- fill(df, contains("price"), .direction = "down")

Resources