Removing blank rows in the two data frames and combine them - r

I have a dataset which contains two groups: the first 3 columns are the 1st group and the next 3 columns are the 2nd group. They contain missing values at random.
I have to delete the rows in which either group is completely missing, so that both groups contain at least one value in every remaining row.
At last I have to combine both the groups.
I have tried many R codes. Please suggest some useful R function for this issue.
example data structure

If your data is properly named, this can be done using starts_with and if_all (otherwise you might rename your columns first as you see fit)
library(tidyverse)
df <- tribble(
~x1, ~x2, ~x3, ~y1, ~y2, ~y3,
26.4, 26.5, 26.6, 26.7, 26.4, 26.5,
NA, NA, NA, 23.7, NA, NA,
27.2, 28.0, 27.9, 27.6, 27.8, 27.7,
NA, 24.2, 24.9, 23.9, 24.9, 24.0,
24.3, NA, 24.3, 24.0, 24.1, 24.5,
26.9, 26.7, 27.0, 26.9, 26.8, 26.8,
24.4, 24.4, 24.5, 24.8, 24.3, 24.3,
NA, NA, NA, 23.9, NA, NA,
NA, NA, NA, 23.9, NA, NA,
24.9, NA, NA, 24.9, NA, NA,
NA, NA, NA, 24.5, NA, NA,
28.3, 28.2, 28.3, 28.2, 28.4, 28.3,
28.3, 28.4, 28.1, 28.3, 28.3, 28.2
)
df %>% filter(!if_all(starts_with("x"), is.na) & !if_all(starts_with("y"), is.na))
#> # A tibble: 9 × 6
#> x1 x2 x3 y1 y2 y3
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 26.4 26.5 26.6 26.7 26.4 26.5
#> 2 27.2 28 27.9 27.6 27.8 27.7
#> 3 NA 24.2 24.9 23.9 24.9 24
#> 4 24.3 NA 24.3 24 24.1 24.5
#> 5 26.9 26.7 27 26.9 26.8 26.8
#> 6 24.4 24.4 24.5 24.8 24.3 24.3
#> 7 24.9 NA NA 24.9 NA NA
#> 8 28.3 28.2 28.3 28.2 28.4 28.3
#> 9 28.3 28.4 28.1 28.3 28.3 28.2
Created on 2022-06-18 by the reprex package (v2.0.1)

I am not sure I understand your question, but here is a demonstration using dplyr::if_all() (dplyr::if_any() works analogously)
library(tidyverse)
# Example data
# have to delete the rows containing complete missing values in any one group.
# And both group contains at least one value in the row. (Not sure what that means)
# At last I have to combine both the groups.
d <- tibble::tribble(
~gr1_col1, ~gr1_col2, ~gr1_col3, ~gr2_col1, ~gr2_col2, ~gr2_col3,
1, 2, NA, 1, 1, 1,
NA, NA, NA, NA, 1, 1,
NA, 1, 1, NA, NA, 1,
1, NA, 2, NA, NA, NA,
)
d
#> # A tibble: 4 x 6
#> gr1_col1 gr1_col2 gr1_col3 gr2_col1 gr2_col2 gr2_col3
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 2 NA 1 1 1
#> 2 NA NA NA NA 1 1
#> 3 NA 1 1 NA NA 1
#> 4 1 NA 2 NA NA NA
# Keep only the rows in which each group has at least one non-missing value.
d %>%
dplyr::filter(
# First group
!dplyr::if_all(.cols = c(1, 2, 3), .fns = is.na), # drop rows where columns 1, 2 and 3 are all NA
# Second group
!if_all(.cols = c(4, 5, 6), .fns = is.na) # drop rows where columns 4, 5 and 6 are all NA
)
#> # A tibble: 2 x 6
#> gr1_col1 gr1_col2 gr1_col3 gr2_col1 gr2_col2 gr2_col3
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 2 NA 1 1 1
#> 2 NA 1 1 NA NA 1
# Not sure what you mean with how you want to combine groups
Created on 2022-06-17 by the reprex package (v2.0.1)

Related

How to find specific multi-variable thresholds for AUC cutpoint?

I want to figure out how to find the variable cutoff points for a glm. Cutpointr does this really well for single variable but with more complex models how would I go about extracting this info? Say I did ~ glucose + age + mass and wanted at this cutoff values x, y and z are present or is this not possible? would it be more like this number is the summed cutoff of the variables? Also sorry for no images it wouldn't let me add them in!
#for reprex
library(reprex)
library(mlbench)
library(tidymodels)
library(cutpointr)
#>
#> Attaching package: 'cutpointr'
#> The following objects are masked from 'package:yardstick':
#>
#> accuracy, npv, ppv, precision, recall, sensitivity, specificity
library(ggplot2)
library(tidyverse)
data(PimaIndiansDiabetes2)
head(PimaIndiansDiabetes2)
#> pregnant glucose pressure triceps insulin mass pedigree age diabetes
#> 1 6 148 72 35 NA 33.6 0.627 50 pos
#> 2 1 85 66 29 NA 26.6 0.351 31 neg
#> 3 8 183 64 NA NA 23.3 0.672 32 pos
#> 4 1 89 66 23 94 28.1 0.167 21 neg
#> 5 0 137 40 35 168 43.1 2.288 33 pos
#> 6 5 116 74 NA NA 25.6 0.201 30 neg
Diabetes <- na.omit(PimaIndiansDiabetes2)
glimpse(PimaIndiansDiabetes2)
#> Rows: 768
#> Columns: 9
#> $ pregnant <dbl> 6, 1, 8, 1, 0, 5, 3, 10, 2, 8, 4, 10, 10, 1, 5, 7, 0, 7, 1, 1…
#> $ glucose <dbl> 148, 85, 183, 89, 137, 116, 78, 115, 197, 125, 110, 168, 139,…
#> $ pressure <dbl> 72, 66, 64, 66, 40, 74, 50, NA, 70, 96, 92, 74, 80, 60, 72, N…
#> $ triceps <dbl> 35, 29, NA, 23, 35, NA, 32, NA, 45, NA, NA, NA, NA, 23, 19, N…
#> $ insulin <dbl> NA, NA, NA, 94, 168, NA, 88, NA, 543, NA, NA, NA, NA, 846, 17…
#> $ mass <dbl> 33.6, 26.6, 23.3, 28.1, 43.1, 25.6, 31.0, 35.3, 30.5, NA, 37.…
#> $ pedigree <dbl> 0.627, 0.351, 0.672, 0.167, 2.288, 0.201, 0.248, 0.134, 0.158…
#> $ age <dbl> 50, 31, 32, 21, 33, 30, 26, 29, 53, 54, 30, 34, 57, 59, 51, 3…
#> $ diabetes <fct> pos, neg, pos, neg, pos, neg, pos, neg, pos, pos, neg, pos, n…
cp <- cutpointr(Diabetes, glucose, diabetes,
method = maximize_metric, metric = sum_sens_spec)
#> Assuming the positive class is pos
#> Assuming the positive class has higher x values
plot_cutpointr(cp)
summary(cp)
#> Method: maximize_metric
#> Predictor: glucose
#> Outcome: diabetes
#> Direction: >=
#>
#> AUC n n_pos n_neg
#> 0.8058 392 130 262
#>
#> optimal_cutpoint sum_sens_spec acc sensitivity specificity tp fn fp tn
#> 128 1.5055 0.7628 0.7231 0.7824 94 36 57 205
#>
#> Predictor summary:
#> Data Min. 5% 1st Qu. Median Mean 3rd Qu. 95% Max. SD NAs
#> Overall 56 81.0 99.00 119.0 122.6276 143.00 181.00 198 30.86078 0
#> neg 56 79.0 94.00 107.5 111.4313 126.00 154.00 197 24.64213 0
#> pos 78 95.9 124.25 144.5 145.1923 171.75 188.55 198 29.83939 0
res_unnested <- cp %>%
unnest(cols = roc_curve)
annotation <- paste0("AUC: ", round(cp$AUC, 2), "\n",
"Cutpoint: ", round(cp$optimal_cutpoint, 2))
ggplot(res_unnested, aes(x = 1 - tnr, y = tpr)) +
xlab("1 - Specificity") +
ylab("Sensitivity") +
theme_bw() +
theme(aspect.ratio = 1) +
geom_line(color = "red") +
geom_vline(xintercept = 1 - cp$specificity, linetype = "dotted") +
geom_hline(yintercept = cp$sensitivity, linetype = "dotted") +
annotate("text", x = 0.85, y = 0.05, label = annotation) +
ggtitle("ROC curve", "Using glucose mg/dL as a predictive logistic variable for diabetes") +
geom_abline(intercept = 0, slope = 1, linetype = 2)
# ROC curve for the combined predictors glucose + age + mass.
# NOTE(review): ROC() is presumably Epi::ROC; library(Epi) is not attached
# in the calls above -- confirm it is loaded before running this line.
# `TRUE` is spelled out because `T` is an ordinary, reassignable variable.
ROC(form = diabetes ~ glucose + age + mass, data = Diabetes, plot = "ROC", MX = TRUE)
I have tried to add more parameters to cutpointr which was unsuccessful. I have also tried to run with Epi and saw a better AUC with age and mass included. I have also run a glm but I am just not sure how to properly analyze the glm for this type of information. Looking on the tidymodels website for help while waiting for suggestions, thanks!

How to split df (group_split or split.data.frame) naming the new dfs

First of all I've seen several questions similar to it but not specifying the name of the dfs
My df
structure(list(paciente = structure(c(6163, 6553, 6357, 6331,
6228, 6280, 6383, 6198, 6316, 6148), label = "Paciente", format.spss = "F6.0"),
grupo_int_v00 = structure(c(1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L,
1L, 2L), .Label = c("A", "B"), label = "Grupo de intervención", class = "factor"),
time = structure(c(3L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L
), .Label = c("00", "01", "66"), class = "factor"), peso1 = c(76.3,
95.4, 103.5, 82.1, 103.3, 77.6, 76, 88.3, 64, 101), cintura1 = c(104.5,
120, 116, 104, 120.5, 104, NA, 110, 104, 119), tasis2_e = c(145,
137, 123, 153, 131, 147, NA, 147, 121, 131), tadias2_e = c(64,
61, 76, 75, 65, 84, NA, 76, 59, 96), p17_total = c(12, 3,
9, 6, 8, 10, 9, 7, 12, 9), geaf_tot = c(1678.32, 1398.6,
1566.43, 587.41, 4876.46, 3776.22, 1762.24, 3188.81, 7192.54,
1678.32), glucosa = c(273, 149, 96, 115, 101, 94, NA, 125,
104, 107), albumi = c(4.15, 4.75, 4.59, 4.83, 4.64, 4.49,
NA, 4.71, 4.33, 4.09), coltot = c(137, 174, 252, 270, 211,
164, NA, 192, 281, 234), hdl = c(30, 56, 45, 74, 66, 51,
NA, 34, 62, 44), ldl_calc = c(51, 95, NA, 177, 127, 90, NA,
130, 186, 170), trigli = c(280, 114, 309, 96, 89, 115, NA,
139, 165, 99), hba1c = c(13.77, 6.57, 5.65, 6.52, 5.69, 6.02,
NA, 6.25, 5.95, 5.93), i_hucpeptide = c(3567.05, 1407.53,
1259.29, 1028.31, 649.19, 893.52, NA, 815.82, 342.68, NA),
i_hughrelin = c(1214.83, 874.6, 1015.68, 919.51, 456.28,
650.22, NA, 143.32, 1159.1, NA), i_hugip = c(2.67, 2.67,
2.67, 2.67, 2.67, 2.67, NA, 2.67, 2.67, NA), i_huglp1 = c(538.62,
264.67, 106.76, 164.82, 141.23, 14.14, NA, 112.57, 14.14,
NA), i_huglucagon = c(720.19, 801.94, 321.68, 629.04, 186.88,
238.33, NA, 238, 265.84, NA), i_huinsulin = c(1646.21, 545.57,
297.96, 333.05, 232.17, 263.55, NA, 263.87, 136.97, NA),
i_huleptin = c(8476.58, 10680.93, 6034.91, 14225.58, 2160.27,
2778.49, NA, 2829.59, 6102.63, NA), i_hupai1 = c(3787.2,
2401.66, 1040.35, 2123.09, 1625.27, 1932.06, NA, 2483.08,
919.81, NA), i_huresistin = c(11350.35, 5171.75, 5794.31,
2814.22, 2994.15, 3215.24, NA, 2577.84, 3227.73, NA), i_huvisfatin = c(1652.92,
2125.95, 407.98, 3544.59, 8.64, 132.49, NA, 8.64, 189.96,
NA), col_rema = c(56, 23, NA, 19, 18, 23, NA, 28, 33, 20),
homa = c(19974.0146666667, 3612.88577777778, 1271.296, 1702.25555555556,
1042.18533333333, 1101.05333333333, NA, 1465.94444444444,
633.105777777778, NA), i_pcr = c(0.39, 0.57, 0.04, 0.22,
0.04, 1.01, NA, 0.1, 0.04, NA), i_ratiolg = c(6.97758534115885,
12.2123599359707, 5.94174346250788, 15.4708268534328, 4.73452704479705,
4.273153701824, NA, 19.743162154619, 5.26497282374256, NA
)), row.names = c(NA, -10L), class = c("tbl_df", "tbl", "data.frame"
))
The df looks like:
paciente grupo_int_v00 time peso1 cintura1 tasis2_e tadias2_e
<dbl> <fct> <fct> <dbl> <dbl> <dbl> <dbl>
1 6163 A 66 76.3 104. 145 64
2 6553 A 01 95.4 120 137 61
3 6357 B 01 104. 116 123 76
4 6331 B 00 82.1 104 153 75
5 6228 A 01 103. 120. 131 65
6 6280 B 01 77.6 104 147 84
split(df, df$grupo_int_v00) %>% list2env(envir = globalenv())
I am generating dfs named after the levels of grupo_int_v00. Now imagine I want to set the names of the dfs myself; how can I do that? My grupo_int_v00 here is A or B, but usually the levels are numbers, and I don't want the names to be numbers; I want them to be group_A, group_B, group_C, ...
Not sure if you can provide with dplyr option (group_split and set_names or similar)
Thank you
You can change the names with paste + set_names:
library(magrittr)
split(df, df$grupo_int_v00) %>%
set_names(paste("group", names(.), sep = "_"))
The base R alternative is:
spl <- split(df, df$grupo_int_v00)
names(spl) <- paste("group", names(spl), sep = "_")
An even shorter alternative is to add the paste call in the split function:
split(df, ~ paste0("group_", grupo_int_v00))
output
> spl
$group_A
# A tibble: 6 × 30
paciente grupo_i…¹ time peso1 cintu…² tasis…³ tadia…⁴ p17_t…⁵ geaf_…⁶ glucosa albumi coltot
<dbl> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 6163 A 66 76.3 104. 145 64 12 1678. 273 4.15 137
2 6553 A 01 95.4 120 137 61 3 1399. 149 4.75 174
3 6228 A 01 103. 120. 131 65 8 4876. 101 4.64 211
4 6383 A 01 76 NA NA NA 9 1762. NA NA NA
5 6198 A 01 88.3 110 147 76 7 3189. 125 4.71 192
6 6316 A 01 64 104 121 59 12 7193. 104 4.33 281
# … with 18 more variables: hdl <dbl>, ldl_calc <dbl>, trigli <dbl>, hba1c <dbl>,
# i_hucpeptide <dbl>, i_hughrelin <dbl>, i_hugip <dbl>, i_huglp1 <dbl>, i_huglucagon <dbl>,
# i_huinsulin <dbl>, i_huleptin <dbl>, i_hupai1 <dbl>, i_huresistin <dbl>,
# i_huvisfatin <dbl>, col_rema <dbl>, homa <dbl>, i_pcr <dbl>, i_ratiolg <dbl>, and
# abbreviated variable names ¹​grupo_int_v00, ²​cintura1, ³​tasis2_e, ⁴​tadias2_e, ⁵​p17_total,
# ⁶​geaf_tot
# ℹ Use `colnames()` to see all variable names
$group_B
# A tibble: 4 × 30
paciente grupo_i…¹ time peso1 cintu…² tasis…³ tadia…⁴ p17_t…⁵ geaf_…⁶ glucosa albumi coltot
<dbl> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 6357 B 01 104. 116 123 76 9 1566. 96 4.59 252
2 6331 B 00 82.1 104 153 75 6 587. 115 4.83 270
3 6280 B 01 77.6 104 147 84 10 3776. 94 4.49 164
4 6148 B 01 101 119 131 96 9 1678. 107 4.09 234
# … with 18 more variables: hdl <dbl>, ldl_calc <dbl>, trigli <dbl>, hba1c <dbl>,
# i_hucpeptide <dbl>, i_hughrelin <dbl>, i_hugip <dbl>, i_huglp1 <dbl>, i_huglucagon <dbl>,
# i_huinsulin <dbl>, i_huleptin <dbl>, i_hupai1 <dbl>, i_huresistin <dbl>,
# i_huvisfatin <dbl>, col_rema <dbl>, homa <dbl>, i_pcr <dbl>, i_ratiolg <dbl>, and
# abbreviated variable names ¹​grupo_int_v00, ²​cintura1, ³​tasis2_e, ⁴​tadias2_e, ⁵​p17_total,
# ⁶​geaf_tot
# ℹ Use `colnames()` to see all variable names

Finding distance between a row and the row two above it in R

I would like to efficiently compute distances between every row in a matrix and the row two rows above it in R...
My attempts at finding a dplyr rowwise solution with lag(., n = 2) have failed, and I'm sure there's a better solution than this for loop.
Thoughts are much appreciated!
library(rdist)
library(tidyverse)
structure(list(sodium = c(140, 152.6, 138, 152.4, 140, 152.6,
141, 152.7, 141, 152.7), chloride = c(103, 148.9, 104, 149, 102,
148.8, 103, 148.9, 104, 149), potassium_plas = c(3.4, 0.34, 4.1,
0.41, 3.7, 0.37, 4, 0.4, 3.7, 0.37), co2_totl = c(31, 3.1, 22,
2.2, 23, 2.3, 27, 2.7, 20, 2), bun = c(11, 1.1, 5, 0.5, 8, 0.8,
21, 2.1, 10, 1), creatinine = c(0.84, 0.084, 0.53, 0.053, 0.69,
0.069, 1.04, 0.104, 1.86, 0.186), calcium = c(9.3, 0.93, 9.8,
0.98, 9.4, 0.94, 9.4, 0.94, 9.1, 0.91), glucose = c(102, 10.2,
99, 9.9, 115, 11.5, 94, 9.4, 122, 12.2), anion_gap = c(6, 0.599999999999989,
12, 1.20000000000001, 15, 1.50000000000001, 11, 1.09999999999998,
17, 1.69999999999999)), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame"))
# Distance between each row and the row two above it. Rows 1 and 2 have no
# such predecessor and stay NA.
n_rows <- nrow(input_labs)
# rep() has no `n` argument, so `rep(NA, n = ...)` did not build a full-length
# vector; pass the count as `times` so the result really is preallocated.
dist_prior <- rep(NA_real_, n_rows)
# seq_len(n_rows)[-(1:2)] is empty when there are fewer than 3 rows, whereas
# 3:n_rows would count downwards and index invalid rows.
for (i in seq_len(n_rows)[-(1:2)]) {
  dist_prior[i] <- cdist(input_labs[i, ], input_labs[i - 2, ])
}
We could loop over the sequence of rows in map and apply the function, append NAs at the beginning to make the length correct
library(dplyr)
library(rdist)
library(purrr)
input_labs %>%
mutate(dist_prior = c(NA_real_, NA_real_,
map_dbl(3:n(), ~ cdist(cur_data()[.x,], cur_data()[.x-2, ]))))
-output
# A tibble: 10 × 10
sodium chloride potassium_plas co2_totl bun creatinine calcium glucose anion_gap dist_prior
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 140 103 3.4 31 11 0.84 9.3 102 6 NA
2 153. 149. 0.34 3.1 1.1 0.084 0.93 10.2 0.600 NA
3 138 104 4.1 22 5 0.53 9.8 99 12 13.0
4 152. 149 0.41 2.2 0.5 0.053 0.98 9.9 1.20 1.30
5 140 102 3.7 23 8 0.69 9.4 115 15 16.8
6 153. 149. 0.37 2.3 0.8 0.069 0.94 11.5 1.50 1.68
7 141 103 4 27 21 1.04 9.4 94 11 25.4
8 153. 149. 0.4 2.7 2.1 0.104 0.94 9.4 1.10 2.54
9 141 104 3.7 20 10 1.86 9.1 122 17 31.5
10 153. 149 0.37 2 1 0.186 0.91 12.2 1.70 3.15
Or we may split both the original data and the lagged data by row, then use map2 to loop over the two lists and apply the function
input_labs$dist_prior <- map2_dbl(
asplit(lag(input_labs, n = 2), 1),
asplit(input_labs, 1),
~ cdist(as.data.frame.list(.x), as.data.frame.list(.y))[,1])
in Base R you can use diff and rowSums as shown below:
# Row-wise differences at lag 2 (row i minus row i - 2), then the Euclidean
# norm of each difference; the first two rows have no counterpart, hence NA.
lag2_diffs <- diff(as.matrix(input_labs), lag = 2)
c(NA, NA, sqrt(rowSums(lag2_diffs^2)))
[1] NA NA 12.955157 1.295516 16.832873 1.683287 25.381342 2.538134 31.493688 3.149369
You can cbind the results to the original dataframe.

Replacing values based on conditions

I have a dataframe in which one of the columns is Id, and some of its values were messed up while the data were being recorded.
here's an example of the type of data
dput(df)
structure(list(Id = c("'110171786'", "'1103fbfd5'", "'0700edf6dc'",
"'1103fad09'", "'01103fc9bb'", "''", "''", "0000fba2b'", "'01103fb169'",
"'01103fd723'", "'01103f9c34'", "''", "''", "''", "'01103fc088'",
"'01103fa6d8'", "'01103fb374'", "'01103fce8c'", "'01103f955d'",
"'011016e633'", "'01103fa0da'", "''", "''", "''", "'01103fa4bd'",
"'01103fb5c4'", "'01103fd0d7'", "'01103f9e2e'", "'01103fc657'",
"'01103fd4d1'", "'011016e78e'", "'01103fbda2'", "'01103fbae7'",
"'011016ee23'", "'01103fc847'", "'01103fbfbb'", "''", "'01103fb8bb'",
"'01103fc853'", "''", "'01103fbcd5'", "'011016e690'", "'01103fb253'",
"'01103fcb19'", "'01103fb446'", "'01103fa4fa'", "'011016cfbd'",
"'01103fd250'", "'01103fac7d'", "'011016a86e'"), Weight = c(11.5,
11.3, 11.3, 10.6, 10.6, 8.9, 18.7, 10.9, 11.3, 18.9, 18.9, 8.6,
8.8, 8.4, 11, 10.4, 10.4, 10.8, 11.2, 11, 10.3, 9.5, 8.1, 9.3,
10.2, 10.5, 11.2, 21.9, 18, 17.8, 11.3, 11.5, 10.8, 10.5, 12.8,
10.9, 8.9, 10.3, 10.8, 8.9, 10.9, 9.9, 19, 11.6, 11.3, 11.7,
10.9, 12.1, 11.3, 10.6)), class = "data.frame", row.names = c(NA,
-50L))
>
What I would like to do is search through the id column and replace the following mistakes
some of the values have a zero missing off the front, all of these would start with a 1 now instead which makes finding them easily. So basically anything that has a character length of 9 and starts with a 1 needs a 0 as the first character.
some of the values are less than 10 characters long, these need to be removed.
some have more than one leading 0 and these need to be removed.
# Restore the dropped leading zero: match an optional opening quote (group 1)
# followed by "1", any 8 characters and the closing quote (group 2), and
# rebuild the value as group 1 + "0" + group 2. In the replacement, "\\10" is
# backreference 1 followed by a literal 0, as the printed result shows.
df$Id <- gsub("^('?)(1.{8}')$", "\\10\\2", df$Id)
# Drop rows whose Id (after an optional opening quote) starts with "00", or
# is empty, i.e. the opening quote is immediately followed by the closing one.
# NOTE(review): this does not test the length in general; other short Ids
# would be kept -- confirm that none occur in the real data.
df[ !grepl("^'?(00|'$)", df$Id),]
# Id Weight
# 1 '0110171786' 11.5
# 2 '01103fbfd5' 11.3
# 3 '0700edf6dc' 11.3
# 4 '01103fad09' 10.6
# 5 '01103fc9bb' 10.6
# 9 '01103fb169' 11.3
# 10 '01103fd723' 18.9
# 11 '01103f9c34' 18.9
# 15 '01103fc088' 11.0
# 16 '01103fa6d8' 10.4
# 17 '01103fb374' 10.4
# 18 '01103fce8c' 10.8
# 19 '01103f955d' 11.2
# 20 '011016e633' 11.0
# 21 '01103fa0da' 10.3
# 25 '01103fa4bd' 10.2
# 26 '01103fb5c4' 10.5
# 27 '01103fd0d7' 11.2
# 28 '01103f9e2e' 21.9
# 29 '01103fc657' 18.0
# 30 '01103fd4d1' 17.8
# 31 '011016e78e' 11.3
# 32 '01103fbda2' 11.5
# 33 '01103fbae7' 10.8
# 34 '011016ee23' 10.5
# 35 '01103fc847' 12.8
# 36 '01103fbfbb' 10.9
# 38 '01103fb8bb' 10.3
# 39 '01103fc853' 10.8
# 41 '01103fbcd5' 10.9
# 42 '011016e690' 9.9
# 43 '01103fb253' 19.0
# 44 '01103fcb19' 11.6
# 45 '01103fb446' 11.3
# 46 '01103fa4fa' 11.7
# 47 '011016cfbd' 10.9
# 48 '01103fd250' 12.1
# 49 '01103fac7d' 11.3
# 50 '011016a86e' 10.6

Group columns in Rmarkdown

I am trying to produce a grouped table, something like this. However, I cannot figure out how to do it when my groups are months, each of which consists of several weather variables, while the rows are years.
This is how the data looks like:
year_var month temp rain rhum
<dbl> <dbl> <dbl> <dbl> <dbl>
1 2007 4 11 16.4 73.5
2 2007 5 11.6 38.3 74.3
3 2007 6 14 108. 83.9
4 2007 7 14.4 129. 83.6
5 2007 8 14.9 104. 83.2
6 2007 9 13.5 36.8 82.7
7 2008 4 7.8 31.8 76.4
8 2008 5 12.7 37.1 76.6
9 2008 6 13.3 86.8 77.7
10 2008 7 15.2 137. 80.4
11 2008 8 15.3 142. 83.1
12 2008 9 12.4 81.2 84.9
Is there a way to turn the month column into grouping variable?
A sample of my data:
structure(list(year_var = c(2007, 2007, 2007, 2007, 2007, 2007,
2008, 2008, 2008, 2008, 2008, 2008), month = c(4, 5, 6, 7, 8,
9, 4, 5, 6, 7, 8, 9), temp = c(11, 11.6, 14, 14.4, 14.9, 13.5,
7.8, 12.7, 13.3, 15.2, 15.3, 12.4), rain = c(16.4, 38.3, 107.7,
129.3, 103.8, 36.8, 31.8, 37.1, 86.8, 136.7, 142.4, 81.2), rhum = c(73.5,
74.3, 83.9, 83.6, 83.2, 82.7, 76.4, 76.6, 77.7, 80.4, 83.1, 84.9
)), row.names = c(NA, -12L), class = c("grouped_df", "tbl_df",
"tbl", "data.frame"), vars = "year_var", drop = TRUE, indices = list(
0:5, 6:11), group_sizes = c(6L, 6L), biggest_group_size = 6L, labels = structure(list(
year_var = c(2007, 2008)), row.names = c(NA, -2L), class = "data.frame", vars = "year_var", drop = TRUE))

Resources