Related
I have some data which looks like:
# A tibble: 52 × 3
provincia mean_price number_properties
<chr> <dbl> <int>
1 A Coruña 179833. 2508
2 Albacete 148584. 1311
3 Alicante 418148. 22676
4 Almería 142338. 3902
5 Araba - Álava 243705. 786
I am trying to compute the ntiles of the data which I can do.
df %>%
mutate(
bins = ntile(mean_price, 5)
)
So, I want to bin the data into 5 groups. However, I also would like to extract the cutoff points that were used to create each bin. I know the quantiles function would get me what I wanted but I have to pass it 52 as an input parameter (number of observations I have)- However, I would like to have just 5 groups.
Code:
x = df %>%
mutate(
bins = ntile(mean_price, 52),
bins_cutpoints = quantile(mean_price, probs = seq(0, 1, length.out = 52), na.rm = TRUE)
)
How can I get the values for the cutoffs from the mean_price column for each ntile?
Data:
df <- structure(list(provincia = c("A Coruña", "Albacete", "Alicante",
"Almería", "Araba - Álava", "Asturias", "Badajoz", "Barcelona",
"Bizkaia", "Burgos", "Cantabria", "Castellón", "Ceuta", "Ciudad Real",
"Cuenca", "Cáceres", "Cádiz", "Córdoba", "Gipuzkoa", "Girona",
"Granada", "Guadalajara", "Huelva", "Huesca", "Illes Balears",
"Jaén", "La Rioja", "Las Palmas", "León", "Lleida", "Lugo",
"Madrid", "Melilla", "Murcia", "Málaga", "Navarra", "Ourense",
"Palencia", "Pontevedra", "Salamanca", "Santa Cruz de Tenerife",
"Segovia", "Sevilla", "Soria", "Tarragona", "Teruel", "Toledo",
"Valencia", "Valladolid", "Zamora", "Zaragoza", "Ávila"), mean_price = c(179833.167862839,
148583.87109077, 418148.151437643, 142337.792926704, 243704.750636132,
149179.732438607, 147802.894486692, 462813.775190776, 286461.604484305,
134635.540239044, 201214.437926878, 139819.15323646, 204318.181818182,
113856.401715511, 116992.998609179, 143137.441025641, 251270.65520481,
166586.746650426, 340663.440746753, 365733.360551724, 219325.194605466,
173221.569037657, 142430.992912371, 141754.603535354, 749037.540922619,
131396.292428198, 160838.235963581, 656553.777123633, 140312.919506463,
160485.155614973, 129787.667711599, 433665.652781242, 234306.12244898,
188635.237652749, 687127.622056842, 197388.050991501, 137829.640394089,
123450.853813559, 230344.604904632, 159999.046077894, 328912.419004007,
184829.536144578, 180692.511036468, 155759.235955056, 193651.956693536,
128909.006756757, 132623.629553967, 177825.529404212, 165240.297002725,
116820.125531915, 164934.932635983, 124977.724215247), number_properties = c(2508L,
1311L, 22676L, 3902L, 786L, 3502L, 2104L, 35906L, 3345L, 1255L,
2489L, 4542L, 44L, 1399L, 719L, 1170L, 5322L, 5747L, 1232L, 7250L,
16906L, 956L, 1552L, 396L, 5376L, 1915L, 1318L, 2378L, 1702L,
1870L, 638L, 16036L, 49L, 7856L, 15587L, 706L, 1421L, 472L, 2569L,
1823L, 3494L, 498L, 6252L, 89L, 6327L, 296L, 2937L, 15576L, 1468L,
470L, 2390L, 446L)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-52L))
We can use quantile output within cut as mentioned in the comments
library(dplyr)
df %>%
mutate(bins = ntile(mean_price, 52),
bins_cutpoints = quantile(mean_price,
probs = seq(0, 1, length.out = 52), na.rm = TRUE),
cut_bins = cut(mean_price, breaks = bins_cutpoints) )
-output
# A tibble: 52 × 6
provincia mean_price number_properties bins bins_cutpoints cut_bins
<chr> <dbl> <int> <int> <dbl> <fct>
1 A Coruña 179833. 2508 30 113856. (1.778e+05,1.798e+05]
2 Albacete 148584. 1311 19 116820. (1.478e+05,1.486e+05]
3 Alicante 418148. 22676 47 116993. (3.657e+05,4.181e+05]
4 Almería 142338. 3902 15 123451. (1.418e+05,1.423e+05]
5 Araba - Álava 243705. 786 41 124978. (2.343e+05,2.437e+05]
6 Asturias 149180. 3502 20 128909. (1.486e+05,1.492e+05]
7 Badajoz 147803. 2104 18 129788. (1.431e+05,1.478e+05]
8 Barcelona 462814. 35906 49 131396. (4.337e+05,4.628e+05]
9 Bizkaia 286462. 3345 43 132624. (2.513e+05,2.865e+05]
10 Burgos 134636. 1255 10 134636. (1.326e+05,1.346e+05]
# … with 42 more rows
Compute the quintiles then use cut:
x <- quantile(df$mean_price, seq(0, 1, 0.2))
# 0% 20% 40% 60% 80% 100%
# 113856.4 138227.5 157455.2 183174.7 249757.5 749037.5
df <- df %>%
mutate(
bins = ntile(mean_price, 5),
grp = cut(mean_price, breaks = x))
table(df$bins)
# 1 2 3 4 5
# 11 11 10 10 10
table(df$grp)
# (1.14e+05,1.38e+05] (1.38e+05,1.57e+05] (1.57e+05,1.83e+05] (1.83e+05,2.5e+05]
# 10 10 10 10
# (2.5e+05,7.49e+05]
# 11
This is how my data looks like with columns matching a gen ID and a timepoint (_01 or _03)
id sample_name placa_ppard_01 placa_pparg_01 placa_nr1h3_01
1 50109018 A2535500018000006001 17.17521 24.10186 22.05157
2 50109019 A2535500019000006001 17.17521 24.10186 22.05157
3 50109025 A2535500025000006001 17.17521 24.10186 22.05157
4 50109026 A2535500026000006001 17.17521 24.10186 22.05157
5 50109027 A2535500027000006001 17.33383 24.38065 21.98426
6 50118001 A2536300001000006001 17.17521 24.10186 22.05157
placa_nr1h2_01 placa_ldlr_03 placa_pcsk9_03 placa_olr1_03 placa_msr1_03
1 18.41557 18.89107 NA 23.93779 20.58086
2 18.41557 18.89107 NA 23.93779 20.58086
3 18.41557 18.89107 NA 23.93779 20.58086
4 18.41557 18.89107 NA 23.93779 20.58086
5 18.37935 19.12857 NA 23.22024 20.36100
6 18.41557 18.89107 NA 23.93779 20.58086
### a small piece of data
df <- structure(list(id = structure(c(130102009, 50203017, 50508027,
140102087, 70201047, 50109027, 130102012, 120715030, 130106034,
120715037, 60901020, 60901023, 140102092, 50203030, 50203004), format.spss = "F10.0", display_width = 10L),
placa_rxrb_01 = structure(c(17.3644166666667, 17.4306458333333,
17.2125, 17.4306458333333, 16.8459791666667, 17.3644166666667,
17.4306458333333, 17.3336875, 17.4306458333333, 17.4459166666667,
17.3644166666667, 17.3644166666667, 17.3336875, 17.2125,
17.4306458333333), label = "lab_exp_genica: ct_cal_placa_RXRB Hs00232774_m1 Ct basal", format.spss = "F10.7", display_width = 10L),
placa_cyp27a1_01 = structure(c(18.1048043478261, 18.0184893617021,
17.9057708333333, 18.0184893617021, 17.6525625, 18.1048043478261,
18.0184893617021, 17.7276875, 18.0184893617021, 17.5819565217391,
18.1048043478261, 18.1048043478261, 17.7276875, 17.9057708333333,
18.0184893617021), label = "lab_exp_genica: ct_cal_placa_CYP27A1 Hs01017992_g1 Ct basal", format.spss = "F11.8", display_width = 11L),
placa_abca1_01 = structure(c(18.0373958333333, 17.7479791666667,
17.9839791666667, 17.7479791666667, 17.7181041666667, 18.0373958333333,
17.7479791666667, 17.8329583333333, 17.7479791666667, 17.9318260869565,
18.0373958333333, 18.0373958333333, 17.8329583333333, 17.9839791666667,
17.7479791666667), label = "lab_exp_genica: ct_cal_placa_ABCA1 Hs01059101_m1 Ct basal", format.spss = "F11.8", display_width = 11L),
placa_scarb1_01 = structure(c(NA, 20.2697659574468, 20.5369583333333,
20.2697659574468, 20.1755208333333, 20.6629111111111, 20.2697659574468,
20.471829787234, 20.2697659574468, 20.2979166666667, 20.6629111111111,
20.6629111111111, 20.471829787234, 20.5369583333333, 20.2697659574468
), label = "lab_exp_genica: ct_cal_placa_SCARB1 Hs00969821_m1 Ct basal", format.spss = "F10.7", display_width = 10L),
placa_cav1_01 = structure(c(23.9575208333333, 23.5285625,
24.01425, 23.5285625, 23.1588541666667, 23.9575208333333,
23.5285625, 23.5898125, 23.5285625, 23.513875, 23.9575208333333,
23.9575208333333, 23.5898125, 24.01425, 23.5285625), label = "lab_exp_genica: ct_cal_placa_CAV1 Hs00971716_m1 Ct basal", format.spss = "F10.7", display_width = 10L),
placa_nfkb1_01 = structure(c(16.8749583333333, 16.6195, 16.7741041666667,
16.6195, 16.1903125, 16.8749583333333, 16.6195, 16.608375,
16.6195, NA, 16.8749583333333, 16.8749583333333, 16.608375,
16.7741041666667, 16.6195), label = "lab_exp_genica: ct_cal_placa_NFKB1 Hs00765730_m1 Ct basal", format.spss = "F10.7", display_width = 10L),
placa_tgfb2_03 = structure(c(24.0563043478261, 23.6882553191489,
24.1700416666667, 23.6882553191489, 23.4935416666667, 24.0563043478261,
23.6882553191489, 24.02575, 23.6882553191489, 23.7865208333333,
24.0563043478261, 24.0563043478261, 24.02575, 24.1700416666667,
23.6882553191489), label = "lab_exp_genica: ct_cal_placa_TGFB2 Hs00234244_m1 Ct 1 year", format.spss = "F11.8", display_width = 11L),
placa_ido_03 = structure(c(19.5457916666667, 19.4714375,
19.6080212765957, 19.4714375, 18.3972708333333, 19.5457916666667,
19.4714375, 19.4884166666667, 19.4714375, 19.070447368421,
19.5457916666667, 19.5457916666667, 19.4884166666667, 19.6080212765957,
19.4714375), label = "lab_exp_genica: ct_cal_placa_IDO Hs00984148_m1 Ct 1 year", format.spss = "F11.8", display_width = 11L),
placa_cdkn2a_03 = structure(c(21.1584583333333, 21.0697291666667,
21.1403541666667, 21.0697291666667, 20.7684680851064, 21.1584583333333,
21.0697291666667, 21.1548125, 21.0697291666667, 21.0899189189189,
21.1584583333333, 21.1584583333333, 21.1548125, 21.1403541666667,
21.0697291666667), label = "lab_exp_genica: ct_cal_placa_CDKN2A Hs00923894_m1 Ct 1 year", format.spss = "F11.8", display_width = 11L),
placa_abcg1_03 = structure(c(18.7376808510638, 18.3815957446809,
18.6998333333333, 18.3815957446809, 18.1750833333333, 18.7376808510638,
18.3815957446809, 18.5114893617021, 18.3815957446809, 18.3424375,
18.7376808510638, 18.7376808510638, 18.5114893617021, 18.6998333333333,
18.3815957446809), label = "lab_exp_genica: ct_cal_placa_ABCG1 Hs00245154_m1 Ct 1 year", format.spss = "F10.7", display_width = 10L),
placa_abcg4_03 = structure(c(NA, NA, NA, NA, NA, NA, NA,
NA, NA, 27.677, NA, NA, NA, NA, NA), label = "lab_exp_genica: ct_cal_placa_ABCG4 Hs00223446_m1 Ct 1 year", format.spss = "F6.3"),
placa_nfe2l2_03 = structure(c(17.2223404255319, 16.8399787234043,
17.1038541666667, 16.8399787234043, 16.5560208333333, 17.2223404255319,
16.8399787234043, 16.9331063829787, 16.8399787234043, 17.1034166666667,
17.2223404255319, 17.2223404255319, 16.9331063829787, 17.1038541666667,
16.8399787234043), label = "lab_exp_genica: ct_cal_placa_NFE2L2 Hs00975961_g1 Ct 1 year", format.spss = "F10.7", display_width = 10L)), row.names = c(NA,
-15L), class = c("tbl_df", "tbl", "data.frame"))
As you can see the pattern is placa_gen_time. I want something like this:
gen_cal cal01 cal03
ppard 17.17521 NA
pparg 24.10186 NA
olr1 NA 23.93779
msr1 NA 20.58086
I guess it could be done in more than one step with sthg like this (genes2 my database). But I am pretty sure that pivot_longer function allows to transform the colnames into 2 columns for values and one column for the name (=genes)
genes2 <- genes2 %>% pivot_longer(cols = matches("placa_"), names_to = c("gen_cal", "time_cal"), names_sep = "_0",values_to = "Ct_cal")
> head
gen_cal time_cal Ct_cal
<chr> <chr> <dbl>
1 ppara 1 19.7
2 ppard 1 17.2
3 pparg 1 24.1
4 nr1h3 1 22.1
5 nr1h2 1 18.4
6 rxra 1 14.3
# And then pivot_wider the column time and Ct_cal into cal01 and cal03
We need to reshape twice, wide-to-long, split string, then reshape it again long-to-wide, using data.table:
library(data.table)
setDT(df)
dcast(
melt(df, id.vars = "id")[, c("gene", "time") := tstrsplit(variable, split = "_", fixed = TRUE, keep = 2:3)],
id + gene ~ time)
# id gene 01 03
# 1: 50109027 abca1 18.03740 NA
# 2: 50109027 abcg1 NA 18.73768
# 3: 50109027 abcg4 NA NA
# 4: 50109027 cav1 23.95752 NA
# 5: 50109027 cdkn2a NA 21.15846
# ---
# 176: 140102092 nfe2l2 NA 16.93311
# 177: 140102092 nfkb1 16.60837 NA
# 178: 140102092 rxrb 17.33369 NA
# 179: 140102092 scarb1 20.47183 NA
# 180: 140102092 tgfb2 NA 24.02575
Seems like you are looking for .value in the names_to argument.
library(tidyverse)
df %>%
pivot_longer(starts_with("placa"), names_to = c("gene", ".value"),
names_pattern = "placa_(.*)_(.*)") %>%
setNames(c("id", "gene", "cal01", "cal03"))
Update
If we do not use setNames after the above pivot_longer, the pivoted column names would only contain numbers, which is not allowed in base R data frame (no problem for tibble).
# A tibble: 180 × 4
id gene `01` `03`
<dbl> <chr> <dbl> <dbl>
1 130102009 rxrb 17.4 NA
A workaround for the column names is to rename them prior to pivot_longer to attach the string "cal" before the "0".
df %>%
rename_with(.cols = contains("_0"), ~sub("_0", "_cal0", .x)) %>%
pivot_longer(starts_with("placa"), names_to = c("gene", ".value"),
names_pattern = "placa_(.*)_(.*)")
Output
# A tibble: 180 × 4
id gene cal01 cal03
<dbl> <chr> <dbl> <dbl>
1 130102009 rxrb 17.4 NA
2 130102009 cyp27a1 18.1 NA
3 130102009 abca1 18.0 NA
4 130102009 scarb1 NA NA
5 130102009 cav1 24.0 NA
6 130102009 nfkb1 16.9 NA
7 130102009 tgfb2 NA 24.1
8 130102009 ido NA 19.5
9 130102009 cdkn2a NA 21.2
10 130102009 abcg1 NA 18.7
# … with 170 more rows
# ℹ Use `print(n = ...)` to see more rows
library(tidyverse)
library(sf)
#> Linking to GEOS 3.8.0, GDAL 3.0.4, PROJ 6.3.1
I have some data where I would like to calculate the distances between each point (station) along defined paths.
dat <-
structure(
list(
name = c(
"Untitled Path",
"St34B",
"St35N",
"St36F",
"St37N",
"St38B",
"Untitled Path",
"St39N"
),
description = c(
NA_character_,
NA_character_,
NA_character_,
NA_character_,
NA_character_,
NA_character_,
NA_character_,
NA_character_
),
timestamp = structure(
c(
NA_real_,
NA_real_,
NA_real_,
NA_real_,
NA_real_,
NA_real_,
NA_real_,
NA_real_
),
class = c("POSIXct", "POSIXt"),
tzone = ""
),
begin = structure(
c(
NA_real_,
NA_real_,
NA_real_,
NA_real_,
NA_real_,
NA_real_,
NA_real_,
NA_real_
),
class = c("POSIXct", "POSIXt"),
tzone = ""
),
end = structure(
c(
NA_real_,
NA_real_,
NA_real_,
NA_real_,
NA_real_,
NA_real_,
NA_real_,
NA_real_
),
class = c("POSIXct", "POSIXt"),
tzone = ""
),
altitude_mode = c(
NA_character_,
NA_character_,
NA_character_,
NA_character_,
NA_character_,
NA_character_,
NA_character_,
NA_character_
),
tessellate = c(
1L, -1L, -1L, -1L,
-1L, -1L, 1L, -1L
),
extrude = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
visibility = c(-1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L),
draw_order = c(
NA_integer_,
NA_integer_,
NA_integer_,
NA_integer_,
NA_integer_,
NA_integer_,
NA_integer_,
NA_integer_
),
icon = c(
NA_character_,
NA_character_,
NA_character_,
NA_character_,
NA_character_,
NA_character_,
NA_character_,
NA_character_
),
geometry = structure(
list(
structure(
c(
-213231.809501996,
-205487.607705256,
-784028.913066238,
-708301.049327739
),
.Dim = c(
2L,
2L
),
class = c("XY", "LINESTRING", "sfg")
),
structure(
c(
-213529.323058115,
-785232.982945769
),
class = c("XY", "POINT", "sfg")
),
structure(
c(
-212176.423266777,
-773238.391709674
),
class = c("XY", "POINT", "sfg")
),
structure(
c(
-210268.431741568,
-756818.73172344
),
class = c("XY", "POINT", "sfg")
),
structure(
c(
-208050.517190725,
-737973.862632309
),
class = c("XY", "POINT", "sfg")
),
structure(
c(
-206040.836893304,
-709783.744787448
),
class = c("XY", "POINT", "sfg")
),
structure(
c(
-204426.676405507,
-160265.400475699,
-708310.127055397,
-727750.877479657
),
.Dim = c(
2L,
2L
),
class = c("XY", "LINESTRING", "sfg")
),
structure(
c(
-179260.597288432,
-718361.477655825
),
class = c("XY", "POINT", "sfg")
)
),
n_empty = 0L,
crs = structure(
list(input = "EPSG:3411", wkt = "PROJCRS[\"NSIDC Sea Ice Polar Stereographic North\",\n BASEGEOGCRS[\"Unspecified datum based upon the Hughes 1980 ellipsoid\",\n DATUM[\"Not specified (based on Hughes 1980 ellipsoid)\",\n ELLIPSOID[\"Hughes 1980\",6378273,298.279411123064,\n LENGTHUNIT[\"metre\",1]]],\n PRIMEM[\"Greenwich\",0,\n ANGLEUNIT[\"degree\",0.0174532925199433]],\n ID[\"EPSG\",4054]],\n CONVERSION[\"US NSIDC Sea Ice polar stereographic north\",\n METHOD[\"Polar Stereographic (variant B)\",\n ID[\"EPSG\",9829]],\n PARAMETER[\"Latitude of standard parallel\",70,\n ANGLEUNIT[\"degree\",0.0174532925199433],\n ID[\"EPSG\",8832]],\n PARAMETER[\"Longitude of origin\",-45,\n ANGLEUNIT[\"degree\",0.0174532925199433],\n ID[\"EPSG\",8833]],\n PARAMETER[\"False easting\",0,\n LENGTHUNIT[\"metre\",1],\n ID[\"EPSG\",8806]],\n PARAMETER[\"False northing\",0,\n LENGTHUNIT[\"metre\",1],\n ID[\"EPSG\",8807]]],\n CS[Cartesian,2],\n AXIS[\"easting (X)\",south,\n MERIDIAN[45,\n ANGLEUNIT[\"degree\",0.0174532925199433]],\n ORDER[1],\n LENGTHUNIT[\"metre\",1]],\n AXIS[\"northing (Y)\",south,\n MERIDIAN[135,\n ANGLEUNIT[\"degree\",0.0174532925199433]],\n ORDER[2],\n LENGTHUNIT[\"metre\",1]],\n USAGE[\n SCOPE[\"unknown\"],\n AREA[\"World - N hemisphere - north of 60°N\"],\n BBOX[60,-180,90,180]],\n ID[\"EPSG\",3411]]"),
class = "crs"
),
class = c(
"sfc_GEOMETRY",
"sfc"
),
precision = 0,
bbox = structure(
c(
xmin = -213529.323058115,
ymin = -785232.982945769,
xmax = -160265.400475699,
ymax = -708301.049327739
),
class = "bbox"
),
classes = c(
"LINESTRING",
"POINT",
"POINT",
"POINT",
"POINT",
"POINT",
"LINESTRING",
"POINT"
)
)
),
row.names = c(
NA,
8L
),
sf_column = "geometry",
agr = structure(
c(
name = NA_integer_,
description = NA_integer_,
timestamp = NA_integer_,
begin = NA_integer_,
end = NA_integer_,
altitude_mode = NA_integer_,
tessellate = NA_integer_,
extrude = NA_integer_,
visibility = NA_integer_,
draw_order = NA_integer_,
icon = NA_integer_
),
class = "factor",
.Label = c(
"constant",
"aggregate", "identity"
)
),
class = c("sf", "data.frame")
)
dat
#> Simple feature collection with 8 features and 11 fields
#> Geometry type: GEOMETRY
#> Dimension: XY
#> Bounding box: xmin: -213529.3 ymin: -785233 xmax: -160265.4 ymax: -708301
#> Projected CRS: NSIDC Sea Ice Polar Stereographic North
#> name description timestamp begin end altitude_mode tessellate
#> 1 Untitled Path <NA> <NA> <NA> <NA> <NA> 1
#> 2 St34B <NA> <NA> <NA> <NA> <NA> -1
#> 3 St35N <NA> <NA> <NA> <NA> <NA> -1
#> 4 St36F <NA> <NA> <NA> <NA> <NA> -1
#> 5 St37N <NA> <NA> <NA> <NA> <NA> -1
#> 6 St38B <NA> <NA> <NA> <NA> <NA> -1
#> 7 Untitled Path <NA> <NA> <NA> <NA> <NA> 1
#> 8 St39N <NA> <NA> <NA> <NA> <NA> -1
#> extrude visibility draw_order icon geometry
#> 1 0 -1 NA <NA> LINESTRING (-213231.8 -7840...
#> 2 0 -1 NA <NA> POINT (-213529.3 -785233)
#> 3 0 -1 NA <NA> POINT (-212176.4 -773238.4)
#> 4 0 -1 NA <NA> POINT (-210268.4 -756818.7)
#> 5 0 -1 NA <NA> POINT (-208050.5 -737973.9)
#> 6 0 -1 NA <NA> POINT (-206040.8 -709783.7)
#> 7 0 -1 NA <NA> LINESTRING (-204426.7 -7083...
#> 8 0 -1 NA <NA> POINT (-179260.6 -718361.5)
ggplot() +
geom_sf(data = dat) +
geom_sf_text(
data = dat,
aes(label = name),
size = 3,
hjust = 0
)
I would like to calculate the distance between stations 34 - 35 - … - 39
but along the path (station numbers determine the order).The first problems
I see is that the lines (paths) are not connected and the stations are not
connected to the lines.
I first tried to extract the paths and the stations:
stations <- dat %>%
filter(str_starts(name, "St"))
paths <- dat %>%
filter(str_starts(name, "Untitled"))
ggplot() +
geom_sf(data = paths, color = "red") +
geom_sf(data = stations, color = "blue") +
geom_sf_text(
data = stations,
aes(label = name),
color = "blue",
size = 3,
hjust = 0
)
I am stuck on the next steps. I first tried to merge the lines and then
snap the points to the closest line using st_snap() without success. Any
help is appreciated.
Created on 2021-12-01 by the reprex package (v2.0.1)
Please find a detailed reprex that provides a solution to your request using the sf, sfnetworks, units, dplyr and ggplot2 libraries.
Reprex
STEP 1: Create a 'sfnetworks' object only based 'on connected lines(i.e.edges)
library(sf)
library(units)
library(sfnetworks)
options(sfn_max_print_active = 15, sfn_max_print_inactive = 15)
library(dplyr)
library(ggplot2)
network <- dat %>%
filter(st_geometry_type(.) == "LINESTRING") %>% # selects only the lines from 'sf' object 'dat'
st_snap(.,., tolerance = 10000) %>% # coerces the snapping using a big tolerance value!
as_sfnetwork() # creates the network
autoplot(network)
STEP 2: Create a 'sf' object with only points (i.e. nodes)
nodes <- dat %>%
filter(st_geometry_type(.) == "POINT")
nodes
#> Simple feature collection with 6 features and 11 fields
#> Geometry type: POINT
#> Dimension: XY
#> Bounding box: xmin: -213529.3 ymin: -785233 xmax: -179260.6 ymax: -709783.7
#> Projected CRS: NSIDC Sea Ice Polar Stereographic North
#> name description timestamp begin end altitude_mode tessellate extrude
#> 1 St34B <NA> <NA> <NA> <NA> <NA> -1 0
#> 2 St35N <NA> <NA> <NA> <NA> <NA> -1 0
#> 3 St36F <NA> <NA> <NA> <NA> <NA> -1 0
#> 4 St37N <NA> <NA> <NA> <NA> <NA> -1 0
#> 5 St38B <NA> <NA> <NA> <NA> <NA> -1 0
#> 6 St39N <NA> <NA> <NA> <NA> <NA> -1 0
#> visibility draw_order icon geometry
#> 1 -1 NA <NA> POINT (-213529.3 -785233)
#> 2 -1 NA <NA> POINT (-212176.4 -773238.4)
#> 3 -1 NA <NA> POINT (-210268.4 -756818.7)
#> 4 -1 NA <NA> POINT (-208050.5 -737973.9)
#> 5 -1 NA <NA> POINT (-206040.8 -709783.7)
#> 6 -1 NA <NA> POINT (-179260.6 -718361.5)
STEP 3: Add the nodes of the 'sf' object into the 'network'
1. Code
new_network <- network %>%
st_network_blend(., nodes, tolerance = 10000) %>% # snap the nodes on the network based on the given tolerance
filter(.,!is.na(name)) %>% # keeps only the nodes from the 'sf' object 'nodes'
st_as_sf %>% # convert into sf object (mandatory step for the next one to work properly)
as_sfnetwork(., edges_as_lines = TRUE) # reconstructs the network only with the nodes from the 'sf' object 'nodes'
#> Warning: st_network_blend assumes attributes are constant over geometries
2. Specifications of the network
new_network
#> # A sfnetwork with 6 nodes and 5 edges
#> #
#> # CRS: EPSG:3411
#> #
#> # A rooted tree with spatially explicit edges
#> #
#> # Node Data: 6 x 12 (active)
#> # Geometry type: POINT
#> # Dimension: XY
#> # Bounding box: xmin: -213231.8 ymin: -784028.9 xmax: -179639.4 ymax:
#> # -709824.4
#> name description timestamp begin end
#> <chr> <chr> <dttm> <dttm> <dttm>
#> 1 St34B <NA> NA NA NA
#> 2 St35N <NA> NA NA NA
#> 3 St36F <NA> NA NA NA
#> 4 St37N <NA> NA NA NA
#> 5 St38B <NA> NA NA NA
#> 6 St39N <NA> NA NA NA
#> # ... with 7 more variables: altitude_mode <chr>, tessellate <int>,
#> # extrude <int>, visibility <int>, draw_order <int>, icon <chr>,
#> # geometry <POINT [m]>
#> #
#> # Edge Data: 5 x 3
#> # Geometry type: LINESTRING
#> # Dimension: XY
#> # Bounding box: xmin: -213231.8 ymin: -784028.9 xmax: -179639.4 ymax:
#> # -709824.4
#> from to geometry
#> <int> <int> <LINESTRING [m]>
#> 1 1 2 (-213231.8 -784028.9, -212128.8 -773243.3)
#> 2 2 3 (-212128.8 -773243.3, -210447.3 -756800.4)
#> 3 3 4 (-210447.3 -756800.4, -208517.2 -737926.1)
#> 4 4 5 (-208517.2 -737926.1, -205643.4 -709824.4)
#> 5 5 6 (-205643.4 -709824.4, -179639.4 -719222)
3. Visualization of the network
# option 1 with autoplot:
autoplot(new_network) +
geom_sf_text(
data = st_as_sf(new_network),
aes(label = name),
size = 3,
hjust = 0
)
# if you prefer, option 2 with only ggplot:
ggplot() +
geom_sf(data = st_as_sf(new_network, "edges"), col = "grey50") +
geom_sf(data = st_as_sf(new_network, "nodes")) +
geom_sf_text(
data = st_as_sf(new_network),
aes(label = name),
size = 3,
hjust = 0
)
STEP 4: Computes the length of edges between each node along the network and creates the dataframe distances (i.e. tibble class)
distances <- new_network %>%
activate("edges") %>%
mutate(length = set_units(edge_length(),km)) %>%
st_as_sf() %>%
st_drop_geometry
distances
#> # A tibble: 5 x 3
#> from to length
#> * <int> <int> [km]
#> 1 1 2 10.8
#> 2 2 3 16.5
#> 3 3 4 19.0
#> 4 4 5 28.2
#> 5 5 6 27.6
STEP 5: Replace ids of columns "from" and "to" of the distances dataframe by the names of nodes
1. Extract names of nodes and map them to the id's of distances dataframe
names_id <- new_network %>%
activate("nodes") %>%
st_as_sf() %>%
mutate(ID = seq(name)) %>%
select(., c("ID", "name")) %>%
st_drop_geometry
names_id
#> # A tibble: 6 x 2
#> ID name
#> * <int> <chr>
#> 1 1 St34B
#> 2 2 St35N
#> 3 3 St36F
#> 4 4 St37N
#> 5 5 St38B
#> 6 6 St39N
2. Modify the dataframe distances to get the names of nodes in 'from' and 'to' columns using two left_join()
distances <- left_join(distances, names_id, by = c("from" = "ID")) %>%
mutate(from = name) %>%
select(-name) %>%
left_join(., names_id, by = c("to" = "ID")) %>%
mutate(to = name) %>%
select(-name)
3. Final output
distances
#> # A tibble: 5 x 3
#> from to length
#> <chr> <chr> [km]
#> 1 St34B St35N 10.8
#> 2 St35N St36F 16.5
#> 3 St36F St37N 19.0
#> 4 St37N St38B 28.2
#> 5 St38B St39N 27.6
Created on 2021-12-06 by the reprex package (v2.0.1)
I want to calculate the weighted variance using the weights provided in the dataset, while group for the countries and cities, however the function returns NAs:
library(Hmisc) #for the 'wtd.var' function
weather_winter.std<-weather_winter %>%
group_by(country, capital_city) %>%
summarise(across(starts_with("winter"),wtd.var))
The provided output from the console (when in long format):
# A tibble: 35 x 3
# Groups: country [35]
country capital_city winter
<chr> <chr> <dbl>
1 ALBANIA Tirane NA
2 AUSTRIA Vienna NA
3 BELGIUM Brussels NA
4 BULGARIA Sofia NA
5 CROATIA Zagreb NA
6 CYPRUS Nicosia NA
7 CZECHIA Prague NA
8 DENMARK Copenhagen NA
9 ESTONIA Tallinn NA
10 FINLAND Helsinki NA
# … with 25 more rows
This is the code that I used to get the data from a wide format into a long format:
weather_winter <- weather_winter %>% pivot_longer(-c(31:33))
weather_winter$name <- NULL
names(weather_winter)[4] <- "winter"
Some example data:
structure(list(`dec-wet_2011` = c(12.6199998855591, 12.6099996566772,
14.75, 11.6899995803833, 18.2899990081787), `dec-wet_2012` = c(13.6300001144409,
14.2199993133545, 14.2299995422363, 16.1000003814697, 18.0299987792969
), `dec-wet_2013` = c(4.67999982833862, 5.17000007629395, 4.86999988555908,
7.56999969482422, 5.96000003814697), `dec-wet_2014` = c(14.2999992370605,
14.4799995422363, 13.9799995422363, 15.1499996185303, 16.1599998474121
), `dec-wet_2015` = c(0.429999977350235, 0.329999983310699, 1.92999994754791,
3.30999994277954, 7.42999982833862), `dec-wet_2016` = c(1.75,
1.29999995231628, 3.25999999046326, 6.60999965667725, 8.67999935150146
), `dec-wet_2017` = c(13.3400001525879, 13.3499994277954, 15.960000038147,
10.6599998474121, 14.4699993133545), `dec-wet_2018` = c(12.210000038147,
12.4399995803833, 11.1799993515015, 10.75, 18.6299991607666),
`dec-wet_2019` = c(12.7199993133545, 13.3800001144409, 13.9899997711182,
10.5299997329712, 12.3099994659424), `dec-wet_2020` = c(15.539999961853,
16.5200004577637, 11.1799993515015, 14.7299995422363, 13.5499992370605
), `jan-wet_2011` = c(8.01999950408936, 7.83999967575073,
10.2199993133545, 13.8899993896484, 14.5299997329712), `jan-wet_2012` = c(11.5999994277954,
11.1300001144409, 12.5500001907349, 10.1700000762939, 22.6199989318848
), `jan-wet_2013` = c(17.5, 17.4099998474121, 15.5599994659424,
13.3199996948242, 20.9099998474121), `jan-wet_2014` = c(12.5099992752075,
12.2299995422363, 15.210000038147, 9.73999977111816, 9.63000011444092
), `jan-wet_2015` = c(17.6900005340576, 16.9799995422363,
11.75, 9.9399995803833, 19), `jan-wet_2016` = c(15.6099996566772,
15.5, 14.5099992752075, 10.3899993896484, 18.4499988555908
), `jan-wet_2017` = c(9.17000007629395, 9.61999988555908,
9.30999946594238, 15.8499994277954, 11.210000038147), `jan-wet_2018` = c(8.55999946594238,
9.10999965667725, 13.2599992752075, 9.85999965667725, 15.8899993896484
), `jan-wet_2019` = c(17.0699996948242, 16.8699989318848,
14.5699996948242, 19.0100002288818, 19.4699993133545), `jan-wet_2020` = c(6.75999975204468,
6.25999975204468, 6.00999975204468, 5.35999965667725, 8.15999984741211
), `feb-wet_2011` = c(9.1899995803833, 8.63999938964844,
6.21999979019165, 9.82999992370605, 4.67999982833862), `feb-wet_2012` = c(12.2699995040894,
11.6899995803833, 8.27999973297119, 14.9399995803833, 13.0499992370605
), `feb-wet_2013` = c(15.3599996566772, 15.9099998474121,
17.0599994659424, 13.3599996566772, 16.75), `feb-wet_2014` = c(10.1999998092651,
11.1399993896484, 13.8599996566772, 10.7399997711182, 7.35999965667725
), `feb-wet_2015` = c(11.9200000762939, 12.2699995040894,
8.01000022888184, 14.5299997329712, 5.71999979019165), `feb-wet_2016` = c(14.6999998092651,
14.7799997329712, 16.7899990081787, 4.90000009536743, 19.3500003814697
), `feb-wet_2017` = c(8.98999977111816, 9.17999935150146,
11.7699995040894, 6.3899998664856, 13.9899997711182), `feb-wet_2018` = c(16.75,
16.8599987030029, 12.0599994659424, 16.1900005340576, 8.51000022888184
), `feb-wet_2019` = c(7.58999967575073, 7.26999998092651,
8.21000003814697, 7.57999992370605, 8.81999969482422), `feb-wet_2020` = c(10.6399993896484,
10.4399995803833, 13.4399995803833, 8.53999996185303, 19.939998626709
), country = c("SERBIA", "SERBIA", "SLOVENIA", "GREECE",
"CZECHIA"), capital_city = c("Belgrade", "Belgrade", "Ljubljana",
"Athens", "Prague"), weight = c(20.25, 19.75, 14.25, 23.75,
14.25)), row.names = c(76L, 75L, 83L, 16L, 5L), class = "data.frame")
Your code seems to provide the right answer, now there's more data:
# Groups: country [4]
country capital_city winter
<chr> <chr> <dbl>
1 CZECHIA Prague 27.2
2 GREECE Athens 14.6
3 SERBIA Belgrade 19.1
4 SLOVENIA Ljubljana 16.3
Is this what you were looking for?
I took the liberty of streamlining your code:
weather_winter <- weather_winter %>%
pivot_longer(-c(31:33), values_to = "winter") %>%
select(-name)
weather_winter.std <- weather_winter %>%
group_by(country, capital_city) %>%
summarise(winter = wtd.var(winter))
With only one "winter" column, there's no need for the across().
Finally, you are not using the weights. If these are needed, then change the last line to:
summarise(winter = wtd.var(winter, weights = weight))
To give:
# A tibble: 4 x 3
# Groups: country [4]
country capital_city winter
<chr> <chr> <dbl>
1 CZECHIA Prague 26.3
2 GREECE Athens 14.2
3 SERBIA Belgrade 18.8
4 SLOVENIA Ljubljana 15.8
I have two dataframes in which I would like to merge based on country code.
For example my data looks as follows:
x = data.frame("countryCode" = c("AD", "AE", "AF", "AT", "BA"), "lang" = c("Catalan", "Arabic", "Dari", "German", "Romani"), "langCountryPop" = c(31000, 744000, 5600000, 7500000, 400000), "lat" = c(41.75,26,35,52, 45), "lon" = c(2,49,66,10,21))
y = data.frame("iso2_x" = c("AE", "AT", "BA"), "iso2_y" = c("AD", "AF", "AE"), "distance" = c(1243213, 1234123, 45423535))
I would like to merge X with Y so that Y now looks like:
y = data.frame("iso2_x" = c("AE", "AT", "BA"), "lang" = c("Arabic", "German", "Romani"), "lat" = c(26,52,45), "lon" = c(49,10,21), "iso2_y" = c("AD", "AF", "AE"), "lang" = c("Catalan", "Dari", "Arabic"), "lat" = c(41.75,35,26), "lon" = c(2,66,49), "distance" = c(1243213, 1234123, 45423535))
How can I match the "countryCode" column from X to the "iso2_x" and "iso2_y" columns in Y and add the respective columns from X to Y as well? Thanks
try it this way
library(tidyverse)
y %>%
pivot_longer(-distance, names_sep = "_", names_to = c(".value", "set")) %>%
left_join(x, by = c("iso2" = "countryCode")) %>%
pivot_wider(distance,
names_from = set,
values_from = c(iso2, lang, lat, lon)) %>%
select(ends_with("_x"), ends_with("_y"), distance)
# A tibble: 3 x 9
iso2_x lang_x lat_x lon_x iso2_y lang_y lat_y lon_y distance
<chr> <chr> <dbl> <dbl> <chr> <chr> <dbl> <dbl> <dbl>
1 AE Arabic 26 49 AD Catalan 41.8 2 1243213
2 AT German 52 10 AF Dari 35 66 1234123
3 BA Romani 45 21 AE Arabic 26 49 45423535
You could use just two joins...
x <- data.frame("countryCode" = c("AD", "AE", "AF", "AT", "BA"), "lang" = c("Catalan", "Arabic", "Dari", "German", "Romani"), "langCountryPop" = c(31000, 744000, 5600000, 7500000, 400000), "lat" = c(41.75,26,35,52, 45), "lon" = c(2,49,66,10,21))
y <- data.frame("iso2_x" = c("AE", "AT", "BA"), "iso2_y" = c("AD", "AF", "AE"), "distance" = c(1243213, 1234123, 45423535))
library(dplyr)
y %>%
left_join(x, by = c('iso2_x' = 'countryCode')) %>%
left_join(x, by = c('iso2_y' = 'countryCode'), suffix = c('', '_y'))
#> iso2_x iso2_y distance lang langCountryPop lat lon lang_y langCountryPop_y
#> 1 AE AD 1243213 Arabic 744000 26 49 Catalan 31000
#> 2 AT AF 1234123 German 7500000 52 10 Dari 5600000
#> 3 BA AE 45423535 Romani 400000 45 21 Arabic 744000
#> lat_y lon_y
#> 1 41.75 2
#> 2 35.00 66
#> 3 26.00 49