Variable lengths differ error when using lm in R - r

I am trying to run a linear model on time and a monthly factor, however I am getting the error:
Error in model.frame.default(formula = ts.data ~ time2 + factor(month2), :
variable lengths differ (found for 'time2')
This is how I created the variables:
time2<-seq(along=ts.data)
month2<-rep(1:12,length=length(ts.data))
However running length(time2), length(month2) and length(ts.data) gives the same number, does anyone know how I fix the error?
Trying to run lm :
lm(ts.data~time2+factor(month2))
The data I am using:
structure(c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
64, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 2, 2, 2, 2, 2, 2,
2, 2, 2, 78238, 73928, 70708, 75175, 70744, 65604, 61227, 62635,
47652, 51507, 81874, 98236, 99401, 94756, 94697, 93732, 100334,
139355, 88575, 94169, 86084, 98249, 95321, 87822, 80256, 81875,
86293, 80712, 79533, 82847, 84498, 84185, 78382, 82701, 80491,
91140, 86847, 96727, 101295, 99450, 87783, 101246, 97913, 100081,
96346, 93608, 90648, 99105, 90920, 84960, 82591, 88090, 89980,
87778, 87429, 81898, 77285, 80369, 73193, 65139, 60126, 57219,
94204, 112472, 157199, 154791.5, 154294.88, 161920.63, 147408.75,
134418, 132158.5, 104572.5, 96831, 91045.88, 141182.63, 214759.25,
216647, 184598.38, 210794.38, 182403.75, 193001.63, 176807.38,
186552.63, 201375.88, 181861.25, 193234.88, 187240.25, 168242,
172475.13, 188996.25, 179663.88, 192861.63, 187461.25, 188670.5,
198826.25, 208696.5, 180490.75, 202265.88, 187966.13, 203342.13,
194850.38, 230582.63, 212517, 223432.5, 196511.63, 229582.25,
206120.63, 225629.88, 209769.63, 210797.63, 213215.75, 215144.88,
223266, 230747.63, 228573, 223828.88, 202102.88, 192863.63, 206675.13,
195647.5, 173897.25, 183788.88, 158511.38, 138559, 114163.25,
110399.13, 164751.13, 270772, 90430, 81719, 79183, 85428, 79372,
72361, 66207, 55403, 51693, 60280, 98698, 123059, 121550, 107662,
107863, 107630, 114685, 169659, 100104, 107598, 97728, 112850,
107784, 97580, 92709, 99098, 99482, 100543, 98856, 106081, 108248,
104769, 96966, 100093, 103107, 114944, 108001, 126289, 135213,
129717, 121688, 134421, 127318, 127412, 121922, 119045, 116989,
126286, 116707, 106627, 98219, 111225, 117279, 113725, 114633,
100633, 95478, 98394, 87616, 75329, 68274, 70658, 122995, 145224,
155833, 131896.5, 138340.63, 145610, 130653.13, 122562.75, 115850.5,
91749.88, 81787.13, 85457.5, 142931.63, 214970, 216836.63, 175902,
180757.88, 175233.63, 168982.13, 168727.25, 173501, 182731.38,
152260.63, 182607, 179326.5, 157693.13, 161004.75, 172990.5,
166204.38, 175172.63, 186446.13, 202645.38, 202500.25, 204148.38,
187763.5, 207269.75, 183334.88, 206552.5, 207270.13, 226123.88,
239037.88, 214656.38, 216552.75, 231406.75, 207365.63, 217873.5,
200308.88, 201696.5, 208984.75, 227723.38, 212083.25, 206262.38,
186596.25, 215496.63, 199399, 184933.25, 195925.63, 190318.63,
170375.38, 171624.13, 154537.13, 133532.25, 119179.13, 113297.88,
174946.5, 304690, 108567, 99358, 97299, 103628, 96936, 89254,
83761, 72058, 66685, 74491, 117292, 139878, 139585, 130180, 130079,
127562, 136152, 197149, 118619, 127875, 118094, 134989, 130688,
121475, 112367, 114805, 117087, 118526, 118038, 123988, 127511,
125790, 116702, 123049, 124260, 141232, 133809, 156349, 162637,
158367, 144491, 164389, 155305, 161401, 151829, 144188, 142702,
156405, 141937, 129857, 120318, 132823, 138201, 135058, 129275,
119897, 112924, 120385, 108134, 94062, 86695, 88434, 145426,
167100, 184196.63, 166628.38, 168193.13, 190280.88, 154984.75,
153784.38, 148033.75, 121304.25, 107303.25, 108003.13, 168770,
240983.75, 242817.13, 220229.38, 222805.75, 205068.75, 205204.25,
192598.25, 206565.38, 227284.88, 199258.25, 227122.88, 209076.13,
194855.5, 196357.25, 206865.88, 209580.13, 222190.88, 234610.88,
229339.13, 219321.63, 232571.75, 218584.75, 246116.38, 229563,
256776.75, 257335.25, 271507, 272014, 265850.5, 253426.63, 291759.63,
262608.88, 279417.25, 264583.25, 256634.88, 271024.88, 283927.13,
270597.38, 264222.5, 235009.13, 258379.25, 246485.5, 240163.25,
238369.88, 240961.5, 219826.75, 212077.5, 194937, 166299, 141284.88,
130153.38, 206775, 342062.88), .Dim = c(64L, 8L), .Dimnames = list(
NULL, c("Week_Number", "Campaign_Period", "Control_Traffic",
"Control_Revenue", "VOD_Test_Traffic", "VOD_Test_Revenue",
"TV_Test_Traffic", "TV_Test_revenue")), .Tsp = c(1, 2.21153846153846,
52), class = c("mts", "ts", "matrix"))

If we are creating grouping variables based on the number of rows, we need to change the 'month2' and 'time2'.
month2<-rep(1:12,length=nrow(ts.data))
time2<-seq_len(nrow(ts.data))
res <- lm(ts.data~time2 + factor(month2))
coef(res)
# Week_Number Campaign_Period Control_Traffic Control_Revenue VOD_Test_Traffic VOD_Test_Revenue TV_Test_Traffic TV_Test_revenue
#(Intercept) 0.0000000000000213162821 3.384444444 78799.578 157220.4207 87712.9656 148735.2930 106055.5914 177326.337
#time2 0.9999999999999994448885 -0.017777778 123.605 727.3023 383.8344 966.6405 465.7336 1229.196
#factor(month2)2 0.0000000000000152835379 0.017777778 -1177.438 2858.4910 -1320.1678 -8588.9322 -1128.5669 -6726.196
#factor(month2)3 0.0000000000000008207055 0.035555556 5419.457 10544.3136 6779.6644 346.1573 7382.8661 1173.044
#factor(month2)4 0.0000000000000016587917 0.053333333 8603.018 27254.5313 12531.1633 23542.1418 12777.1325 28186.515
#factor(month2)5 0.0000000000000012801265 -0.268888889 3290.677 6985.0394 7531.8356 3638.7607 7201.7339 5969.374
#factor(month2)6 0.0000000000000020182926 -0.251111111 12858.272 5428.8610 20021.4011 4320.5682 21940.0003 7326.704
#factor(month2)7 0.0000000000000016906446 -0.233333333 1297.067 6299.8347 3690.1667 327.4537 2400.8667 -451.516
#factor(month2)8 0.0000000000000016516546 -0.015555556 1838.662 6690.3563 -832.6678 -2303.4348 445.1331 3647.310
#factor(month2)9 0.0000000000000015682557 0.002222222 -5728.743 -12651.4220 -7622.1022 -22135.3253 -8178.0006 -15978.562
#factor(month2)10 0.0000000000000003302248 0.020000000 -1715.348 -5722.0704 -2630.9367 -11870.0938 -2470.1342 -9128.055
#factor(month2)11 0.0000000000000022008184 0.037777778 1179.647 -5052.7747 1691.6289 -8744.7323 2258.9322 -5674.003
#factor(month2)12 0.0000000000000033608693 0.055555556 5039.042 4606.1469 5908.5944 4559.9012 7788.3986 6991.025

Related

How can i add Hatched polygons on a spplot in R?

I have a map which summarizes an indicator of the saturation percentage of real estate by neighborhood in Paris (Observed Price of real estate/maximum price set by law). I would like to add hatched on neighborhoods which have less than 5 observations included in my dataset.
I searched, but I couldn't find a way to do it. Any advice in the right direction is welcomed. Thanks.
Here is my code:
library(sp)
library(sf)
library(rgdal)
library(RColorBrewer)
library(raster)
library(classInt)
library(cartography)
#Importation
setwd("path")
shp <- readOGR(dsn="path/to/file",layer="l_qu_paris")
#Breaks
q10 <- classIntervals(map$saturation2, n=7, style="fixed",
fixedBreaks=c(45,69.999999, 79.9999999, 89.9999999, 99.9999999
,109.99999999, 120))
#Colors
my.palette <- colors()[c(73,26,128,10,652,92)]
#Map
##Scale
scale.parameter = 1.1
xshift = 0
yshift = 0
original.bbox = shp#bbox
edges = original.bbox
edges[1, ] <- (edges[1, ] - mean(edges[1, ])) * scale.parameter + mean(edges[1, ]) + xshift
edges[2, ] <- (edges[2, ] - mean(edges[2, ])) * scale.parameter + mean(edges[2, ]) + yshift
#Saturation
idx <- match(shp$l_qu, map$l_qu)
is.na(idx)
concordance <- map[idx, "saturation2"]
shp$saturation2 <- concordance
spplot(shp, "saturation2",col.regions=my.palette,
col = "black", lwd= 1, at = q10$brks,
main=list(label="% de saturation des meublés 1 pièce",cex=1.2,fontfamily="serif"),
xlim = edges[1, ], ylim = edges[2, ])
grid.text("Saturation moyenne (en%)", x=unit(0.95, "npc"), y=unit(0.50, "npc"), rot=90)
Here is my map:
saturation
Here is an example of a map that i would like to have:
saturation example
Here are the polygons in shapefile format: https://www.data.gouv.fr/fr/datasets/quartiers-administratifs/
And here is my dataset:
map <- structure(list(l_qu = c("Amérique", "Archives", "Arsenal", "Arts-et-Métiers",
"Auteuil", "Batignolles", "Bel-Air", "Belleville", "Bercy", "Bonne-Nouvelle",
"Chaillot", "Champs-Elysées", "Charonne", "Chaussée-d'Antin",
"Clignancourt", "Combat", "Croulebarbe", "Ecole-Militaire", "Enfants-Rouges",
"Epinettes", "Europe", "Faubourg-du-Roule", "Faubourg-Montmartre",
"Folie-Méricourt", "Gaillon", "Gare", "Goutte-d'Or", "Grandes-Carrières",
"Grenelle", "Gros-Caillou", "Halles", "Hôpital-Saint-Louis",
"Invalides", "Jardin-des-Plantes", "Javel", "La Chapelle", "Madeleine",
"Mail", "Maison-Blanche", "Monnaie", "Montparnasse", "Muette",
"Necker", "Notre-Dame", "Notre-Dame-des-Champs", "Odéon", "Palais-Royal",
"Parc-de-Montsouris", "Père-Lachaise", "Petit-Montrouge", "Picpus",
"Place-Vendôme", "Plaine de Monceaux", "Plaisance", "Pont-de-Flandre",
"Porte-Dauphine", "Porte-Saint-Denis", "Porte-Saint-Martin",
"Quinze-Vingts", "Rochechouart", "Roquette", "Saint-Ambroise",
"Saint-Fargeau", "Saint-Germain-des-Prés", "Saint-Gervais",
"Saint-Lambert", "Saint-Merri", "Saint-Thomas-d'Aquin", "Saint-Victor",
"Saint-Vincent-de-Paul", "Sainte-Avoie", "Sainte-Marguerite",
"Saint-Georges", "Salpêtrière", "Sorbonne", "Saint-Germain-l'Auxerrois",
"Ternes", "Val-de-Grâce", "Villette", "Vivienne", "Total"),
saturation2 = c(98.188951329533, 85.4938271604938, 83.8463463463464,
90.1460755525873, 98.1726527090667, 90.2186740262059, 92.8743271072797,
72.8549079897508, 99.2356140350877, 90.1234567901235, 114.057904044022,
NA, 87.2208980972528, 91.2562612612613, 97.9518951016991,
86.2770900920801, 91.0239726151895, 92.8305400372439, 88.6514719848053,
73.876877752942, 108.693318725755, 67.3263578578579, 85.8735259484408,
89.2100224414912, 92, 90.6120989320281, 85.8446948520848,
91.4165103088783, 97.2760978594495, 93.60892313074, 102.471730530348,
95.9062868379746, 96, 92.5484278273071, 95.0066946433545,
85.8187074829932, 101.139150713213, 92.1272297297297, 93.0625144594594,
61.8074324324324, 100.173302938197, 99.720856146949, 84.8732544128823,
84.1911355800245, 85.1122672253259, 91.8422003734504, NA,
94.612349767814, 83.2363741480137, 87.0403187718064, 92.0886931496388,
77, 110.943302180685, 100.73486307088, 66.3899425287356,
96.2527514568292, 95.7430893746874, 87.9028997984617, 48,
85.5630809345015, 92.7010730078939, 82.075822827797, 83.1727736726875,
76.2162162162162, 104.534662867996, 98.3510353194912, 78.3333333333333,
103.169134078212, 80.8779605984059, 92.63515704154, 62, 90.3902768982325,
94.1391771653151, 94.8669917042241, 94.4825319797959, 95.4279279279279,
98.2238673533848, 94.0602977590835, 87.5105365473892, 102,
92.5123935729199), numobs = c(6, 4, 4, 6, 36, 15, 4, 4, 3,
2, 16, NA, 36, 3, 32, 9, 22, 13, 11, 6, 31, 5, 15, 14, 4,
22, 3, 64, 29, 58, 7, 18, 4, 13, 23, 2, 8, 4, 47, 12, 16,
49, 50, 9, 33, 26, NA, 15, 10, 10, 23, 2, 13, 15, 2, 12,
8, 31, 1, 17, 22, 42, 7, 3, 4, 74, 4, 7, 13, 6, 2, 23, 18,
16, 17, 1, 24, 44, 8, 4, 1290)), row.names = c(NA, -81L), class = c("tbl_df",
"tbl", "data.frame"))
Neither spplot, nor ggplot2 support textured fillings. Having said that, there is a package called ggpattern which provides custom ggplot2 geoms which support filled areas with geometric and image-based patterns. See developer site for more info on ggpattern: https://coolbutuseless.github.io/package/ggpattern/index.html
With ggpattern you can plot 'hatched' or textured geom fillings. Below is a working example from the developers website:
library(maps)
crimes <- data.frame(state = tolower(rownames(USArrests)), USArrests)
crimesm <- reshape2::melt(crimes, id = 1)
states_map <- map_data("state")
p <- ggplot(crimes, aes(map_id = state)) +
geom_map_pattern(
aes(
# fill = Murder,
pattern_fill = Murder,
pattern_spacing = state,
pattern_density = state,
pattern_angle = state,
pattern = state
),
fill = 'white',
colour = 'black',
pattern_aspect_ratio = 1.8,
map = states_map
) +
expand_limits(x = states_map$long, y = states_map$lat) +
coord_map() +
theme_bw(18) +
labs(title = "ggpattern::geom_map_pattern()") +
scale_pattern_density_discrete(range = c(0.01, 0.3)) +
scale_pattern_spacing_discrete(range = c(0.01, 0.03)) +
theme(legend.position = 'none')
p

Parse long string to retrieve channel_id

I have extracted a lot of data from Telegram. However, I was not able to isolate the channel_id. Now I have a long string that among a lot of other information contain channel_id. Question is how do I remove everything apart from the channel_id i.e. the numbers following "channel_id=XXXXXXXXXX)?
Subset of my data.frame
df <- structure(list(channel_id = c("MessageFwdHeader(date=datetime.datetime(2021, 5, 13, 20, 50, 47, tzinfo=datetime.timezone.utc), imported=False, from_id=PeerChannel(channel_id=1292436059), from_name=None, channel_post=1404, post_author=None, saved_from_peer=None, saved_from_msg_id=None, psa_type=None)",
"MessageFwdHeader(date=datetime.datetime(2021, 5, 4, 9, 24, 16, tzinfo=datetime.timezone.utc), imported=False, from_id=PeerChannel(channel_id=1480423705), from_name=None, channel_post=224, post_author=None, saved_from_peer=None, saved_from_msg_id=None, psa_type=None)",
"MessageFwdHeader(date=datetime.datetime(2021, 3, 25, 14, 9, 38, tzinfo=datetime.timezone.utc), imported=False, from_id=PeerChannel(channel_id=1489900933), from_name=None, channel_post=627, post_author=None, saved_from_peer=None, saved_from_msg_id=None, psa_type=None)",
"MessageFwdHeader(date=datetime.datetime(2021, 3, 12, 22, 10, 3, tzinfo=datetime.timezone.utc), imported=False, from_id=PeerChannel(channel_id=1455689590), from_name=None, channel_post=1457, post_author=None, saved_from_peer=None, saved_from_msg_id=None, psa_type=None)",
"MessageFwdHeader(date=datetime.datetime(2021, 3, 9, 12, 52, 5, tzinfo=datetime.timezone.utc), imported=False, from_id=PeerChannel(channel_id=1348575245), from_name=None, channel_post=None, post_author=None, saved_from_peer=None, saved_from_msg_id=None, psa_type=None)"
)), row.names = c(NA, -5L), class = c("data.table", "data.frame"))
Desired result
channel_id <- structure(list(channel_id = c("1292436059",
"1480423705",
"1489900933",
"1455689590",
"1348575245"
)), row.names = c(NA, -5L), class = c("data.table", "data.frame"))
You can try regexpr with a look behind for (channel_id= using (?<=\\(channel_id=), than match digit(s) \\d+ and look ahead for ) using (?=\\)) and extract the matches using regmatches.
regmatches(df$channel_id, regexpr("(?<=\\(channel_id=)\\d+(?=\\))"
, df$channel_id, perl=TRUE))
#[1] "1292436059" "1480423705" "1489900933" "1455689590" "1348575245"
or combining two sub.
sub(").*", "", sub(".*\\(channel_id=", "", df$channel_id))
#[1] "1292436059" "1480423705" "1489900933" "1455689590" "1348575245
We may use str_extract
library(stringr)
library(dplyr)
df %>%
transmute(channel_id = str_extract(channel_id, "(?<=channel_id\\=)\\d+"))
channel_id
1: 1292436059
2: 1480423705
3: 1489900933
4: 1455689590
5: 1348575245

subtract the values in one dataframe by the mean created by summarise in dplyr

I have two dataframes, one is the original data:
df <- structure(list(day = c(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2), Subject = c(1, 1, 1, 1, 1, 1, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 12, 12, 12, 12,
12, 12, 13, 13, 13, 13, 13, 13, 17, 17, 17, 17, 17, 17), TimePoint = c(1,
5, 9, 13, 17, 21, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23,
1, 5, 9, 15, 17, 21, 1, 5, 9, 13, 17, 21, 1, 5, 9, 13, 17, 21,
1, 5, 9, 13, 17, 21), C4b = c(489.1, 3757.5, 4013.7, 963.4, 668.8,
2135.4, 1793.5, 2485.9, 1337.2, 1200.1, 1854.5, 1163.8, 1050.4,
1514.7, 1134, 1283.9, 1883.2, 1160, 441.4, 318.4, 261.8, 392,
380.9, 312, 1731.1, 1482.9, 1635.6, 1144.3, 1455.3, 1427.1, 1202.6,
1212.7, 1294.8, 1406.6, 2153.3, 1135, 1104.5, 1129.3, 737.4,
1010.9, 934.3, 704.2), `Coagulation Factor XI` = c(2149.4, 2055.8,
2256.3, 2179, 1859.5, 2158.4, 1877.4, 1800.7, 1838.7, 1899.9,
1867, 1827.9, 1818.7, 1847, 1807.7, 1737, 2394.9, 1726.2, 1756.3,
1746.3, 1744.5, 1573.8, 1765.5, 1954.1, 1522.4, 1387.7, 1374.4,
1357.3, 1433.3, 1386.5, 1575.7, 1531.8, 1560.9, 1810.4, 1657.2,
1509.4, 2110.4, 1974.5, 1856.1, 1939.6, 1832.3, 1932.5), CTACK = c(798.9,
857.5, 702.9, 707.6, 903.7, 749.9, 689.2, 734.1, 716.8, 645.2,
641.2, 663.1, 733.3, 667.3, 774.3, 782.2, 1766.7, 679.2, 1950.6,
2084.9, 2021.5, 1930.8, 1952.3, 1917.2, 723.9, 709.2, 670.7,
716.4, 944.9, 655.7, 706.8, 734.2, 680.5, 720.4, 778.3, 790.2,
962.5, 726.4, 761.9, 849.2, 701.1, 664.3), Endostatin = c(30563,
30006.5, 25972.2, 28048.9, 24503.9, 30371.2, 49733.9, 54933.8,
54293.3, 60007.4, 60403.9, 58870.4, 56801, 59752.6, 54336.3,
47746, 21270.4, 67312.9, 61931.5, 66395.3, 65680.8, 64419.4,
63415, 67230.7, 49444.2, 55122.2, 52333, 62328.7, 47513.9, 61530,
52549.8, 53633, 53983.7, 49023.8, 47305.6, 50569.8, 58754.1,
65727.9, 78382, 68290.8, 69386.8, 59982.4), `TIMP-1` = c(279.4,
262.4, 295.9, 273.3, 294.9, 278.3, 279.4, 238.7, 224.2, 239.3,
235.7, 251.7, 228.2, 237.3, 258.4, 262.4, 389.4, 237.8, 303.1,
296.4, 295.4, 297, 317, 351.6, 310.4, 244.2, 250.8, 261.1, 283.1,
251.4, 242.2, 223.3, 234.6, 291.2, 262.3, 249.7, 293.9, 259.4,
238.9, 257.9, 248.2, 329.3), tPA = c(377.9, 326.9, 346.3, 318.3,
314, 348.2, 291.2, 427.8, 433.1, 428.8, 370.9, 390, 354, 354.4,
368.8, 364.7, 595.2, 400.1, 354, 356.8, 368.7, 363.5, 394.3,
412.3, 319, 406.9, 415.4, 292.3, 320.9, 348.2, 532, 457.4, 417.3,
404, 401.5, 533.5, 213.6, 346.7, 310.7, 292.6, 280.2, 287.1),
`EG-VEGF` = c(106.8, 102.2, 107, 108.1, 138.8, 107.5, 88.1,
85.7, 82.7, 91.7, 98.8, 91.1, 82.3, 90.3, 89.2, 94.9, 128.3,
94.6, 529.1, 679.3, 599.4, 644.5, 674.1, 584.5, 98.2, 101.1,
97.3, 95.3, 86.8, 92.5, 101.9, 94.9, 109, 94.1, 103.8, 98.2,
92, 98.8, 91.6, 92.9, 98.8, 105.8), `TIMP-2` = c(466.5, 420.7,
479.8, 423, 462.7, 393.1, 400.5, 380.3, 355.9, 393.7, 423.6,
401.6, 364.2, 372.8, 391.3, 480.6, 1171.4, 357.8, 533.7,
522.6, 523.1, 511.5, 511, 644.8, 405.3, 398.2, 393, 369.7,
427.8, 380, 396.3, 394.5, 407, 490.9, 435.6, 402.7, 424.3,
436, 391.6, 405.6, 414.2, 536.2), `TGF-b1` = c(1451.2, 1194.6,
1521.3, 1705.8, 1945.4, 1900.2, 1839.4, 1701.9, 1752.9, 1714.6,
1597.6, 1660.5, 1786.9, 1694, 1519.6, 1906.1, 1654.2, 1636,
1566.8, 1730.9, 1496.2, 1850.9, 1715.1, 1648.8, 1834.9, 1686,
1769.4, 1750.8, 1833, 1862.6, 1511, 1505.9, 1524.9, 1514,
1391.2, 1369.1, 2064.6, 1790.9, 1799.1, 1548.2, 1839, 1667
), `VEGF sR3` = c(3094.3, 3235, 3116.3, 3027.1, 2536, 3061.8,
3215.7, 3189.6, 3139.1, 3618.8, 3585.6, 3556, 3326.7, 3167.5,
3403.3, 2583.1, 1621.6, 3415.2, 2293.9, 2508.4, 2662.6, 2486.2,
2538.8, 2452.5, 3012.5, 2920.1, 2914.1, 2870, 2455.2, 2809.3,
3534.8, 3501.9, 3691.4, 3517.6, 3414, 3405.6, 1943.3, 2334.2,
2137.4, 2185.7, 2273.2, 2304.9), C5 = c(5566.9, 5466.7, 5591.6,
5552.7, 5348.3, 5388.4, 5499.1, 5834.1, 5556.7, 5737.5, 5632.2,
5886.4, 5543.2, 6134.6, 5597.3, 5557.9, 5446.3, 5863.7, 5699.8,
5263.7, 5773.3, 5313.7, 6014.2, 5453.3, 6284.5, 5658.5, 5491.2,
5855.1, 5477.2, 5815.2, 5938.1, 5660.6, 6190.9, 5626.5, 6248.6,
5673, 6071.4, 6262.5, 5649.1, 6052, 6000.4, 5649), `Apo E` = c(3351.8,
936.5, 928.1, 3096.5, 2267.6, 2217.1, 2143.2, 2547.1, 2368.6,
2531.5, 1922.4, 2134.3, 2283.7, 2131, 2260.3, 2249.8, 2001.2,
2271.8, 3635.3, 2338.7, 2301.2, 2705, 2604.3, 2738.5, 5091.4,
3638.9, 2710.4, 3605, 3683, 3016.1, 3698.2, 3050.5, 2162.1,
2086.9, 2500.5, 2449.1, 4416, 2628.9, 2902.4, 3100.8, 2433.7,
3083.3), BDNF = c(325, 324, 337.9, 436.6, 557.3, 379.5, 556.2,
458.3, 400.3, 447.4, 448.1, 450.3, 425.1, 456.4, 482.1, 585,
663.2, 424.9, 343.8, 337.2, 328.1, 417.9, 343.2, 386.1, 590.6,
359.9, 340.3, 360.5, 394.1, 374.6, 367.9, 294.1, 319.4, 320,
366.2, 372.1, 502.1, 403.1, 432.9, 397.3, 389, 364.5), `bFGF-R` = c(12689.3,
13327.1, 12159.8, 11959.7, 11430.1, 11628.7, 11430.7, 12808.9,
11439, 12836.8, 13735.2, 12351.3, 11754, 12071.3, 11841.7,
10368.6, 12122.4, 12371.7, 11184.5, 11499.1, 11687.9, 10997.4,
11006.7, 10709.3, 11615, 12553.3, 11459.9, 12403, 10952,
12060.3, 13330.8, 12688.8, 13717.2, 11868.7, 11919.1, 11584.5,
10987.7, 12370.3, 11619.4, 11737.2, 12695.7, 12403.2), C8 = c(1868.7,
1678.4, 1916.9, 1966.6, 1724.9, 2028, 1840.3, 1797.4, 1871.4,
1967.9, 1926.4, 1860.2, 1860.4, 1901.4, 1951.1, 1675.4, 1026,
2024.3, 1950.6, 2174.5, 2266.5, 2251.5, 2176.4, 2067.2, 1968,
2187.1, 2360.5, 2057.5, 1971.1, 2202.2, 2099.5, 2337.2, 2320.8,
2232.9, 2358.2, 2374.8, 2071.3, 2773.3, 2695.9, 2641.2, 2738.7,
2607), `Cathepsin G` = c(1434.9, 1347, 1410.5, 1421.1, 2318.2,
1675, 1025.9, 1217.4, 1021.9, 1088.4, 1034.2, 1099.8, 1080.1,
1051.2, 967.8, 3874.4, 1875.6, 1058.9, 1093.7, 1083.2, 922.1,
1204.4, 1126.1, 1157.9, 1642.2, 1528.4, 1627.3, 1811.6, 1498.4,
1759.8, 926.3, 1022.4, 1106.7, 1058.7, 999.1, 995.3, 1020.5,
1085.4, 1088.9, 961.9, 1135.7, 1080.7), `CXCL16, soluble` = c(6878.8,
7479.3, 7088.7, 6739.4, 6143.9, 7477.5, 6742, 6822.4, 6164.9,
7320.9, 7459.1, 7150.5, 6407.1, 6504, 6901.8, 5929.6, 3924.1,
7827.3, 7358.7, 7758.3, 8305.5, 7586.9, 7829.1, 8268.3, 6860.7,
7470.7, 6857, 7003.7, 6364.7, 7738.7, 5441.6, 5609.3, 5732.5,
5128.2, 5642.7, 5402.4, 5170.2, 6272, 5963, 5996.4, 6461,
6721.3), `FGF-10` = c(255.2, 210, 262.6, 303.6, 219.4, 301.6,
241.2, 283.5, 248.6, 260.1, 265.7, 299.5, 268.9, 265.3, 201.3,
190.3, 296.6, 239.5, 242.7, 287.3, 216.2, 288.8, 283.8, 246.7,
255.1, 268.6, 236.3, 258.4, 206.8, 259.4, 264.1, 233.2, 277.1,
205.7, 200, 193.1, 251, 228, 229.9, 213.6, 247.2, 235.2),
`FGF-8B` = c(712.8, 648.4, 704.5, 763.9, 837.4, 795.9, 836.1,
841.7, 872.4, 860.1, 921.1, 845.8, 843.6, 875.2, 850, 847.4,
934.7, 938.6, 715.6, 823.9, 733.3, 801.1, 800.3, 819.9, 791.1,
930.8, 938.3, 869.6, 794.8, 939.9, 925.9, 971.2, 1015.9,
928.8, 851.9, 827, 833.6, 837.7, 858, 828.2, 935.7, 900.5
), GIIE = c(270, 254.6, 268.7, 279, 224.4, 293.2, 239.8,
238, 222.9, 260.2, 282.3, 262.3, 239, 241, 238.7, 200.6,
207.3, 252, 270.8, 281.1, 354.2, 289.2, 293, 295.2, 274.5,
291.4, 270.6, 275.8, 232, 272.7, 267, 245.3, 278.3, 260.6,
264, 250.6, 232.9, 274.4, 256.2, 254.1, 271.2, 282), GV = c(433.7,
441.1, 438.9, 468, 425.6, 459.2, 317.6, 332.2, 326, 306.4,
307.4, 310.6, 347.9, 317, 273.1, 325.9, 798.4, 299.1, 327.8,
307.9, 258.6, 308.7, 306.6, 298.6, 319.2, 326.2, 299.8, 329,
436.5, 297.7, 320.9, 306.6, 314.1, 312.1, 298.1, 300.1, 417,
306.6, 314.7, 321.4, 304.8, 305), `IL-12` = c(725.7, 667,
734.8, 772.8, 1045.1, 829.4, 659.4, 695.3, 653.8, 672.8,
701.1, 658.1, 683.5, 670.8, 678.8, 1002.5, 991.4, 703.8,
667.9, 714.1, 630.6, 720, 689.8, 781.4, 671.2, 715.2, 748.9,
693.2, 723.5, 724.7, 868.6, 891.2, 917.9, 858.8, 868.9, 828.6,
744.7, 711.9, 715.1, 683.2, 740.2, 724.3)), row.names = c(NA,
-42L), class = c("tbl_df", "tbl", "data.frame"))
My original data has an enormous amount of variables (1,130) of which I used the library(dplyr) summarise function to create mean values by TimePoint for each of the columns.
df_mean <- df %>%
group_by(TimePoint) %>%
summarise_at(vars(C4b:GV), mean, na.rm = T)
Now I want to create a new dataframe that is simply the difference between the original values in df and the mean of each time point for all the variables so essentially df - df_mean = new dataframe except for the variables "day", "Subject", and "TimePoint".
Thank you!
scale() function from R Base, with scale = FALSE for the centering option
mutate_at() to apply to multiple variables at once
vars() to select list of variables
! to select complement of variables - you don't want it applied to day and Subject
library(tidyverse)
df_centered <- df %>%
group_by(TimePoint) %>%
mutate_at(vars(!c("day", "Subject")), ~scale(., scale = FALSE), na.rm = TRUE)
I tried something like this, hope it helps.
df1 <- df[,-c(1,2,25)]
df_mm <- full_join(df1, df_mean, by = "TimePoint")
df_ss <- NULL
names <- colnames(df)[-length(df)]
for(i in 2:length(df1)){
df_ss[[i]] <- (df_mm[,i] - df_mm[,length(df1)+i-1])
}
df_ss[1] <- NULL
df_ss <- do.call(cbind, df_ss)
df_ss <- mutate(df_ss,
day = df$day,
Subject = df$Subject,
TimePoint = df$TimePoint)
df_ss <- df_ss[,c(22:24,1:21)]
colnames(df_ss) <- names
View(df_ss)

Extract cluster information and combine results

I am attempting to run a clustering algorithm over a list of dissimilarity matrices for different numbers of clusters k and extract some information for each run.
This first block of code produces the list of dissimilarity matrices
library(tidyverse)
library(cluster)
library(rje)
dat=mtcars[,1:3]
v_names=names(dat)
combos=rje::powerSet(v_names)
combos=combos[lengths(combos)>1]
df_list=list()
for (i in seq_along(combos)){
df_list[[i]]=dat[combos[[i]]]
}
gower_ls=lapply(df_list,daisy,metric="gower")
Here is the section of code I am having a problem with
set.seed(4)
model_num <-c(NA)
sil_width <-c(NA)
min_sil<-c(NA)
mincluster<-c(NA)
k_clusters <-c(NA)
lowest_sil <-c(NA)
maxcluster <-c(NA)
model_vars <- c(NA)
clust_4=lapply(gower_ls,pam,diss=TRUE,k=4)
for(m in 1:length(clust_4)){
sil_width[m] <-clust_4[[m]][7]$silinfo$avg.width
min_sil[m] <- min(clust_4[[m]][7]$silinfo$clus.avg.widths)
mincluster[m] <-min(clust_4[[m]][6]$clusinfo[,1])
maxcluster[m] <-max(clust_4[[m]][6]$clusinfo[,1])
k_clusters[m]<- nrow(clust_4[[m]][6]$clusinfo)
lowest_sil[m]<-min(clust_4[[m]][7]$silinfo$widths)
model_num[m] <-m
}
colresults_4=as.data.frame(cbind( sil_width, min_sil,mincluster,maxcluster,k_clusters,model_num,lowest_sil))
How can I convert this piece of code to run for a given range of k? I've tried a nested loop but I was not able to code it correctly. Here are the desired results for k= 4:6, thanks.
structure(list(sil_width = c(0.766467312788453, 0.543226669407726,
0.765018469447229, 0.705326458357873, 0.698351173575526, 0.480565022092276,
0.753366365875066, 0.644345251543097, 0.699437672202048, 0.430310752506775,
0.678224885117295, 0.576411380463116), min_sil = c(0.539324315243191,
0.508330909368204, 0.637090842537915, 0.622120627356455, 0.539324315243191,
0.334047777245833, 0.430814518122641, 0.568591550281139, 0.539324315243191,
0.295113900268025, 0.430814518122641, 0.19040716086259), mincluster = c(5,
3, 4, 5, 2, 3, 3, 3, 2, 3, 3, 3), maxcluster = c(14, 12, 11,
14, 12, 10, 11, 11, 9, 6, 7, 7), k_clusters = c(4, 4, 4, 4, 5,
5, 5, 5, 6, 6, 6, 6), model_num = c(1, 2, 3, 4, 1, 2, 3, 4, 1,
2, 3, 4), lowest_sil = c(-0.0726256983240229, 0.0367238314801671,
0.308069836672298, 0.294247157041013, -0.0726256983240229, -0.122804288130541,
-0.317748917748917, 0.218164082936686, -0.0726256983240229, -0.224849074123824,
-0.317748917748917, -0.459909237820881)), row.names = c(NA, -12L
), class = "data.frame")
I was able to come up with a solution by writing a function clus_func that extracts the cluster information and then using cross2 and map2 from the purrr package:
library(tidyverse)
library(cluster)
library(rje)
dat=mtcars[,1:3]
v_names=names(dat)
combos=rje::powerSet(v_names)
combos=combos[lengths(combos)>1]
clus_func=function(x,k){
clust=pam(x,k,diss=TRUE)
clust_stats=as.data.frame(cbind(
avg_sil_width=clust$silinfo$avg.width,
min_clus_width=min(clust$silinfo$clus.avg.widths),
min_individual_sil=min(clust$silinfo$widths[,3]),
max_individual_sil=max(clust$silinfo$widths[,3]),
mincluster= min(clust$clusinfo[,1]),
maxcluster= max(clust$clusinfo[,1]),
num_k=max(clust$clustering) ))
}
df_list=list()
for (i in seq_along(combos)){
df_list[[i]]=dat[combos[[i]]]
}
gower_ls=lapply(df_list,daisy,metric="gower")
begin_k=4
end_k=6
cross_list=cross2(gower_ls,begin_k:end_k)
k=c(NA)
for(i in 1:length(cross_list)){ k[i]=cross_list[[i]][2]}
diss=c(NA)
for(i in 1:length(cross_list)){ diss[i]=cross_list[[i]][1]}
model_stats=map2(diss, k, clus_func)
model_stats=rbindlist(model_stats)

Assign values to subset of dataframe in list based on matching values of subset in two lists

I have two lists (the dataframes in the list contain more columns than those two, but they are not important for my question):
KPI_new <- list(June=data.frame(ID=(rep("",17)), eRec= c("107349", "110878", "110024", "112188", "6187", "100420", "94436", "110165", "108508", "108773", "111859", "111907", "110704", "100413", "88995", "91644","111298") ))
KPI_old <- list(May=data.frame(ID=c(27, 30, 4, 6, 7, 20, 31, 8, 28, 25, 29, 16, 17, 18), eRec = c( "107349", "110024", "6187" , "100420", "94436", "88995" , "110165" ,"91644", "108508", "105213", "108773", "102636" ,"102339" ,"100413")),
April = data.frame(ID=c(26, 27, 2, 4, 5, 6, 7, 20, 21, 22, 8, 23, 28, 25, 29, 9, 24, 16, 17, 18), eRec=c("37866", "107349", "93051", "6187", "98274", "100420", "94436", "88995" ,"105107", "105109", "91644", "105103" ,"108508" ,"105213", "108773", "85409" ,"104145","102636" ,"102339" ,"100413")),
March = data.frame(ID= c(2, 19, 4, 5, 6, 7, 20, 21, 22, 8, 23, 25, 9, 24, 15, 16, 17, 18), eRec=c("93051" , "104499" ,"6187", "98274", "100420" ,"94436", "88995" ,"105107" ,"105109", "91644" ,"105103", "105213" ,"85409" , "104145", "100989", "102636" ,"102339", "100413")),
February = data.frame(ID= c(1 , 2, 19, 4, 5, 6, 7 ,20, 21, 22, 8, 23, 9 ,10, 24, 12, 13, 14, 15, 16, 17, 18), eRec=c("94266" , "93051", "104499" ,"6187" , "98274", "100420", "94436" ,"88995", "105107", "105109", "91644" ,"105103", "85409" ,"102252", "104145", "94559", "101426", "100992" ,"100989" ,"102636" ,"102339" ,"100413")),
January = data.frame(ID = c(1:18), eRec=c("94266" , "93051", "99836", "6187" , "98274", "100420", "94436", "91644", "85409", "102252", "94412", "94559", "101426", "100992", "100989", "102636", "102339", "100413")))
The list KPI_old contains several dataframes. The ID column is assigned based on the eRec column. So if the eRec column exists in January and in February also, the ID is the same.
Now I want to assign IDs to the (at this point empty) ID column of the dataframe in the KPI_new list based on KPI_old.
I tried the following:
KPI_old_df <- do.call("rbind", KPI_old)
KPI_new[[1]]$ID[(KPI_new[[1]][,2]) %in% KPI_old_df[,2]] <- unique(KPI_old_df$ID[(KPI_old_df[,2]) %in% KPI_new[[1]][,2]])
This assigns the right values - the IDs of KPI_old to KPI_new for the eRec values in KPI_new which already occur in KPI_old - but it assigns some of them to the wrong rows. The order is not right.
It seems like there is something very basic which I am missing.
Thanks in advance.
Try using match in the following way
KPI_new[[1]]$ID <- KPI_old_df$ID[match(KPI_new[[1]]$eRec, KPI_old_df$eRec)]
KPI_new
#$June
# ID eRec
#1 27 107349
#2 NA 110878
#3 30 110024
#4 NA 112188
#5 4 6187
#6 6 100420
#7 7 94436
#8 31 110165
#9 28 108508
#10 29 108773
#11 NA 111859
#12 NA 111907
#13 NA 110704
#14 18 100413
#15 20 88995
#16 8 91644
#17 NA 111298
Not all IDs are present in KPI_old_df, hence some of them return NA.

Resources