Parse long string to retrieve channel_id - r

I have extracted a lot of data from Telegram. However, I was not able to isolate the channel_id. Now I have a long string that, among a lot of other information, contains the channel_id. The question is: how do I remove everything apart from the channel_id, i.e. the digits following "channel_id=" (as in "channel_id=XXXXXXXXXX)")?
Subset of my data.frame
df <- structure(list(channel_id = c("MessageFwdHeader(date=datetime.datetime(2021, 5, 13, 20, 50, 47, tzinfo=datetime.timezone.utc), imported=False, from_id=PeerChannel(channel_id=1292436059), from_name=None, channel_post=1404, post_author=None, saved_from_peer=None, saved_from_msg_id=None, psa_type=None)",
"MessageFwdHeader(date=datetime.datetime(2021, 5, 4, 9, 24, 16, tzinfo=datetime.timezone.utc), imported=False, from_id=PeerChannel(channel_id=1480423705), from_name=None, channel_post=224, post_author=None, saved_from_peer=None, saved_from_msg_id=None, psa_type=None)",
"MessageFwdHeader(date=datetime.datetime(2021, 3, 25, 14, 9, 38, tzinfo=datetime.timezone.utc), imported=False, from_id=PeerChannel(channel_id=1489900933), from_name=None, channel_post=627, post_author=None, saved_from_peer=None, saved_from_msg_id=None, psa_type=None)",
"MessageFwdHeader(date=datetime.datetime(2021, 3, 12, 22, 10, 3, tzinfo=datetime.timezone.utc), imported=False, from_id=PeerChannel(channel_id=1455689590), from_name=None, channel_post=1457, post_author=None, saved_from_peer=None, saved_from_msg_id=None, psa_type=None)",
"MessageFwdHeader(date=datetime.datetime(2021, 3, 9, 12, 52, 5, tzinfo=datetime.timezone.utc), imported=False, from_id=PeerChannel(channel_id=1348575245), from_name=None, channel_post=None, post_author=None, saved_from_peer=None, saved_from_msg_id=None, psa_type=None)"
)), row.names = c(NA, -5L), class = c("data.table", "data.frame"))
Desired result
channel_id <- structure(list(channel_id = c("1292436059",
"1480423705",
"1489900933",
"1455689590",
"1348575245"
)), row.names = c(NA, -5L), class = c("data.table", "data.frame"))

You can try regexpr with a look-behind for (channel_id= using (?<=\\(channel_id=), then match the digit(s) with \\d+, look ahead for ) using (?=\\)), and extract the matches using regmatches.
# Find, in each string, the run of digits that is preceded by "(channel_id="
# and followed by ")"; perl = TRUE is needed for the look-behind/look-ahead.
# regmatches() then returns the matched text.
regmatches(df$channel_id, regexpr("(?<=\\(channel_id=)\\d+(?=\\))"
, df$channel_id, perl=TRUE))
#[1] "1292436059" "1480423705" "1489900933" "1455689590" "1348575245"
or combining two sub.
# Inner sub(): drop everything up to and including "(channel_id=".
# Outer sub(): drop everything from the first ")" onward, leaving the digits.
# The ")" is escaped so the pattern is a well-formed regular expression and
# does not rely on the regex engine tolerating an unmatched parenthesis.
sub("\\).*", "", sub(".*\\(channel_id=", "", df$channel_id))
#[1] "1292436059" "1480423705" "1489900933" "1455689590" "1348575245"

We may use str_extract
library(stringr)
library(dplyr)
# Replace the channel_id column with just the digits that follow "channel_id="
# (look-behind); transmute() keeps only the newly computed column.
df %>%
transmute(channel_id = str_extract(channel_id, "(?<=channel_id\\=)\\d+"))
channel_id
1: 1292436059
2: 1480423705
3: 1489900933
4: 1455689590
5: 1348575245

Related

How to properly sum rows based on a specific range of date columns?

The idea is to get the sum based on the column names that are
between 01/01/2021 and 01/08/2021:
# define range parameters {start-end}
# first_date: "01/01/<current year>" as a character string
first_date <- format(Sys.Date(), "01/01/%Y")
# actual_date: "01/<previous month>/<year>" as a character string
# NOTE(review): %m-% is not base R -- presumably lubridate; confirm it is loaded
actual_date <- format(Sys.Date() %m-% months(1), "01/%m/%Y")
# get the sum of the rows between first_date and actual_date
# (as written, seq() is given two character strings and raises the error shown below)
df$ytd<- rowSums(df[as.character(seq(first_date,
actual_date))])
However, when applied the next error arises:
Error in seq.default(first_date, to_date) :
'from' must be a finite number
The expected output is a new column that holds the row-wise sum of the columns within the specified range.
data
df <- structure(list(country = c("Mexico", "Mexico", "Mexico", "Mexico"
), `01/01/2021` = c(12, 23, 13, 12), `01/02/2021` = c(12, 23,
13, 12), `01/03/2021` = c(12, 23, 13, 12), `01/04/2021` = c(12,
23, 13, 12), `01/05/2021` = c(12, 23, 13, 12), `01/06/2021` = c(12,
23, 13, 12), `01/07/2021` = c(12, 23, 13, 12), `01/08/2021` = c(12,
23, 13, 12), `01/09/2021` = c(12, 23, 13, 12), `01/10/2021` = c(12,
23, 13, 12), `01/11/2021` = c(12, 23, 13, 12), `01/12/2021` = c(12,
23, 13, 12)), row.names = c(NA, -4L), class = c("tbl_df", "tbl",
"data.frame"))
How could I properly apply a function to get this output?
Combining format and seq doesn't work here: seq expects Date objects, whereas format returns character strings. Instead, since the column names are already character strings, make use of the range operator (:) in across or select
library(dplyr)
# Select the contiguous run of columns from the one named by first_date to the
# one named by actual_date, and store their row-wise sum in a new ytd column.
out <- df %>%
mutate(ytd = rowSums(across(all_of(first_date):all_of(actual_date))))
-output
> out$ytd
[1] 96 184 104 96
A base R approach using match -
# match() returns the positions of the two boundary column names in names(df);
# the `:` sequence between those positions indexes the columns to sum per row.
df$ytd <- rowSums(df[match(first_date, names(df)):match(actual_date, names(df))])
# Print the new column.
df$ytd
#[1] 96 184 104 96

Split Column in 3 columns with R

I'm trying to separate a column into 3 columns.
My code:
library(dplyr)
library(tidyr)
table1 <- read.csv("tablepartipants.csv")
table2 <- tidyr::separate(table1, col = unique_participant, into = c("uID", "gender", "employment"), sep='.')
I always get this error: Expected 3 pieces. Additional pieces discarded in 80 rows [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
This is how the column dataset looks like
All 3 "new" columns are empty...
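Remove the sep part of your command. The default separator of separate() is a regular expression that matches any run of non-alphanumeric characters, so it already splits on the period. Your call fails because sep is interpreted as a regular expression, in which an unescaped . is a special character that matches any character.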
# Example data.frame
table1 <- data.frame(unique_participant = paste0(30:33, c('.male.Student', '.female.Student')))
One option
separate(table1, unique_participant, into = c("uID", "gender", "employment"))
Or using \\. to specify a period.
separate(table1, unique_participant, into = c("uID", "gender", "employment"), sep = '\\.')

mapping over lists with `ends_with` to apply a custom error function

I have a list which looks like:
I am trying to map over it and use the mutate function to apply a custom function. The list is called results and I want to compute an error between the preds and another column in the data frame. The common theme of that column in all the lists is the 1 at the very end of one of the columns.
How can I compute my custom function using contain, ends_with or something similar? The column preds is the same in all data frames.
# Root squared error: element-wise square root of the squared difference
# between x and y.
rse <- function(x, y) {
  squared_diff <- (x - y)^2
  sqrt(squared_diff)
}
x <- map(results, ~mutate(
error = rse(ends_with("1"), preds)
))
Data:
list(`c(5, 19)` = structure(list(date = structure(c(16801, 16802,
16803, 16804, 16805, 16806), class = "Date"), year = c(2016,
2016, 2016, 2016, 2016, 2016), c_farolillo = c(17, 9, 8, 3, 4,
4), plaza_eliptica = c(25, 29, 18, 11, 13, 9), c_farolillo1 = c(17,
9, 8, 3, 4, 4), preds = c(7.08282661437988, 9.66606140136719,
5.95918273925781, 3.81649804115295, 4.26900291442871, 3.38829565048218
)), row.names = c(NA, 6L), class = "data.frame"), `c(7, 1, 2, 18)` = structure(list(
date = structure(c(16801, 16802, 16803, 16804, 16805, 16806
), class = "Date"), year = c(2016, 2016, 2016, 2016, 2016,
2016), pza_del_carmen = c(12, 10, 10, 6, 8, 4), pza_de_espana = c(28,
21, 14, 8, 10, 6), escuelas_aguirre = c(17, 24, 19, 20, 22,
16), retiro = c(6, 5, 7, 3, 2, 2), pza_del_carmen1 = c(12,
10, 10, 6, 8, 4), preds = c(15.3020477294922, 16.007848739624,
15.3953952789307, 9.59985256195068, 9.85349082946777, 8.42792892456055
)), row.names = c(NA, 6L), class = "data.frame"))
We loop over the list of data.frames ('results') with map, then modify the columns whose names end with "1" (selected via ends_with) by applying the rse function, while specifying 'y' as the 'preds' column
library(dplyr)
library(purrr)
# For every data frame in `results`, apply rse() to each column whose name ends
# in "1", using that data frame's `preds` column as `y`. across() replaces the
# superseded mutate_at(); the function is passed in a named list (new = ...) so
# the result is added as a new "<column>_new" column instead of overwriting
# the original column.
results <- map(results, ~ .x %>%
  mutate(across(ends_with("1"), list(new = ~ rse(.x, y = preds)))))

can't add labels to my graph

I have this graph:
I just need to add labels to each colored line.
I need to add to the blue one Forecast Sales and for the red one Historical Sales.
I tried to adapt these examples here but I get many errors. Also, I cannot plot the graph above just by using this code:
To make it reproducible:
dput(df1)
structure(list(Semaine = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
28, 29, 30, 31), M = c(5649.96284329564, 7400.19639744335, 6948.61488673139,
5043.28209277238, 7171.29719525351, 7151.04746494067, 5492.96601941748,
6796.1160130719, 5532.95496473142, 7371.33061889251, 5462.73861171367,
7156.01570964247, 5558.63194819212, 9329.49289405685, 5770.02903225806,
7348.68497576737, 5261.26655896607, 8536.11304909561, 7463.97630586968,
6133.49774339136, 7252.69089929995, 6258.54674403611, 8167.67766497462,
5644.66612816371, 7512.5169628433, 5407.84275713516, 7795.63220247711,
5596.75282714055, 7264.37264404954, 5516.98492191707, 8188.80776699029
> dput(df2)
structure(list(Semaine = c(32, 33.2, 34.4, 35.6, 36.8, 38), M = c(5820.32304669441,
6296.32038834951, 7313.24757281553, 7589.714214588, 8992.35922330097,
9664.95469255663)), .Names = c("Semaine", "M"), row.names = c(NA,
-6L), class = "data.frame")
ggplot() + geom_line(data=df1, aes(x = Semaine, y = M),color = "red") +
stat_smooth(data=df2, aes(x = Semaine, y = M),color = "blue")+
scale_x_continuous(breaks = seq(0,40,1))
Thank you!
# Named colour vector: the names are the keys shown in the legend, the values
# are the line colours. The keys carry the labels asked for in the question
# (red = Historical Sales, blue = Forecast Sales).
cols <- c("Historical Sales" = "red", "Forecast Sales" = "blue")
# Mapping a constant string to `color` inside aes() is what creates the legend
# entry; scale_color_manual() then assigns the actual colour to each key.
ggplot() +
  geom_line(data = df1, aes(x = Semaine, y = M, color = "Historical Sales")) +
  stat_smooth(
    data = df2,
    aes(x = Semaine, y = M, color = "Forecast Sales"),
    method = "loess"
  ) +
  scale_x_continuous(breaks = seq(0, 40, 1)) +
  scale_color_manual(name = "Title", values = cols)

Variable lengths differ error when using lm in R

I am trying to run a linear model on time and a monthly factor, however I am getting the error:
Error in model.frame.default(formula = ts.data ~ time2 + factor(month2), :
variable lengths differ (found for 'time2')
This is how I created the variables:
time2<-seq(along=ts.data)
month2<-rep(1:12,length=length(ts.data))
However running length(time2), length(month2) and length(ts.data) gives the same number, does anyone know how I fix the error?
Trying to run lm :
lm(ts.data~time2+factor(month2))
The data I am using:
structure(c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
64, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 2, 2, 2, 2, 2, 2,
2, 2, 2, 78238, 73928, 70708, 75175, 70744, 65604, 61227, 62635,
47652, 51507, 81874, 98236, 99401, 94756, 94697, 93732, 100334,
139355, 88575, 94169, 86084, 98249, 95321, 87822, 80256, 81875,
86293, 80712, 79533, 82847, 84498, 84185, 78382, 82701, 80491,
91140, 86847, 96727, 101295, 99450, 87783, 101246, 97913, 100081,
96346, 93608, 90648, 99105, 90920, 84960, 82591, 88090, 89980,
87778, 87429, 81898, 77285, 80369, 73193, 65139, 60126, 57219,
94204, 112472, 157199, 154791.5, 154294.88, 161920.63, 147408.75,
134418, 132158.5, 104572.5, 96831, 91045.88, 141182.63, 214759.25,
216647, 184598.38, 210794.38, 182403.75, 193001.63, 176807.38,
186552.63, 201375.88, 181861.25, 193234.88, 187240.25, 168242,
172475.13, 188996.25, 179663.88, 192861.63, 187461.25, 188670.5,
198826.25, 208696.5, 180490.75, 202265.88, 187966.13, 203342.13,
194850.38, 230582.63, 212517, 223432.5, 196511.63, 229582.25,
206120.63, 225629.88, 209769.63, 210797.63, 213215.75, 215144.88,
223266, 230747.63, 228573, 223828.88, 202102.88, 192863.63, 206675.13,
195647.5, 173897.25, 183788.88, 158511.38, 138559, 114163.25,
110399.13, 164751.13, 270772, 90430, 81719, 79183, 85428, 79372,
72361, 66207, 55403, 51693, 60280, 98698, 123059, 121550, 107662,
107863, 107630, 114685, 169659, 100104, 107598, 97728, 112850,
107784, 97580, 92709, 99098, 99482, 100543, 98856, 106081, 108248,
104769, 96966, 100093, 103107, 114944, 108001, 126289, 135213,
129717, 121688, 134421, 127318, 127412, 121922, 119045, 116989,
126286, 116707, 106627, 98219, 111225, 117279, 113725, 114633,
100633, 95478, 98394, 87616, 75329, 68274, 70658, 122995, 145224,
155833, 131896.5, 138340.63, 145610, 130653.13, 122562.75, 115850.5,
91749.88, 81787.13, 85457.5, 142931.63, 214970, 216836.63, 175902,
180757.88, 175233.63, 168982.13, 168727.25, 173501, 182731.38,
152260.63, 182607, 179326.5, 157693.13, 161004.75, 172990.5,
166204.38, 175172.63, 186446.13, 202645.38, 202500.25, 204148.38,
187763.5, 207269.75, 183334.88, 206552.5, 207270.13, 226123.88,
239037.88, 214656.38, 216552.75, 231406.75, 207365.63, 217873.5,
200308.88, 201696.5, 208984.75, 227723.38, 212083.25, 206262.38,
186596.25, 215496.63, 199399, 184933.25, 195925.63, 190318.63,
170375.38, 171624.13, 154537.13, 133532.25, 119179.13, 113297.88,
174946.5, 304690, 108567, 99358, 97299, 103628, 96936, 89254,
83761, 72058, 66685, 74491, 117292, 139878, 139585, 130180, 130079,
127562, 136152, 197149, 118619, 127875, 118094, 134989, 130688,
121475, 112367, 114805, 117087, 118526, 118038, 123988, 127511,
125790, 116702, 123049, 124260, 141232, 133809, 156349, 162637,
158367, 144491, 164389, 155305, 161401, 151829, 144188, 142702,
156405, 141937, 129857, 120318, 132823, 138201, 135058, 129275,
119897, 112924, 120385, 108134, 94062, 86695, 88434, 145426,
167100, 184196.63, 166628.38, 168193.13, 190280.88, 154984.75,
153784.38, 148033.75, 121304.25, 107303.25, 108003.13, 168770,
240983.75, 242817.13, 220229.38, 222805.75, 205068.75, 205204.25,
192598.25, 206565.38, 227284.88, 199258.25, 227122.88, 209076.13,
194855.5, 196357.25, 206865.88, 209580.13, 222190.88, 234610.88,
229339.13, 219321.63, 232571.75, 218584.75, 246116.38, 229563,
256776.75, 257335.25, 271507, 272014, 265850.5, 253426.63, 291759.63,
262608.88, 279417.25, 264583.25, 256634.88, 271024.88, 283927.13,
270597.38, 264222.5, 235009.13, 258379.25, 246485.5, 240163.25,
238369.88, 240961.5, 219826.75, 212077.5, 194937, 166299, 141284.88,
130153.38, 206775, 342062.88), .Dim = c(64L, 8L), .Dimnames = list(
NULL, c("Week_Number", "Campaign_Period", "Control_Traffic",
"Control_Revenue", "VOD_Test_Traffic", "VOD_Test_Revenue",
"TV_Test_Traffic", "TV_Test_revenue")), .Tsp = c(1, 2.21153846153846,
52), class = c("mts", "ts", "matrix"))
If we are creating grouping variables based on the number of rows, we need to change the 'month2' and 'time2'.
# ts.data is a multi-column series, so size the regressors by its number of
# rows (one entry per observation) rather than by length().
time2 <- seq_len(nrow(ts.data))
month2 <- rep(1:12, length.out = nrow(ts.data))
# Regress all series at once on the time index and the month factor, then
# show the coefficient matrix.
res <- lm(ts.data ~ time2 + factor(month2))
coef(res)
# Week_Number Campaign_Period Control_Traffic Control_Revenue VOD_Test_Traffic VOD_Test_Revenue TV_Test_Traffic TV_Test_revenue
#(Intercept) 0.0000000000000213162821 3.384444444 78799.578 157220.4207 87712.9656 148735.2930 106055.5914 177326.337
#time2 0.9999999999999994448885 -0.017777778 123.605 727.3023 383.8344 966.6405 465.7336 1229.196
#factor(month2)2 0.0000000000000152835379 0.017777778 -1177.438 2858.4910 -1320.1678 -8588.9322 -1128.5669 -6726.196
#factor(month2)3 0.0000000000000008207055 0.035555556 5419.457 10544.3136 6779.6644 346.1573 7382.8661 1173.044
#factor(month2)4 0.0000000000000016587917 0.053333333 8603.018 27254.5313 12531.1633 23542.1418 12777.1325 28186.515
#factor(month2)5 0.0000000000000012801265 -0.268888889 3290.677 6985.0394 7531.8356 3638.7607 7201.7339 5969.374
#factor(month2)6 0.0000000000000020182926 -0.251111111 12858.272 5428.8610 20021.4011 4320.5682 21940.0003 7326.704
#factor(month2)7 0.0000000000000016906446 -0.233333333 1297.067 6299.8347 3690.1667 327.4537 2400.8667 -451.516
#factor(month2)8 0.0000000000000016516546 -0.015555556 1838.662 6690.3563 -832.6678 -2303.4348 445.1331 3647.310
#factor(month2)9 0.0000000000000015682557 0.002222222 -5728.743 -12651.4220 -7622.1022 -22135.3253 -8178.0006 -15978.562
#factor(month2)10 0.0000000000000003302248 0.020000000 -1715.348 -5722.0704 -2630.9367 -11870.0938 -2470.1342 -9128.055
#factor(month2)11 0.0000000000000022008184 0.037777778 1179.647 -5052.7747 1691.6289 -8744.7323 2258.9322 -5674.003
#factor(month2)12 0.0000000000000033608693 0.055555556 5039.042 4606.1469 5908.5944 4559.9012 7788.3986 6991.025

Resources