I have a dataset that looks like this:
> head(df)
# A tibble: 6 × 3
id tstart tstop
<dbl> <dttm> <dttm>
1 115 2016-01-04 19:14:06 2016-01-04 19:14:15
2 115 2016-01-04 19:14:15 2016-01-04 19:14:16
3 115 2016-01-04 19:14:16 2016-01-04 20:00:00
4 115 2016-01-04 20:00:00 2016-01-04 23:32:06
5 119 2016-01-09 12:56:49 2016-01-09 13:09:38
6 119 2016-01-09 19:21:30 2016-01-09 19:26:48
> dput(df)
structure(list(id = c(115, 115, 115, 115, 119, 119, 119, 119,
115, 119, 115, 115, 119, 119, 115, 115, 115, 115, 119, 115, 115,
119, 119, 115, 115, 119, 119, 119, 119, 119, 119, 119, 119, 119,
119, 115, 119, 119, 115, 119, 119, 115, 119, 115, 115, 115, 115,
115), tstart = structure(c(1451960046, 1451960055, 1451960056,
1451962800, 1452369409, 1452392490, 1452656773, 1452768075, 1453117929,
1453158614, 1453211410, 1453241664, 1453472208, 1453501656, 1453683210,
1453859618, 1453923350, 1454160212, 1454185221, 1454334295, 1454667974,
1454893810, 1455228853, 1455498598, 1455551174, 1455586503, 1455652857,
1455747333, 1455965433, 1456053421, 1456137889, 1456482398, 1456590733,
1456839351, 1456945452, 1457003430, 1457099049, 1457108703, 1457445523,
1457478749, 1457480525, 1457542159, 1457562948, 1458598425, 1458822311,
1458940977, 1459028316, 1459083563), class = c("POSIXct", "POSIXt"
), tzone = ""), tstop = structure(c(1451960055, 1451960056, 1451962800,
1451975526, 1452370178, 1452392808, 1452656986, 1452768517, 1453118186,
1453158918, 1453211770, 1453242132, 1453472619, 1453502485, 1453683500,
1453859899, 1453923567, 1454161008, 1454185580, 1454334848, 1454668930,
1454894182, 1455229448, 1455499217, 1455552432, 1455587211, 1455653538,
1455747987, 1455965658, 1456053774, 1456138469, 1456482801, 1456591336,
1456839506, 1456945790, 1457003644, 1457099216, 1457109800, 1457445783,
1457480525, 1457480533, 1457542907, 1457563544, 1458598877, 1458822887,
1458941209, 1459028558, 1459083990), class = c("POSIXct", "POSIXt"
))), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-48L), .Names = c("id", "tstart", "tstop"))
> head(df)
# A tibble: 6 × 3
id tstart tstop
<dbl> <dttm> <dttm>
1 115 2016-01-04 19:14:06 2016-01-04 19:14:15
2 115 2016-01-04 19:14:15 2016-01-04 19:14:16
3 115 2016-01-04 19:14:16 2016-01-04 20:00:00
4 115 2016-01-04 20:00:00 2016-01-04 23:32:06
5 115 2016-01-18 04:52:09 2016-01-18 04:56:26
6 115 2016-01-19 06:50:10 2016-01-19 06:56:10
I'm trying to create an event sequence, event.seq, where a row belongs to the same event as the previous row when it is a direct continuation in time, i.e. its tstart equals the previous row's tstop. The sequence resets at every change of id. The data frame I'm trying to end up with is:
> dput(df.out)
structure(list(id = c(115, 115, 115, 115, 115, 115, 115, 115,
115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115,
115, 115, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119,
119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119,
119), tstart = structure(c(1451960046, 1451960055, 1451960056,
1451962800, 1453117929, 1453211410, 1453241664, 1453683210, 1453859618,
1453923350, 1454160212, 1454334295, 1454667974, 1455498598, 1455551174,
1457003430, 1457445523, 1457542159, 1458598425, 1458822311, 1458940977,
1459028316, 1459083563, 1452369409, 1452392490, 1452656773, 1452768075,
1453158614, 1453472208, 1453501656, 1454185221, 1454893810, 1455228853,
1455586503, 1455652857, 1455747333, 1455965433, 1456053421, 1456137889,
1456482398, 1456590733, 1456839351, 1456945452, 1457099049, 1457108703,
1457478749, 1457480525, 1457562948), class = c("POSIXct", "POSIXt"
), tzone = "UTC"), tstop = structure(c(1451960055, 1451960056,
1451962800, 1451975526, 1453118186, 1453211770, 1453242132, 1453683500,
1453859899, 1453923567, 1454161008, 1454334848, 1454668930, 1455499217,
1455552432, 1457003644, 1457445783, 1457542907, 1458598877, 1458822887,
1458941209, 1459028558, 1459083990, 1452370178, 1452392808, 1452656986,
1452768517, 1453158918, 1453472619, 1453502485, 1454185580, 1454894182,
1455229448, 1455587211, 1455653538, 1455747987, 1455965658, 1456053774,
1456138469, 1456482801, 1456591336, 1456839506, 1456945790, 1457099216,
1457109800, 1457480525, 1457480533, 1457563544), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), event.seq = c(1, 1, 1, 1, 2, 3, 4,
5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1,
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
20, 21, 22, 23, 23, 24)), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -48L), .Names = c("id", "tstart", "tstop",
"event.seq"))
> head(df.out)
# A tibble: 6 × 4
id tstart tstop event.seq
<dbl> <dttm> <dttm> <dbl>
1 115 2016-01-05 02:14:06 2016-01-05 02:14:15 1
2 115 2016-01-05 02:14:15 2016-01-05 02:14:15 1
3 115 2016-01-05 02:14:15 2016-01-05 03:00:00 1
4 115 2016-01-05 03:00:00 2016-01-05 06:32:06 1
5 115 2016-01-18 11:52:09 2016-01-18 11:56:26 2
6 115 2016-01-19 13:50:10 2016-01-19 13:56:09 3
This gets me closer, but not quite what I want:
df.2 <- df %>%
  arrange(id, tstart) %>%
  mutate(tstart.ahead = lead(tstart)) %>%
  mutate(tstop.behind = lag(tstop)) %>%
  mutate(event.seq.1 = as.numeric(tstop == tstart.ahead),
         event.seq.2 = as.numeric(tstart == tstop.behind)) %>%
  mutate(event.seq = pmax(event.seq.1, event.seq.2, na.rm = TRUE)) %>%
  select(id, tstart, tstop, event.seq)
This is a little tricky. Since you want the sequence to reset for each id, we'll definitely need group_by(id). Then we'll create a column indicating whether each row is not a continuation of the previous row, and take the cumsum of that indicator: if a row is not a continuation, 1 is added and event.seq goes up; if it is a continuation, 0 is added and event.seq stays the same. We add 1 so the sequence starts at 1 rather than 0.
library(dplyr)
df.2 <- df %>%
  arrange(id, tstart) %>%
  group_by(id) %>%
  # flag rows that do NOT continue the previous row; the first row of each id gets 0
  mutate(not_continued = c(0, (lag(tstop) != tstart)[-1]),
         event.seq = 1 + cumsum(not_continued)) %>%
  select(-not_continued)
all.equal(df.2, df.out)
# [1] TRUE
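As an aside, the same logic can be written a little more compactly with coalesce(), which turns the leading NA from lag() into FALSE. This is just a sketch of an equivalent variant, not part of the verified answer above:
library(dplyr)

df.3 <- df %>%
  arrange(id, tstart) %>%
  group_by(id) %>%
  # a new event starts whenever tstart does not pick up exactly where the previous tstop ended
  mutate(event.seq = 1 + cumsum(coalesce(tstart != lag(tstop), FALSE))) %>%
  ungroup()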
I'm having trouble tidying a table scraped from a website.
I want to get the table (with headers V1 to V5) from the link below, but I haven't been able to convert it into that format in RStudio.
This is what I'm doing:
url <- "https://www.r-bloggers.com/2018/08/using-control-charts-in-r/"
library(rvest)
library(tidyverse)
h <- read_html(url)
tab <- h %>% html_nodes("table")
tab <- tab[[2]] %>% html_table()
tab <- separate_rows(tab, 1, sep = " ")
tab <- tab[8:132,]
tab <- as.data.frame(tab)
tab1 <- data.frame(c("V1", "V2", "V3", "V4", "V5"))
tab1 <- tab1 %>% setNames("Cat")
tab2 <- cbind(tab1,tab)
tab3 <- tab2 %>% spread(key = Cat, X1)
Here is the result
Error: Each row of output must be identified by a unique combination of keys.
Keys are shared for 125 rows:
* 1, 6, 11, 16, 21, 26, 31, 36, 41, 46, 51, 56, 61, 66, 71, 76, 81, 86, 91, 96, 101, 106, 111, 116, 121
* 2, 7, 12, 17, 22, 27, 32, 37, 42, 47, 52, 57, 62, 67, 72, 77, 82, 87, 92, 97, 102, 107, 112, 117, 122
* 3, 8, 13, 18, 23, 28, 33, 38, 43, 48, 53, 58, 63, 68, 73, 78, 83, 88, 93, 98, 103, 108, 113, 118, 123
* 4, 9, 14, 19, 24, 29, 34, 39, 44, 49, 54, 59, 64, 69, 74, 79, 84, 89, 94, 99, 104, 109, 114, 119, 124
* 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100, 105, 110, 115, 120, 125
So what should I do to get the same table as on the website?
And if you can think of a better way to get the table from this website, please tell me.
P.S.: I'm learning R programming on my own, so please teach me!
Cheers.
Here's a way:
library(rvest)
url <- "https://www.r-bloggers.com/2018/08/using-control-charts-in-r/"
url %>%
read_html %>%
html_nodes('table') %>%
.[[2]] %>%
html_table() %>%
dplyr::pull(X1) %>%
stringr::str_extract_all('\\d+\\.\\d+') %>%
.[[1]] %>%
matrix(ncol = 5, byrow = TRUE) %>%
as.data.frame() %>% type.convert() -> tab
tab
# V1 V2 V3 V4 V5
#1 1.45 1.56 1.40 1.45 1.33
#2 1.75 1.53 1.55 1.42 1.42
#3 1.60 1.41 1.35 1.52 1.36
#4 1.53 1.58 1.54 1.71 1.55
#5 1.48 1.34 1.64 1.59 1.46
#6 1.69 1.55 1.49 1.61 1.47
#...
#...
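For what it's worth, the spread() error in the original attempt comes from the lack of a column identifying which output row each value belongs to. A sketch that stays closer to the question's own code, building on its tab2 object and assuming the 125 values really are in row-major order, would be:
library(tidyverse)
tab3 <- tab2 %>%
  group_by(Cat) %>%
  mutate(row = row_number()) %>%   # which output row each value belongs to
  ungroup() %>%
  spread(key = Cat, value = X1) %>%
  select(-row) %>%
  type.convert(as.is = TRUE)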
I would like to modify the answer to the question here, or find a new solution, so that it also produces a column showing the second largest consecutive run of "0". My sample data and code are below; the function operates on the month columns, and the "Second largest run" column is what I hope to add. I am working with a large dataset, so the more efficient the better. Any ideas are appreciated, thanks.
sample data
test <- structure(list(ID = c(1, 2, 3, 4, 5, 6, 7, 8, 9), V1 = c("A",
"B", "A", "B", "B", "A", "A", "B", "B"), V2 = c(21, 233, 185,
85, 208, 112, 238, 66, 38), V3 = c(149, 250, 218, 104, 62, 19,
175, 168, 28), Jan = c(10, 20, 10, 12, 76, 28, 137, 162, 101),
Feb = c(20, 25, 15, 0, 89, 0, 152, 177, 119), March = c(0,
28, 20, 14, 108, 0, 165, 194, 132), April = c(0, 34, 25,
16, 125, 71, 181, 208, 149), May = c(25, 0, 30, 22, 135,
0, 191, 224, 169), June = c(29, 0, 35, 24, 145, 0, 205, 244,
187), July = c(34, 0, 40, 28, 163, 0, 217, 256, 207), August = c(37,
0, 45, 29, 173, 0, 228, 276, 221), Sep = c(0, 39, 50, 31,
193, 0, 239, 308, 236), Oct = c(0, 48, 55, 35, 210, 163,
252, 0, 247), Nov = c(48, 55, 60, 40, 221, 183, 272, 0, 264
), Dec = c(50, 60, 65, 45, 239, 195, 289, 0, 277), `Second largest run` = c(1,
NA, NA, NA, NA, 2, NA, NA, NA), result = c(2, 4, -Inf, 1,
-Inf, 5, -Inf, 3, -Inf)), row.names = c(NA, -9L), class = c("tbl_df",
"tbl", "data.frame"))
code
most_consecutive_val <- function(x, val = 0) {
  with(rle(x), max(lengths[values == val]))
}
test$result <- apply(test[, -c(1:4, 17)], MARGIN = 1, most_consecutive_val)
Rather than taking the max of the run lengths from the run length encoding (rle) function, we want to sort the lengths and then extract the desired index. We'll get NAs when we request an index that doesn't exist -- for example, where there isn't a second run of zeroes in row 2.
ordered_runs = function(x, val = 0, idx = 1) {
with(rle(x), sort(lengths[values == val], decreasing = TRUE))[idx]
}
test$result_1 <- apply(test[,-c(1:4,17:18)], MARGIN = 1, ordered_runs, idx = 1)
test$result_2 <- apply(test[,-c(1:4,17:18)], MARGIN = 1, ordered_runs, idx = 2)
The output is slightly different from what you expected -- (1) it uses NAs rather than -Inf, and (2) in your first row, I believe there is a tie, with a second run of 2 zeroes.
> test[,c(1,17:20)]
# A tibble: 9 x 5
ID `Second largest run` result result_1 result_2
<dbl> <dbl> <dbl> <int> <int>
1 1 1 2 2 2
2 2 NA 4 4 NA
3 3 NA -Inf NA NA
4 4 NA 1 1 NA
5 5 NA -Inf NA NA
6 6 2 5 5 2
7 7 NA -Inf NA NA
8 8 NA 3 3 NA
9 9 NA -Inf NA NA
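To see where that tie comes from, here is the helper applied by hand to the month values of row 1 (just an illustration, not part of the answer):
x <- c(10, 20, 0, 0, 25, 29, 34, 37, 0, 0, 48, 50)   # Jan..Dec for ID 1
with(rle(x), sort(lengths[values == 0], decreasing = TRUE))
# [1] 2 2    -> idx = 1 returns 2, idx = 2 returns 2, idx = 3 would return NA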
Here is an option using data.table, which should be quite fast for the OP's large dataset and also identifies all runs of zeros simultaneously:
library(data.table)
setDT(DF)
cols <- c("Jan", "Feb", "March", "April", "May", "June", "July", "August", "Sep", "Oct", "Nov", "Dec")
#convert into a long format
m <- melt(DF, measure.vars=cols)[
#identify consecutive sequences of the same number and count
order(ID), c("rl", "rw") := .(rl <- rleid(ID, value), rowid(rl))][
#extract the last element where values = 0 (that is the length of sequences of zeros)
value == 0L, .(ID=ID[.N], len=rw[.N]), rl][
#sort in descending order for length of sequences
order(ID, -len)]
#pivot into wide format and perform an update join
wide <- dcast(m, ID ~ rowid(ID), value.var="len")
DF[wide, on=.(ID), (names(wide)) := mget(names(wide))]
output:
ID V1 V2 V3 Jan Feb March April May June July August Sep Oct Nov Dec 1 2
1: 1 A 21 149 10 20 0 0 25 29 34 37 0 0 48 50 2 2
2: 2 B 233 250 20 25 28 34 0 0 0 0 39 48 55 60 4 NA
3: 3 A 185 218 10 15 20 25 30 35 40 45 50 55 60 65 NA NA
4: 4 B 85 104 12 0 14 16 22 24 28 29 31 35 40 45 1 NA
5: 5 B 208 62 76 89 108 125 135 145 163 173 193 210 221 239 NA NA
6: 6 A 112 19 28 0 0 71 0 0 0 0 0 163 183 195 5 2
7: 7 A 238 175 137 152 165 181 191 205 217 228 239 252 272 289 NA NA
8: 8 B 66 168 162 177 194 208 224 244 256 276 308 0 0 0 3 NA
9: 9 B 38 28 101 119 132 149 169 187 207 221 236 247 264 277 NA NA
data:
DF <- structure(list(ID = c(1, 2, 3, 4, 5, 6, 7, 8, 9), V1 = c("A",
"B", "A", "B", "B", "A", "A", "B", "B"), V2 = c(21, 233, 185,
85, 208, 112, 238, 66, 38), V3 = c(149, 250, 218, 104, 62, 19,
175, 168, 28), Jan = c(10, 20, 10, 12, 76, 28, 137, 162, 101),
Feb = c(20, 25, 15, 0, 89, 0, 152, 177, 119), March = c(0,
28, 20, 14, 108, 0, 165, 194, 132), April = c(0, 34, 25,
16, 125, 71, 181, 208, 149), May = c(25, 0, 30, 22, 135,
0, 191, 224, 169), June = c(29, 0, 35, 24, 145, 0, 205, 244,
187), July = c(34, 0, 40, 28, 163, 0, 217, 256, 207), August = c(37,
0, 45, 29, 173, 0, 228, 276, 221), Sep = c(0, 39, 50, 31,
193, 0, 239, 308, 236), Oct = c(0, 48, 55, 35, 210, 163,
252, 0, 247), Nov = c(48, 55, 60, 40, 221, 183, 272, 0, 264
), Dec = c(50, 60, 65, 45, 239, 195, 289, 0, 277), `1` = c(2L,
4L, NA, 1L, NA, 5L, NA, 3L, NA), `2` = c(2L, NA, NA, NA,
NA, 2L, NA, NA, NA)), row.names = c(NA, -9L), class = "data.frame")
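Optionally, the two new columns can be given clearer names afterwards; a small sketch (the names here are my own choice, mirroring the question's wording):
setnames(DF, c("1", "2"), c("Longest run", "Second largest run"))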
I have a large data frame which includes impoT and nlc as keys (ignore t altogether), and other columns which each hold a number. I want to find, for each impoT and nlc pair, the average of all the other columns, basically a row mean. A subset of my data, which only includes one nlc, is given at the end. The last thing I tried was:
avg <- data.frame(a %>% group_by(impoT, nlc) %>% select(-c(1:3)) %>% mutate(r= rowMeans(.) ))
stds = (a %>% group_by(impoT, nlc) %>% select(-c(1:3)) %>% apply( 1, sd)) #wrong
dput(a)
structure(list(impoT = 1:18, nlc = c(669L, 669L, 669L, 669L,
669L, 669L, 669L, 669L, 669L, 669L, 669L, 669L, 669L, 669L, 669L,
669L, 669L, 669L), t = c(102L, 118L, 134L, 150L, 166L, 182L,
198L, 214L, 230L, 246L, 262L, 278L, 294L, 310L, 326L, 342L, 358L,
374L), X11950 = c(6, 14, 40, 53, 59, 70, 118, 119, 111, 114,
103, 220, 278, 94, 28, 13, 5, 8), X11951 = c(4, 18, 41, 64, 78,
87, 140, 112, 113, 129, 112, 245, 322, 102, 52, 20, 15, 7), X11952 = c(8,
13, 30, 42, 52, 86, 126, 118, 52, 87, 116, 251, 262, 101, 35,
21, 15, 21), X11955 = c(9, 11, 47, 38, 39, 70, 95, 82, 80, 77,
77, 142, 192, 78, 13, 13, 5, 0), X11956 = c(14, 13, 44, 65, 65,
72, 125, 138, 117, 111, 104, 175, 282, 93, 28, 14, 8, 4), X11957 = c(10,
7, 45, 42, 50, 83, 123, 102, 104, 82, 102, 234, 265, 101, 23,
13, 7, 6), X11958 = c(10, 13, 42, 60, 68, 69, 106, 125, 104,
103, 112, 233, 310, 128, 50, 22, 10, 5), X11959 = c(7, 11, 32,
45, 63, 74, 119, 87, 121, 108, 104, 229, 266, 111, 46, 26, 22,
11), X11962 = c(8, 12, 38, 35, 49, 58, 96, 66, 73, 109, 82, 161,
192, 75, 22, 4, 2, 3), X11963 = c(8, 9, 39, 40, 56, 50, 142,
98, 102, 78, 79, 220, 229, 87, 25, 5, 7, 2), X11964 = c(10, 9,
42, 60, 53, 52, 105, 114, 96, 94, 95, 180, 268, 114, 23, 10,
7, 10), X11965 = c(9, 9, 41, 40, 61, 81, 150, 102, 102, 121,
125, 222, 347, 116, 37, 18, 3, 4), X11966 = c(10, 9, 34, 43,
49, 73, 112, 123, 102, 92, 107, 207, 239, 115, 60, 18, 15, 5),
X11969 = c(8, 9, 31, 34, 41, 51, 93, 92, 68, 103, 76, 166,
182, 63, 24, 14, 6, 4), X11970 = c(7, 12, 33, 48, 56, 59,
102, 88, 99, 86, 103, 194, 233, 90, 25, 13, 7, 3), X11971 = c(9,
16, 37, 60, 78, 62, 114, 106, 129, 107, 91, 212, 272, 88,
31, 10, 3, 3), X12088 = c(6, 11, 41, 44, 56, 70, 106, 97,
64, 73, 75, 161, 186, 76, 17, 8, 2, 2), X12089 = c(0, 11,
53, 59, 62, 64, 114, 109, 109, 100, 66, 222, 241, 88, 19,
8, 8, 3), X12090 = c(4, 12, 57, 52, 65, 73, 132, 109, 120,
101, 104, 227, 238, 99, 17, 8, 10, 8), X12091 = c(4, 16,
54, 167, 74, 62, 111, 95, 120, 102, 92, 227, 317, 106, 44,
16, 10, 4), X12092 = c(9, 10, 50, 55, 63, 64, 130, 103, 98,
116, 83, 249, 279, 88, 35, 36, 22, 15), X12095 = c(5, 15,
39, 44, 53, 58, 95, 92, 67, 63, 69, 163, 182, 69, 20, 8,
4, 2), X12096 = c(3, 14, 49, 53, 71, 70, 107, 130, 90, 89,
101, 214, 253, 100, 30, 10, 3, 3), X12097 = c(2, 16, 53,
61, 82, 83, 123, 124, 125, 98, 89, 220, 274, 107, 20, 17,
7, 5), X12098 = c(6, 17, 56, 59, 51, 77, 102, 115, 93, 98,
83, 221, 288, 97, 36, 16, 9, 10), X12099 = c(2, 16, 39, 49,
60, 84, 112, 91, 102, 103, 108, 246, 261, 131, 49, 24, 18,
14), X12102 = c(4, 12, 29, 47, 64, 69, 104, 111, 92, 72,
105, 174, 179, 64, 16, 10, 2, 1)), .Names = c("impoT", "nlc",
"t", "X11950", "X11951", "X11952", "X11955", "X11956", "X11957",
"X11958", "X11959", "X11962", "X11963", "X11964", "X11965", "X11966",
"X11969", "X11970", "X11971", "X12088", "X12089", "X12090", "X12091",
"X12092", "X12095", "X12096", "X12097", "X12098", "X12099", "X12102"
), row.names = c(NA, -18L), class = "data.frame")
It's easiest if you split your means into two steps, as you're actually taking the mean of irregular groups: first each row, and then each group. This means you're taking means of means, but since each row mean is over the same number of values, that's fine here; just keep in mind that the group means may be means of different numbers of rows.
You also need to nest the select so you don't lose your grouping variables, and use summarise to collapse the groups. All told,
a %>% mutate(r = rowMeans(select(a, -c(1:3)))) %>%
group_by(impoT, nlc) %>% summarise(r = mean(r))
produces
Source: local data frame [18 x 3]
Groups: impoT [?]
impoT nlc r
(int) (int) (dbl)
1 1 669 6.740741
2 2 669 12.407407
3 3 669 42.074074
4 4 669 54.037037
5 5 669 59.925926
6 6 669 69.296296
7 7 669 114.888889
8 8 669 105.481481
9 9 669 98.259259
10 10 669 96.888889
11 11 669 94.925926
12 12 669 207.962963
13 13 669 253.222222
14 14 669 95.592593
15 15 669 30.555556
16 16 669 14.629630
17 17 669 8.592593
18 18 669 6.037037
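The question's code also tries to compute per-row standard deviations; a sketch along the same lines (my own addition, not tested against this data) would be
a %>%
  mutate(r = rowMeans(select(a, -c(1:3))),
         s = apply(select(a, -c(1:3)), 1, sd)) %>%
  group_by(impoT, nlc) %>%
  summarise(r = mean(r), s = mean(s))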
I have a Data Table with three columns: seller, product and price.
Example data:
seller product price
1: A banana 56
2: A lemon 94
3: A orange 84
4: A banana 11
5: A lemon 86
---
166: C orange 162
167: C banana 109
168: C orange 61
169: C banana 141
170: C orange 22
Code for the data
require(data.table)
DT <- data.table(seller = c(rep(c("A"),60),rep(c("B"),62),rep(c("C"),48)), product = c(rep(c("banana", "lemon", "orange"), 20), rep(c("banana", "lemon"), 31), rep(c("banana", "orange"), 24)),
price = c(56, 94, 84, 11, 86, 103, 151, 51, 117, 71, 63, 101, 45, 147, 135, 93, 26, 164, 90, 67, 12, 34, 14, 131, 92, 145, 48, 74, 62, 57, 20, 80, 113, 46, 88, 102, 134, 98, 137, 123, 169, 133, 146,
160, 58, 42, 52, 158, 170, 2, 152, 10, 130, 30, 33, 144, 73, 41, 139, 107, 163, 9, 66, 81, 79, 127, 40, 165, 106, 161, 16, 1, 112, 70, 115, 138, 76, 105, 17, 118, 114, 121, 25, 39, 15, 155, 50, 166,
100, 159, 5, 19, 29, 24, 64, 149, 120, 35, 119, 53, 21, 7, 72, 132, 154, 168, 156, 38, 3, 148, 69, 44, 6, 28, 140, 77, 104, 153, 59, 142, 116, 150, 97, 31, 91, 43, 47, 27, 143, 99, 37, 54, 49, 4, 111,
32, 23, 85, 167, 136, 78, 129, 83, 124, 36, 96, 110, 13, 65, 108, 8, 18, 157, 87, 82, 60, 122, 89, 125, 68, 75, 126, 128, 55, 95, 162, 109, 61, 141, 22))
I would like to perform pairwise t-tests between all combinations of sellers that sell the same product.
I would like to have an output as shown below (hypothetical p-values for the example).
Desired output:
seller.x seller.y product p.value
A B banana 0.45
A B lemon 0.87
B C banana 0.03
A C banana 0.23
A C orange 0.01
You first need to group by product. Then, in your j parameter, compute the combinations of sellers for that product and take the p-value of the t-test of price between seller.x and seller.y:
DT[
  , {
    # all pairs of sellers offering this product
    sellercomb <- data.table(t(combn(unique(seller), 2)))
    names(sellercomb) <- c("seller.x", "seller.y")
    sellercomb[
      , {
        # Welch t-test of price between the two sellers in the pair
        data.table(p.value = t.test(price[seller == seller.x],
                                    price[seller == seller.y])$p.value)
      }
      , by = .(seller.x, seller.y)
    ]
  }
  , by = .(product)
]
The result for your data above looks like this:
product seller.x seller.y p.value
1: banana A B 0.9384329
2: banana A C 0.2413946
3: banana B C 0.2154216
4: lemon A B 0.7282811
5: orange A C 0.0354320
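As a quick sanity check (not part of the original answer), any cell can be reproduced with a direct call; for example, the banana A vs. B comparison:
t.test(DT[seller == "A" & product == "banana", price],
       DT[seller == "B" & product == "banana", price])$p.value
# should match the first p.value above (0.9384329)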
I'm new to R and practicing using the Titanic data set from Kaggle. I am attempting to separate last name, first name, salutation, and extra information into separate columns so that I can try to categorize the age of the passengers - adult or child.
The following is sample data from the Train data set:
head(traindf,5)
# Source: local data frame [5 x 12]
#
# PassengerId Survived Pclass
# 1 1 0 3
# 2 2 1 1
# 3 3 1 3
# 4 4 1 1
# 5 5 0 3
# Variables not shown: Name (chr), Sex (fctr), Age (dbl), SibSp (int), Parch
# (int), Ticket (fctr), Fare (dbl), Cabin (fctr), Embarked (fctr)
The following is a sample that includes the Name:
select(traindf,Survived,Pclass,Name,Sex)
# Source: local data frame [891 x 4]
#
# Survived Pclass Name Sex
# 1 0 3 Braund, Mr. Owen Harris male
# 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female
# 3 1 3 Heikkinen, Miss. Laina female
# 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female
# 5 0 3 Allen, Mr. William Henry male
# 6 0 3 Moran, Mr. James male
# 7 0 1 McCarthy, Mr. Timothy J male
# 8 0 3 Palsson, Master. Gosta Leonard male
# 9 1 3 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female
# 10 1 2 Nasser, Mrs. Nicholas (Adele Achem) female
I can use the following code to separate last name from the rest of the column:
require(tidyr) # for the separate() function
traindfnames <- traindf %>%
separate(Name, c("Lastname","Salutation"), sep = ",")
traindfnames
# Source: local data frame [891 x 13]
#
# PassengerId Survived Pclass Lastname
# 1 1 0 3 Braund
# 2 2 1 1 Cumings
# 3 3 1 3 Heikkinen
# 4 4 1 1 Futrelle
# 5 5 0 3 Allen
# 6 6 0 3 Moran
# 7 7 0 1 McCarthy
# 8 8 0 3 Palsson
# 9 9 1 3 Johnson
# 10 10 1 2 Nasser
# .. ... ... ... ...
# Variables not shown: Salutation (chr), Sex (fctr), Age (dbl), SibSp (int),
# Parch (int), Ticket (fctr), Fare (dbl), Cabin (fctr), Embarked (fctr)
However, when I try to add a field for First Name:
traindfnames <- traindf %>%
separate(Name, c("Lastname","Salutation","firstname"), sep =",,")
I get this error:
# Error: Values not split into 3 pieces at 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 2
Am I using incorrect syntax, or is it not possible to split one column into three fields?
Having looked at this data, I think the easiest way to do it is using something like str_match() from package stringr. If you assume data$Name is in the form
"[Lastname], [Salutation]. [Firstname]"
the regular expression to match this is
str_match(data$Name, "([A-Za-z]*),\\s([A-Za-z]*)\\.\\s(.*)")
# [,1] [,2] [,3] [,4]
# [1,] "Braund, Mr. Owen Harris" "Braund" "Mr" "Owen Harris"
# [2,] "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Cumings" "Mrs" "John Bradley (Florence Briggs Thayer)"
# [3,] "Heikkinen, Miss. Laina" "Heikkinen" "Miss" "Laina"
# [4,] "Futrelle, Mrs. Jacques Heath (Lily May Peel)" "Futrelle" "Mrs" "Jacques Heath (Lily May Peel)"
# [5,] "Allen, Mr. William Henry" "Allen" "Mr" "William Henry"
# [6,] "Moran, Mr. James" "Moran" "Mr" "James"
So you need to add columns 2 to 4 above to your original data frame. I am not sure you can do this with separate actually. Writing
separate(data, Name, c("Lastname", "Salutation", "Firstname"), sep = "[,\\.]")
will try to split each entry by comma or dot, but it runs into a problem in the 514th entry that looks like "Rothschild, Mrs. Martin (Elizabeth L. Barrett)" (notice the second dot).
In short, the easiest way I can see of doing what you want is
data[c("Firstname", "Salutation", "Lastname")] <-
str_match(data$Name, "([A-Za-z]*),\\s([A-Za-z]*)\\.\\s(.*)")[, 2:4]
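If you would rather stay within tidyr, extract() accepts a full regular expression with capture groups, so the same split can be written as below (a sketch using the regex above on the question's traindf; rows whose Name doesn't match the pattern will come back as NA):
library(tidyr)
traindf <- extract(traindf, Name,
                   into = c("Lastname", "Salutation", "Firstname"),
                   regex = "([A-Za-z]*),\\s([A-Za-z]*)\\.\\s(.*)",
                   remove = FALSE)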