Operate with a table in R - r

I want to get a tuple from a table in R.
For example, if i have this table named batch_task:
arrival_time, departure_time, jod_id, task_id
11792, 11999, 18, 88
11792, 14331, 18, 82
11792, 12112, 18, 91
16281, 16552, 27, 147
16281, 16396, 27, 139
16281, 16529, 27, 137
So, for each job_id i need a tuple {arrival_time, service_time}; for example, for the job_id = 18, i want to get the tuple {11792, (11999-11792)+(14331-11792)+(12112-11792)} = {11792, 3066}.
Anyone could help me? Thanks in advance,
Jesús

Assuming that batch_task is actually a data frame rather than a table (which is a specific type of R structure dealing with count data), here's a solution using the dplyr package:
library(dplyr)
batch_task %>%
group_by(arrival_time) %>%
summarise(service_time = sum(departure_time - arrival_time), .groups = "drop")
#> # A tibble: 2 x 2
#> arrival_time service_time
#> <int> <int>
#> 1 11792 3066
#> 2 16281 634
Note that there is no data structure called a tuple in R. There are various ways to represent tuples, but the most sensible way in this case would be to keep the result in a data frame format.
Data
batch_task <- structure(list(arrival_time = c(11792L, 11792L, 11792L, 16281L,
16281L, 16281L), departure_time = c(11999L, 14331L, 12112L, 16552L,
16396L, 16529L), jod_id = c(18L, 18L, 18L, 27L, 27L, 27L), task_id = c(88L,
82L, 91L, 147L, 139L, 137L)), class = "data.frame", row.names = c(NA,
-6L))

With the data
batch_task <- structure(list(arrival_time = c(11792L, 11792L, 11792L, 16281L,
16281L, 16281L),
departure_time = c(11999L, 14331L, 12112L, 16552L,
16396L, 16529L),
jod_id = c(18L, 18L, 18L, 27L, 27L, 27L),
task_id = c(88L,
82L, 91L, 147L, 139L, 137L)),
class = "data.frame", row.names = c(NA,
-6L))
I propose this solution
library(tidyverse)
batch_task %>%
mutate(service_time = departure_time - arrival_time) %>%
group_by(jod_id, arrival_time) %>% #assuming arrival_time is equal for each job_id
summarise(service_time = sum(service_time))

Using by.
do.call(rbind, by(d, d$jod_id, function(x) {
a <- x$arrival_time[1]
c(a, sum(x$departure_time - a))
}))
# [,1] [,2]
# 18 11792 3066
# 27 16281 634
Data:
d <- structure(list(arrival_time = c(11792L, 11792L, 11792L, 16281L,
16281L, 16281L), departure_time = c(11999L, 14331L, 12112L, 16552L,
16396L, 16529L), jod_id = c(18L, 18L, 18L, 27L, 27L, 27L), task_id = c(88L,
82L, 91L, 147L, 139L, 137L)), class = "data.frame", row.names = c(NA,
-6L))

Related

Boxplot labelling outliers returns an error using data rownames

I am trying to label the outliers in my boxplot using the text function so I can find out from which class the outliers are coming from. I've stored the rownames of my data in variable "rownames" using names(vehData) to get the row names. When I apply this however, I get an error.
ERROR: Error in which(removeOutliers1 == bxpdat$out, arr.ind = TRUE) :
'list' object cannot be coerced to type 'double'
Completely new to R programming. Completely not sure how to fix this or what I am doing wrong
Thanks in advance for any help!
library(reshape2)
vehData <-
structure(
list(
Samples = 1:6,
Comp = c(95L, 91L, 104L, 93L, 85L,
107L),
Circ = c(48L, 41L, 50L, 41L, 44L, 57L),
D.Circ = c(83L,
84L, 106L, 82L, 70L, 106L),
Rad.Ra = c(178L, 141L, 209L, 159L,
205L, 172L),
Pr.Axis.Ra = c(72L, 57L, 66L, 63L, 103L, 50L),
Max.L.Ra = c(10L,
9L, 10L, 9L, 52L, 6L),
Scat.Ra = c(162L, 149L, 207L, 144L, 149L,
255L),
Elong = c(42L, 45L, 32L, 46L, 45L, 26L),
Pr.Axis.Rect = c(20L,
19L, 23L, 19L, 19L, 28L),
Max.L.Rect = c(159L, 143L, 158L, 143L,
144L, 169L),
Sc.Var.Maxis = c(176L, 170L, 223L, 160L, 241L, 280L),
Sc.Var.maxis = c(379L, 330L, 635L, 309L, 325L, 957L),
Ra.Gyr = c(184L,
158L, 220L, 127L, 188L, 264L),
Skew.Maxis = c(70L, 72L, 73L,
63L, 127L, 85L),
Skew.maxis = c(6L, 9L, 14L, 6L, 9L, 5L),
Kurt.maxis = c(16L,
14L, 9L, 10L, 11L, 9L),
Kurt.Maxis = c(187L, 189L, 188L, 199L,
180L, 181L),
Holl.Ra = c(197L, 199L, 196L, 207L, 183L, 183L),
Class = c("van", "van", "saab", "van", "bus", "bus")
),
row.names = c(NA,
6L), class = "data.frame")
#Remove outliers
removeOutliers <- function(data) {
OutVals <- boxplot(data)$out
remOutliers <- sapply(data, function(x) x[!x %in% OutVals])
return (remOutliers)
}
vehDataRemove1 <- vehData[, -1]
vehDataRemove2 <- vehDataRemove1[,-19]
vehData <- vehDataRemove2
vehClass <- vehData$Class
rownames <- names(vehData) #column names
#Begin removing outliers
removeOutliers1 <- removeOutliers(vehData)
bxpdat <- boxplot(removeOutliers1)
#Also tried using vehicles$Class instead of rownames but get the same error
text(bxpdat$group, bxpdat$out,
rownames[which(removeOutliers1 == bxpdat$out, arr.ind = TRUE)[,1]],
pos = 4)
The boxplot looks like this. I am trying to label the outliers based on the x axis e.g. "Comp", "Circ", "D.Circ", "Rad.Ra", "Max.L.Ra" etc.. & by vehicle class "Van", "Bus" ..
Crammed text issue when identifying class
If it is the outliers in the 2nd boxplot, it would be:
bxpdat <- boxplot(removeOutliers1)
text(bxpdat$group, bxpdat$out,
bxpdat$names[bxpdat$group],
pos = 4)
Maybe looks better like this, if you adjust the margin and flip the labels:
par(mar=c(8,3.5,3.5,3.5))
bxpdat = boxplot(removeOutliers1,las=2,cex=0.5)
text(bxpdat$group, bxpdat$out,
bxpdat$names[bxpdat$group],
pos = 4,cex=0.5)
I understood the question differently to #StupidWolf. I thought the goal was to replace points indicating outliers with the text of the vehicle class (bus, van or saab). If you simply print the variable name (e.g. Skew.maxis), then you might as well have simply plotted the outliers as points. Unless I'm missing something.
Here is code to answer the question as I understood it, for what it's worth (beginning after defining removeOutliers):
# CHANGE: Create vehClass vector before removing Class from the dataframe
vehClass <- vehData$Class
vehDataRemove1 <- vehData[, -1]
vehDataRemove2 <- vehDataRemove1[,-19]
vehData <- vehDataRemove2
#Begin removing outliers
removeOutliers1 <- removeOutliers(vehData)
bxpdat <- boxplot(removeOutliers1) # use boxplot(vehData) if you plot all the outliers as points
# loop over columns
n_plot <- 1; set.seed(123) # only plot n_plot randomly-chosen outliers
for(i in 1:ncol(vehData)){
# find out which row indices were removed as outliers
diffInd <- which(vehData[[i]] %in% setdiff(vehData[[i]], removeOutliers1[[i]]))
# if none were, then don't add any outlier text
if(length(diffInd) == 0) next
print(i)
print(paste0("l:", length(diffInd)))
if(length(diffInd) > n_plot){
diffIndPlot <- sample(diffInd, n_plot, replace = FALSE)
} else diffIndPlot <- diffInd
text(x = i, y = vehData[[i]][diffIndPlot],
labels = paste0(vehClass[diffIndPlot], ": ", vehData[[i]][diffIndPlot]))
}

Fuzzy Join Error: All columns in a tibble must be vectors

test <- structure(list(trip_count = 1:10, dropoff_longitude = c(-73.959862,
-73.882202, -73.934113, -73.992203, -74.00563, -73.975189, -73.97448,
-73.974838, -73.981377, -73.955093), dropoff_latitude = c(40.773617,
40.744175, 40.715923, 40.749203, 40.726158, 40.729824, 40.763599,
40.754135, 40.759987, 40.765224)), row.names = c(NA, -10L), class = c("data.table",
"data.frame"), .internal.selfref = <pointer: 0x7fd18800f6e0>)
> dput(zip_codes)
zip_codes <- structure(list(zipcode = c("10001", "10002", "10003", "10004",
"10005", "10006", "10007", "10009", "10010", "10011", "10012",
"10013", "10014", "10016", "10017", "10018", "10019", "10020",
"10021", "10022", "10023", "10024", "10025", "10026", "10027",
"10028", "10029", "10030", "10031", "10032", "10033", "10034",
"10035", "10036", "10037", "10038", "10039", "10040", "10044",
"10065", "10069", "10075", "10103", "10110", "10111", "10112",
"10115", "10119", "10128", "10152", "10153", "10154", "10162",
"10165", "10167", "10168", "10169", "10170", "10171", "10172",
"10173", "10174", "10177", "10199", "10271", "10278", "10279",
"10280", "10282"), bounds_north = c(40.759731, 40.724136, 40.739673,
40.709044, 40.709294, 40.71369, 40.71719, 40.734975, 40.745421,
40.756703, 40.731706, 40.727557, 40.742873, 40.752197, 40.757912,
40.762526, 40.773446, 40.761094, 40.775045, 40.764898, 40.783192,
40.818099, 40.811264, 40.807546, 40.822108, 40.782213, 40.800665,
40.824032, 40.834372, 40.850517, 40.861552, 40.87765, 40.809582,
40.765558, 40.819569, 40.714451, 40.846615, 40.866336, 40.772955,
40.770517, 40.781007, 40.777677, 40.761771, 40.755516, 40.759689,
40.759899, 40.811331, 40.751522, 40.787914, 40.759059, 40.764279,
40.758432, 40.770085, 40.752801, 40.755303, 40.752119, 40.754974,
40.753811, 40.756556, 40.755928, 40.754783, 40.752116, 40.7556,
40.752723, 40.708797, 40.71628, 40.713256, 40.714767, 40.719611
), bounds_south = c(40.743451, 40.708802, 40.722933, 40.683919,
40.702879, 40.705871, 40.709806, 40.718612, 40.73231, 40.731043,
40.719867, 40.713446, 40.72428, 40.73801, 40.747251, 40.749102,
40.758645, 40.757284, 40.758133, 40.751445, 40.768436, 40.778805,
40.788476, 40.79691, 40.803047, 40.770062, 40.782531, 40.812791,
40.817221, 40.829083, 40.842958, 40.849745, 40.781075, 40.752197,
40.806636, 40.701689, 40.817912, 40.851863, 40.749415, 40.759284,
40.771612, 40.769441, 40.759787, 40.753481, 40.758538, 40.758436,
40.810373, 40.749101, 40.773108, 40.757749, 40.762964, 40.757125,
40.768355, 40.75146, 40.753994, 40.750775, 40.753811, 40.751441,
40.755243, 40.754619, 40.753481, 40.750766, 40.754678, 40.750241,
40.707694, 40.714082, 40.711995, 40.700273, 40.713378), bounds_east = c(-73.984076,
-73.973635, -73.979864, -73.995657, -74.004569, -74.009988, -74.000455,
-73.971282, -73.971566, -73.990798, -73.991794, -73.994035, -73.999555,
-73.968192, -73.964271, -73.981822, -73.973015, -73.977201, -73.947973,
-73.958599, -73.974067, -73.960687, -73.954966, -73.944667, -73.940404,
-73.944337, -73.930891, -73.936232, -73.938588, -73.934671, -73.92216,
-73.910587, -73.914228, -73.978116, -73.933219, -73.991772, -73.929107,
-73.924385, -73.940026, -73.952085, -73.986609, -73.947039, -73.975831,
-73.980395, -73.976744, -73.97845, -73.963058, -73.99111, -73.937328,
-73.970993, -73.971411, -73.971451, -73.94827, -73.977677, -73.973735,
-73.976048, -73.975209, -73.974648, -73.97282, -73.973276, -73.978332,
-73.973959, -73.975352, -73.993948, -74.009829, -74.002115, -74.007666,
-74.013754, -74.012441), bounds_west = c(-74.008621, -73.997532,
-73.999604, -74.047285, -74.012508, -74.015905, -74.013754, -73.988643,
-73.994028, -74.012359, -74.004575, -74.016381, -74.01599, -73.987746,
-73.981822, -74.007989, -74.003477, -73.98373, -73.968441, -73.977655,
-73.990149, -73.98814, -73.977092, -73.962475, -73.9659, -73.96323,
-73.955778, -73.948677, -73.960007, -73.950403, -73.944672, -73.947051,
-73.946462, -74.001702, -73.943398, -74.010542, -73.943506, -73.938947,
-73.961583, -73.972553, -73.996142, -73.965148, -73.979513, -73.984118,
-73.97845, -73.980886, -73.964424, -73.994844, -73.959921, -73.973068,
-73.973465, -73.973524, -73.951858, -73.979768, -73.975807, -73.978159,
-73.976974, -73.977107, -73.974897, -73.975352, -73.980395, -73.976048,
-73.976516, -74.00143, -74.011248, -74.00542, -74.009668, -74.019603,
-74.01831), zip = c(10001, 10002, 10003, 10004, 10005, 10006,
10007, 10009, 10010, 10011, 10012, 10013, 10014, 10016, 10017,
10018, 10019, 10020, 10021, 10022, 10023, 10024, 10025, 10026,
10027, 10028, 10029, 10030, 10031, 10032, 10033, 10034, 10035,
10036, 10037, 10038, 10039, 10040, 10044, 10065, 10069, 10075,
10103, 10110, 10111, 10112, 10115, 10119, 10128, 10152, 10153,
10154, 10162, 10165, 10167, 10168, 10169, 10170, 10171, 10172,
10173, 10174, 10177, 10199, 10271, 10278, 10279, 10280, 10282
)), row.names = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 9L, 10L, 11L, 12L,
13L, 14L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L,
27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 38L, 39L,
40L, 43L, 50L, 51L, 53L, 67L, 74L, 75L, 76L, 79L, 83L, 91L, 101L,
102L, 103L, 111L, 114L, 116L, 117L, 118L, 119L, 120L, 121L, 122L,
123L, 126L, 133L, 151L, 158L, 159L, 160L, 162L), class = "data.frame")
Hey guys, so I am trying to fuzzy-join lat & lon information to get the zip code of a specific location. I tried:
test <- test %>% fuzzy_left_join(zip_codes,by = c("dropoff_longitude" = "bounds_east", "dropoff_longitude" = "bounds_west", "dropoff_latitude" = "bounds_north","dropoff_latitude" = "bounds_south"), match_fun = list('<=', '>=' , '<=', '>='))
But unfortunately, this returns the error message Error: All columns in a tibble must be vectors. x Column "col" is NULL.
I don't know how to solve this. There is no column "col" in either one of the data frames. The result should give me the correspondent zip code if the dropoff_longitute is between bounds_east and bounds_west and the dropoff_latitude is between bounds_north and bounds_south.
Thanks a lot in advance!
We could use the non-equi join from data.table as one of the dataset is data.table
library(data.table)
setDT(test)[zip_codes, on = .(dropoff_longitude <= bounds_east,
dropoff_longitude >= bounds_west,
dropoff_latitude <= bounds_north,
dropoff_latitude >= bounds_south)]

Plotting results with missing categories in interaction with emmeans

I have a quite "messy data". I have a model with a interaction between two factors. And I want to plot it. So:
f1 <- structure(list(tipo = c("digitables", "digitables", "digitables",
"digitables", "digitables", "digitables", "digitables", "digitables",
"payments", "payments", "payments", "payments", "payments", "payments",
"payments", "payments", "traditionals", "traditionals", "traditionals",
"traditionals", "traditionals", "traditionals", "traditionals",
"traditionals"), categoria = c("Advice", "Digital banks", "Exchange",
"FinTech", "Insurance", "Investments", "Lending", "Payments and transfers",
"Advice", "Digital banks", "Exchange", "FinTech", "Insurance",
"Investments", "Lending", "Payments and transfers", "Advice",
"Digital banks", "Exchange", "FinTech", "Insurance", "Investments",
"Lending", "Payments and transfers"), Total = c(63L, 450L, 279L,
63L, 36L, 108L, 567L, 549L, 63L, 450L, 279L, 63L, 36L, 108L,
567L, 549L, 35L, 250L, 155L, 35L, 20L, 60L, 315L, 305L), Frequencia = c(44L,
266L, 118L, 9L, 14L, 45L, 134L, 242L, 33L, 68L, 2L, 10L, 3L,
8L, 11L, 78L, 27L, 226L, 142L, 10L, 20L, 45L, 300L, 245L), Perc = c(69.84,
59.11, 42.29, 14.29, 38.89, 41.67, 23.63, 44.08, 52.38, 15.11,
0.72, 15.87, 8.33, 7.41, 1.94, 14.21, 77.14, 90.4, 91.61, 28.57,
100, 75, 95.24, 80.33), Failure = c(19L, 184L, 161L, 54L, 22L,
63L, 433L, 307L, 30L, 382L, 277L, 53L, 33L, 100L, 556L, 471L,
8L, 24L, 13L, 25L, 0L, 15L, 15L, 60L)), row.names = c(NA, -24L
), class = "data.frame")
# Packages
library(dplyr)
library(ggplot2)
library(emmeans) #version 1.4.8. or 1.5.1
# Works as expected
m1 <- glm(cbind(Frequencia, Failure) ~ tipo*categoria,
data = f1, family = binomial(link = "logit"))
l1 <- emmeans(m1, ~categoria|tipo)
plot(l1, type = "response",
comparison = T,
by = "categoria")
Using by="tipo" results:
# Doesn't work:
plot(l1, type = "response",
comparison = T,
by = "tipo")
Error: Aborted -- Some comparison arrows have negative length!
In addition: Warning message:
Comparison discrepancy in group digitables, Advice - Insurance:
Target overlap = -0.0241, overlap on graph = 0.0073
If I use comparison = F as suggested by explanation supplement vignette, it works. However, it does not show me the arrows, which are very important.
Q1 - Is there a work around for it? (Or is it impossible due to my data?)
As we can see from the last plot, there is a category with probability = 1 (categoria=Insurance and tipo=traditionals). So, I delete only this row of my data frame, and I try to redo the plotting, and results to me:
f1 <- f1 %>%
filter(!Perc ==100)
m1 <- glm(cbind(Frequencia, Failure) ~ tipo*categoria,
data = f1, family = binomial(link = "logit"))
l1 <- emmeans(m1, ~categoria|tipo)
plot(l1, type = "response",
comparison = T,
by = "categoria")
Error in if (dif[i] > 0) lmat[i, id1[i]] = rmat[i, id2[i]] = wgt * v1[i] else rmat[i, :
missing value where TRUE/FALSE needed
Q2 - How to plot my results even when I have a missing level of one variable (with respect to another variable?). I would expect that the Insurance facet would have only have the payments and digitables levels (while the others remain the same).
First, please don't ever re-use the same variable names for more than one thing; that makes things not reproducible. If you modify a dataset, or a model, or whatever, give it a new name so it can be distinguished.
Q1
As documented, comparison arrows cannot always be computed. This is such an example. I suggest displaying the results some other way, e.g. using pwpp() or pwpm()
Q2
There was a bug in handling missing cases. This has been fixed in the GitHub version:
f2 <- f1 %>%
filter(!Perc ==100)
m2 <- glm(cbind(Frequencia, Failure) ~ tipo*categoria,
data = f2, family = binomial(link = "logit"))
l2 <- emmeans(m2, ~categoria|tipo)
plot(l2, type = "response",
comparison = TRUE,
by = "categoria")
plot(l2, type = "response",
comparison = TRUE,
by = "tipo")
## Error: Aborted -- Some comparison arrows have negative length!
## (in group "payments")

Assign trip number based on condition

I have a time series data. I would like to group and number rows when column "soak" > 3600. The first row when soak > 3600 is numbered as 1, and the consecutive rows are numbered as 1 too until another row met the condition of soak > 3600. Then that row and consequent rows are numbered as 2 until the third occurrence of soak > 3600.
A small sample of my data and the code I tried is also provided.
My code did the count, but seems using the ave() gave me some decimal numbers... Is there a way to output integer?
starts <- structure(list(datetime = structure(c(1440578907, 1440579205,
1440579832, 1440579885, 1440579926, 1440579977, 1440580044, 1440580106,
1440580195, 1440580256, 1440580366, 1440580410, 1440580476, 1440580529,
1440580931, 1440580966, 1440587753, 1440587913, 1440587933, 1440587954
), class = c("POSIXct", "POSIXt"), tzone = ""), soak = c(NA,
70L, 578L, 21L, 2L, 41L, 14L, 16L, 32L, 9L, 45L, 20L, 51L, 25L,
364L, 4L, 6764L, 20L, 4L, 5L)), row.names = c(NA, -20L), class = c("data.table",
"data.frame"), .internal.selfref = <pointer: 0x000000000a4d1ef0>)
starts$trip <- with(starts, ave(tdiff, cumsum(replace(soka, NA, 10000) > 3600)))
Using dplyr
library(dplyr)
starts %>% mutate(trip = cumsum(replace(soak, is.na(soak), 1) > 3600))
And with base R
starts$trip = with(starts, ave(soak, FUN=function(x) cumsum(replace(x, is.na(x), 1) > 3600)))

Lookup values corresponding to the closest date

I have a data.frame x with date and Value
x = structure(list(date = structure(c(1376534700, 1411930800, 1461707400,
1478814300, 1467522000, 1451088000, 1449956100, 1414214400, 1472585400,
1418103000, 1466176500, 1434035100, 1442466300, 1410632100, 1448571900,
1439276400, 1468382700, 1476137400, 1413177300, 1438881300), class = c("POSIXct",
"POSIXt"), tzone = ""), Value = c(44L, 49L, 31L, 99L, 79L, 92L,
10L, 72L, 60L, 41L, 28L, 21L, 67L, 61L, 8L, 65L, 40L, 48L, 53L,
90L)), .Names = c("date", "Value"), row.names = c(NA, -20L), class = "data.frame")
and another list y with only date
y = structure(c(1470356820, 1440168960, 1379245020, 1441582800, 1381753740
), class = c("POSIXct", "POSIXt"), tzone = "")
Before I try to do it with a loop, I wanted to find out if there is a quick way (or packages) to lookup Value from the closest date in x for dates in y? The goal is to find out a date in x that is closest to the date in y and obtain the corresponding Value.
The desired output (got from Excel VLOOKUP, so may not be perfect) would be something like:
output = structure(list(y = structure(c(1470356820, 1440168960, 1379245020,
1441582800, 1381753740), class = c("POSIXct", "POSIXt"), tzone = ""),
Value = c(40, 65, 44, 65, 44)), .Names = c("y", "Value"), row.names = c(NA,
-5L), class = "data.frame")
sapply(y, function(z) x$Value[which.min(abs(x$date - z))])
# [1] 40 65 44 67 44
Using data.table you can join to the nearest value
library(data.table)
x <- as.data.table(x)
y <- data.table(date=y)
res <- x[y, on='date', roll='nearest']

Resources