Getting Cartesian product after mapping from another data frame - r

I have two datasets called mydata and pairs, respectively. I want to get all combinations of the id column from mydata with respect to pairs, matching on the Value column in mydata. One way to achieve this is:
lapply(1:nrow(pairs), function(ind) {
  expand.grid(mydata[mydata[, 2] == pairs[ind, "a"], 1],
              mydata[mydata[, 2] == pairs[ind, "b"], 1])
})
However, I feel there must be more efficient alternatives. Maybe some sort of mapping?
(I am not asking for efficiency like expand.grid vs data.table::CJ)
Thank you in advance.
Data:
mydata <- structure(list(id = 1:40, Value = c(230, 27, 161, 19, 40, 157,
237, 237, 237, 61, 40, 27, 237, 230, 237, 157, 27, 157, 230,
157, 19, 161, 61, 27, 61, 61, 237, 157, 27, 40, 27, 40, 40, 61,
157, 61, 157, 40, 237, 19)), class = "data.frame", row.names = c(NA,
-40L))
pairs <- structure(list(a = c(40, 61, 157), b = c(161, 237, 27)), class = "data.frame", row.names = c(NA,
-3L))

One possibility is to use data.table to get the vectors being passed to expand.grid all in one go:
library(data.table)
# Long-format lookup: one row per (Value, pair row, pair column)
dtPairs <- data.table(Value = unlist(pairs), expand.grid(ind1 = 1:nrow(pairs), ind2 = 1:ncol(pairs)))
# Collect the ids belonging to each relevant Value into a list column
dt <- as.data.table(mydata)[Value %in% dtPairs$Value, list(list(id)), by = Value]
# Attach the pair indices, order by them, then expand each pair of id vectors
dt[dtPairs, `:=`(ind1 = i.ind1, ind2 = i.ind2), on = "Value"]
setorder(dt, ind1, ind2)
ans2 <- lapply(dt[, list(list(V1)), by = ind1]$V1, expand.grid)
This involves a lot of overhead, so there won't be performance gains until the dataset gets much larger.
fwithDT <- function(data, pairs) {
  dtPairs <- data.table(Value = unlist(pairs),
                        expand.grid(ind1 = 1:nrow(pairs), ind2 = 1:ncol(pairs)))
  dt <- as.data.table(data)[Value %in% dtPairs$Value, list(list(id)), by = Value]
  dt[dtPairs, `:=`(ind1 = i.ind1, ind2 = i.ind2), on = "Value"]
  setorder(dt, ind1, ind2)
  lapply(dt[, list(list(V1)), by = ind1]$V1, expand.grid)
}
fOP <- function(data, pairs) {
  lapply(1:nrow(pairs), function(ind) {
    expand.grid(data[data[, 2] == pairs[ind, "a"], 1],
                data[data[, 2] == pairs[ind, "b"], 1])
  })
}
mydata2 <- mydata[rep(seq_len(nrow(mydata)), 1001), ]
mydata2[, "Value"] <- mydata2[, "Value"] + rep(seq(0, 1e6, 1e3), each = nrow(mydata))
pairs2 <- pairs[rep(seq_len(nrow(pairs)), 1001), ] + rep(seq(0, 1e6, 1e3), each = 3)
library(microbenchmark)
> microbenchmark(ans1 <- fOP(mydata, pairs), ans2 <- fwithDT(mydata, pairs))
Unit: microseconds
expr min lq mean median uq max neval
ans1 <- fOP(mydata, pairs) 437.3 479.55 599.969 521.75 574.4 5875.2 100
ans2 <- fwithDT(mydata, pairs) 4537.5 4824.90 5187.012 5039.95 5301.1 13231.2 100
> microbenchmark(ans1 <- fOP(mydata2, pairs2), ans2 <- fwithDT(mydata2, pairs2), times = 10)
Unit: milliseconds
expr min lq mean median uq max neval
ans1 <- fOP(mydata2, pairs2) 1640.9651 1727.1139 1755.6726 1742.130 1769.235 1958.2109 10
ans2 <- fwithDT(mydata2, pairs2) 306.3268 318.5617 334.1244 326.865 345.444 376.8728 10
> identical(ans1, ans2)
[1] TRUE
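For reference, the "mapping" the question asks about can also be sketched in base R: split the ids by Value once, then look each pair up (a sketch, not from the original thread; it assumes every value in pairs occurs in mydata, as it does here):
idx <- split(mydata$id, mydata$Value)  # map: Value -> vector of ids
ans3 <- Map(function(a, b) expand.grid(idx[[as.character(a)]],
                                       idx[[as.character(b)]]),
            pairs$a, pairs$b)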

Related

SVD function in R: getting the singular values $d from a list of datasets into table form

I want to use the svd function to get the singular values of large datasets stored in a list.
When I use the svd function on a single matrix, I can use $d to get the values, but for the list I cannot get the output.
Here is the code for a single matrix and its output.
tb = matrix(c(64, 112, 59, 174, 111, 37,
39, 135, 115, 92, 161, 70,
93, 119, 50, 142, 20, 114,
149, 191, 62, 17, 145, 21,
60, 37, 29, 74, 42, 242), nrow = 5, ncol = 6, byrow = TRUE)
## Compute SVD of tb
#
my_svd = svd(tb)
## Retrieve (save) singular values
#
sv = my_svd$d
## Compute ratio between "1st/2nd" & "2nd/3rd" singular values
#
ratios = matrix(c(sv[1]/sv[2], sv[2]/sv[3]), nrow = 1)
colnames(ratios) = c("sv1/sv2", "sv2/sv3")
## Print ratios
ratios
How do I apply this to the list of datasets?
My current code:
svdresult <- lapply(d1, svd)
svdresult
d1 is my list of datasets.
How do I get svdresult$d for each dataset in the list?
Thanks in advance
Maybe something like the following?
get_svd_ratios <- function(data) {
  sv = svd(data)$d
  n = length(sv)
  ratios = matrix(sv[1:(n - 1)] / sv[2:n], nrow = 1)
  names = paste(
    paste0("sv", 1:(n - 1)),
    paste0("sv", 2:n),
    sep = "/"
  )
  colnames(ratios) = names
  return(ratios)
}
lapply(list(tb), get_svd_ratios)
# [[1]]
# sv1/sv2 sv2/sv3 sv3/sv4 sv4/sv5
# [1,] 2.261771 1.680403 1.29854 2.682195
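To apply this to a whole list and also pull out the $d vectors, the pieces combine as below (a sketch; d1 here is a stand-in for the OP's list of datasets, and stacking the ratios assumes every matrix yields the same number of singular values):
d1 <- list(set1 = tb, set2 = tb + 1)          # stand-in for the OP's d1
svdresult <- lapply(d1, svd)
lapply(svdresult, `[[`, "d")                  # the singular values ($d) per dataset
do.call(rbind, lapply(d1, get_svd_ratios))    # one row of ratios per dataset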

Summing ranks for variable with fewest entries

I am learning R and want to manually compute the Mann-Whitney U statistic and p-value using a normal approximation (and not use wilcox.test or equivalent). My pensioner's brain struggles with coding so it has taken me hours to produce the same answers as the textbook. However, my code to sum the 'StateRank' for the state with the fewest values is convoluted. How can I replace the commented section with more efficient code? I've hunted high and low, both here and on Google, but I don't even know which search terms to use! It won't surprise me to hear that there is a one-line solution but I'm no nearer knowing what it is.
library(tidyverse)
# Activity 9: aboriginal village size in Alaska and California
a.df <- data.frame(
  Alaska = c(23, 26, 30, 33, 42, 45, 45, 50, 50.5, 96, 113, 557, NA),
  Calif = c(39, 48, 53.5, 55, 57, 66, 77, 79, 108, 121, 162, 197, 309)
) %>%
  pivot_longer(
    cols = c("Alaska", "Calif"),
    names_to = "State",
    values_to = "Value",
    values_drop_na = TRUE
  ) %>%
  mutate(StateRank = rank(Value, ties.method = "average"))
# clumsy code to sort, then sum ranks (StateRank) for group with fewest values (nA)
#--------------------------------------------------------------------------------
asc_or_desc <- as.matrix(count(a.df, State))
if (as.numeric(asc_or_desc[1, 2]) > as.numeric(asc_or_desc[2, 2])) {
  a.df <- arrange(a.df, desc(State))
} else {
  a.df <- arrange(a.df, State)
}
#--------------------------------------------------------------------------------
nA <- as.numeric(min(count(a.df, State, sort = TRUE)$n))
nB <- as.numeric(max(count(a.df, State, sort = TRUE)$n))
a.U <- sum(a.df$StateRank[1:nA])
a.E <- (nA*(nA+nB+1))/2 # Expectation of U
a.V <- (nA*nB*(nA+nB+1))/12 # Variance of U
a.Z <- (a.U - a.E)/sqrt(a.V)
a.P <- round((1 - round(pnorm(round(abs(a.Z), 2),
                              mean = 0, sd = 1), 4)) * 2, 3)
# all the rounding is to mimic statistical tables (so that
# the answer is the same as in the textbook that I use)
Please try this code and tell me if it does what you want.
I replaced your so-called clumsy code with this one:
... %>%
  group_by(State) %>%
  mutate(mx = max(Value)) %>%
  arrange(desc(mx), desc(Value)) %>%
  select(-mx)
The whole code:
library(tidyverse)
# Activity 9: aboriginal village size in Alaska and California
a.df <- data.frame(
  Alaska = c(23, 26, 30, 33, 42, 45, 45, 50, 50.5, 96, 113, 557, NA),
  Calif = c(39, 48, 53.5, 55, 57, 66, 77, 79, 108, 121, 162, 197, 309)
) %>%
  pivot_longer(
    cols = c("Alaska", "Calif"),
    names_to = "State",
    values_to = "Value",
    values_drop_na = TRUE
  ) %>%
  mutate(StateRank = rank(Value, ties.method = "average")) %>%
  group_by(State) %>%
  mutate(mx = max(Value)) %>%
  arrange(desc(mx), desc(Value)) %>%
  select(-mx)
-----------------------------------------------------------------------------
nA <- as.numeric(min(count(a.df, State, sort = TRUE)$n)) # group sizes, as in your code
nB <- as.numeric(max(count(a.df, State, sort = TRUE)$n))
a.U <- sum(a.df$StateRank[1:nA])
a.E <- (nA*(nA+nB+1))/2 # Expectation of U
a.V <- (nA*nB*(nA+nB+1))/12 # Variance of U
a.Z <- (a.U - a.E)/sqrt(a.V)
a.P <- round((1 - round(pnorm(round(abs(a.Z), 2),
                              mean = 0, sd = 1), 4)) * 2, 3)
# all the rounding is to mimic statistical tables (so that
# the answer is the same as in the textbook that I use)
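As an aside, the rank sum for the group with the fewest entries can also be taken directly, with no reordering at all (a sketch of the kind of one-liner the question suspected exists):
smaller <- names(which.min(table(a.df$State)))    # state with the fewest values
a.U <- sum(a.df$StateRank[a.df$State == smaller]) # its rank sum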

How do I draw a plotly boxplot with calculated values?

I have the following code for creating a boxplot in ggplot2:
throughput <- c(1, 2, 3, 4, 5)
response_time_min <- c(9, 19, 29, 39, 49)
response_time_10 <- c(50, 55, 60, 60, 61)
response_time_med <- c(100, 100, 100, 100, 100)
response_time_90 <- c(201, 201, 250, 200, 230)
response_time_max <- c(401, 414, 309, 402, 311)
df <- data.frame(throughput, response_time_min, response_time_10, response_time_med,response_time_90, response_time_max)
df
library(ggplot2)
g <- ggplot(df) +
  geom_boxplot(aes(x = factor(throughput),
                   ymax = response_time_max,
                   upper = response_time_90,
                   y = response_time_med,
                   middle = response_time_med,
                   lower = response_time_10,
                   ymin = response_time_min),
               stat = "identity")
g
But now when I want to apply ggplotly(g) the graph does not render correctly. What can I do to make this work?
I don't think the 90th and 10th percentiles can be represented as such in a plotly box plot. Treating them as q3 and q1, respectively, the code below
library(plotly)
bp <- plot_ly(color = c("orange")) %>%
  add_trace(lowerfence = response_time_min, q1 = response_time_10,
            median = response_time_med, q3 = response_time_90,
            upperfence = response_time_max, type = "box") %>%
  layout(xaxis = list(title = "throughput"),
         yaxis = list(title = "response_time"))
bp
gives a box plot built from the precomputed statistics (output image not reproduced here).
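If the boxes should also be positioned and labelled by throughput, the trace can be given x values as well (a sketch; it assumes a plotly version recent enough to accept precomputed box statistics such as q1/q3/lowerfence/upperfence):
bp <- plot_ly(df, color = I("orange")) %>%
  add_trace(x = ~factor(throughput),
            lowerfence = ~response_time_min, q1 = ~response_time_10,
            median = ~response_time_med, q3 = ~response_time_90,
            upperfence = ~response_time_max, type = "box") %>%
  layout(xaxis = list(title = "throughput"),
         yaxis = list(title = "response_time"))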

Subsetting and plotting data by TimeStamp

I have a data.frame P1 (5000 rows x 4 cols) and would like to save the subset of data in columns 2, 3 and 4 whenever the time stamp in column 1 falls into a set range determined by a vector TimeStamp (in seconds).
E.g. put all values in columns 2, 3, and 4 into a new data.frame and call each section of data Condition.1.P1, Condition.2.P1, etc.
The reason I'd like to label them separately is that I have 35 versions of P1 (P2, P3, P33, etc.) and need to be able to melt them together to plot them.
dput(TimeStamp)
c(18, 138, 438, 678, 798, 1278, 1578, 1878, 2178)
dput(head(P1))
structure(list(Time = c(0, 5, 100, 200, 500, 1200), SkinTemp = c(27.781,
27.78, 27.779, 27.779, 27.778, 27.777), HeartRate = c(70, 70,
70, 70, 70, 70), RespirationRate = c(10, 10, 10, 10, 10, 10)), .Names = c("Time",
"SkinTemp", "HeartRate", "RespirationRate"), row.names = c(NA,
6L), class = "data.frame")
Do you want to separate the data by the timestamp ranges and put it in a list? Then this might be what you are looking for:
TimeStamp <- c(18, 138, 438, 678, 798, 1278, 1578, 1878, 2178)
dat <- structure(list(Time = c(0, 5, 100, 200, 500, 1200), SkinTemp = c(27.781,
27.78, 27.779, 27.779, 27.778, 27.777), HeartRate = c(70, 70,
70, 70, 70, 70), RespirationRate = c(10, 10, 10, 10, 10, 10)), .Names = c("Time",
"SkinTemp", "HeartRate", "RespirationRate"), row.names = c(NA,
6L), class = "data.frame")
dat$Segment <- cut(dat$Time,c(-Inf,TimeStamp))
split(dat,dat$Segment)
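To get the Condition.1.P1-style names the question asks for, the list returned by split() can simply be renamed (a sketch):
segs <- split(dat[, c("SkinTemp", "HeartRate", "RespirationRate")], dat$Segment)
names(segs) <- paste0("Condition.", seq_along(segs), ".P1")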
An alternative is a loop that builds one labelled data.frame (ts below is the vector of time stamps, i.e. the TimeStamp from above):
ts <- TimeStamp
P2 = data.frame(NA, NA, NA, NA) # Create empty data.frame
for (i in 1:length(ts)) {
  P3 = data.frame() # Create empty changing data.frame
  if (i == 1) {ts1 = 0} else {ts1 = ts[i-1]} # First time stamp starts at 0
  ts2 = ts[i]
  P3 = subset(P1, P1$Time > ts1 & P1$Time < ts2)[, c(2, 3, 4)] # Subset the columns and assign to P3
  if (nrow(P3) == 0) {P3 = data.frame(NA, NA, NA)} # If the subset is empty, assign NA
  P3$TimeStamp = paste(ts1, ts2, sep = "-") # Append the TimeStamp range to P3
  colnames(P3) = colnames(P2) # Make sure column names match to allow rbind
  P2 = rbind(P2, P3) # Append P3 to P2
}
P2 = P2[c(2:nrow(P2)), ] # Remove the first row (that has NA)
colnames(P2) = c("SkinTemp", "HeartRate", "RespirationRate", "TimeStamp") # Provide column names
rm(P3); rm(i); rm(ts1); rm(ts2) # Cleanup
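For the 35 versions mentioned in the question, the split-based approach generalizes by keeping the data frames in a named list and tagging each row with its source before combining (a sketch; the two copies of dat stand in for the OP's P1...P33):
ps <- list(P1 = dat, P2 = dat)  # in practice: list(P1 = P1, P2 = P2, ..., P33 = P33)
combined <- do.call(rbind, lapply(names(ps), function(nm) {
  p <- ps[[nm]]
  p$Segment <- cut(p$Time, c(-Inf, TimeStamp))
  p$ID <- nm  # remembers which data frame each row came from
  p
}))
combined is then in a long format that can be melted and plotted by ID and Segment.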

Compute and save the r-squared value of bootstrap objects in a new dataframe in R

I have a dataframe df
dput(df)
structure(list(x = c(49, 50, 51, 52, 53, 54, 55, 56, 1, 2, 3,
4, 5, 14, 15, 16, 17, 2, 3, 4, 5, 6, 10, 11, 3, 30, 64, 66, 67,
68, 69, 34, 35, 37, 39, 2, 17, 18, 99, 100, 102, 103, 67, 70,
72), y = c(2268.14043972082, 2147.62290922552, 2269.1387550775,
2247.31983098201, 1903.39138268307, 2174.78291538358, 2359.51909126411,
2488.39004804939, 212.851575751527, 461.398994384333, 567.150629704352,
781.775113821961, 918.303706148872, 1107.37695799186, 1160.80594193377,
1412.61328924168, 1689.48879626486, 260.737164468854, 306.72700499362,
283.410379620422, 366.813913489692, 387.570173754128, 388.602676983443,
477.858510450125, 128.198042456082, 535.519377609133, 1028.8780498564,
1098.54431357711, 1265.26965941035, 1129.58344809909, 820.922447928053,
749.343583476846, 779.678206156474, 646.575242339517, 733.953282899613,
461.156280127354, 906.813018662913, 798.186995701282, 831.365377249207,
764.519073183124, 672.076289062505, 669.879217186302, 1341.47673353751,
1401.44881976186, 1640.27575962036)), .Names = c("x", "y"), row.names = c(NA,
-45L), class = "data.frame")
I have created two non-linear regressions (nls1 and nls2) based on my dataset.
library(stats)
nls1 <- nls(y ~ A*(x^B)*(exp(k*x)),
            data = df,
            start = list(A = 1000, B = 0.170, k = -0.00295))
nls2 <- nls(y ~ A*x^3 + B*x^2 + C*x + D, data = df,
            start = list(A = 0.02, B = -0.6, C = 50, D = 200))
I then computed bootstrap objects for these two models to get multiple sets of parameters (A, B and k for nls1; A, B, C and D for nls2).
library(nlstools)
Boo1 <- nlsBoot(nls1, niter = 200)
Boo2 <- nlsBoot(nls2, niter = 200)
Based on these bootstrap objects, I would like to compute the r-squared of each combination of parameters and save the min, max and median of the r-squared values for each bootstrap object into one new dataframe. The dataframe could look like new.df:
structure(list(Median = c(NA, NA), Max = c(NA, NA), Min = c(NA,
NA)), .Names = c("Median", "Max", "Min"), row.names = c("nls1",
"nls2"), class = "data.frame")
The idea is then to do some box plots with the median, min and max values for each non-linear model based on bootstrapping to compare them. Can someone help me out with that? Thanks in advance.
Answer from #bunk:
library(boot)
stat <- function(dat, inds) {
  fit <- try(nls(y ~ A*(x^B)*(exp(k*x)), data = dat[inds, ],
                 start = list(A = 1000, B = 0.170, k = -0.00295)), silent = TRUE)
  f1 <- if (inherits(fit, "nls")) AIC(fit) else NA
  fit2 <- try(nls(y ~ A*x^3 + B*x^2 + C*x + D, data = dat[inds, ],
                  start = list(A = 0.02, B = -0.6, C = 50, D = 200)), silent = TRUE)
  f2 <- if (inherits(fit2, "nls")) AIC(fit2) else NA
  c(f1, f2)
}
res <- boot(df, stat, R = 200)
Then, to get the medians, for example: apply(res$t, 2, median, na.rm = TRUE)
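Since the question asked for r-squared rather than AIC, a pseudo R-squared can also be computed from each bootstrapped parameter set in Boo1$coefboot (a sketch; note that R-squared has no clean interpretation for non-linear models):
# Pseudo R^2 for every bootstrap parameter set of nls1
r2_nls1 <- apply(Boo1$coefboot, 1, function(p) {
  pred <- p["A"] * (df$x^p["B"]) * exp(p["k"] * df$x)   # nls1 prediction
  1 - sum((df$y - pred)^2) / sum((df$y - mean(df$y))^2) # 1 - SSE/SST
})
c(Median = median(r2_nls1), Max = max(r2_nls1), Min = min(r2_nls1))
The nls2 case is analogous, with predictions from A*x^3 + B*x^2 + C*x + D.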
