Get the row in a CSV where xyz coordinates stop changing - r

I have a measurement of an object with xyz coordinates and a time column in seconds. My CSV looks like this:
TimeInSec X.6 Y.6 Z.6
0.000000 -1.383422 -0.058891 0.023290
0.004167 -1.387636 -0.058947 0.023584
0.008333 -1.391491 -0.058972 0.023989
...
I want to find the row in my dataset where the xyz coordinates stop changing (within a threshold). The key quantity I want is the elapsed time from row 0 to the point where the object stops.
My Code:
library(tidyverse) # for %>%, gather() and ggplot2
dummy.data <- read.csv(file = "D:\\tmp\\dummy.csv", header = TRUE, skip = 6)
dummy.data %>%
  gather(key, value, X.6, X.7, X.8, Y.6, Y.7, Y.8, Z.6, Z.7, Z.8) %>%
  ggplot(aes(x = Time..Seconds., y = value, colour = key)) +
  geom_line()
Many thanks for your help!
[Sample graph: X/Y/Z traces over time]
Here is the link to the raw data CSV: RawData

Here's an example with dummy data that has different offsets and eventually settles to a constant value. The point is that successive points get closer and closer together, so the Euclidean distance (think of this as the actual distance) between successive points gets smaller. Once it drops below the threshold, we declare the points to have settled.
library(tidyverse) # ggplot2 is loaded as part of the tidyverse

numberofpoints <- 100
threshold <- 0.01
set.seed(1)
# Make some dummy data with offsets; every column settles towards a constant
dummy.data <-
  data.frame(
    X.6 = runif(numberofpoints), X.7 = runif(numberofpoints), X.8 = runif(numberofpoints),
    Y.6 = runif(numberofpoints), Y.7 = runif(numberofpoints), Y.8 = runif(numberofpoints),
    Z.6 = runif(numberofpoints), Z.7 = runif(numberofpoints), Z.8 = runif(numberofpoints)) %>%
  mutate(
    X.6 = 3 + X.6/row_number(), X.7 = 1 + X.7/row_number(), X.8 = 2 + X.8/row_number(),
    Y.6 = 4 + Y.6/row_number(), Y.7 = 6 + Y.7/row_number(), Y.8 = 9 + Y.8/row_number(),
    Z.6 = 5 + Z.6/row_number(), Z.7 = 7 + Z.7/row_number(), Z.8 = 10 + Z.8/row_number()
  )
# Find distances between all pairs of readings (will be slow for large data)
distances <- dist(dummy.data)
distances.matrix <- as.matrix(distances)
# Distances between adjacent readings; NA for the first row, which has no predecessor
distancechange <- c(NA, sapply(1:(numberofpoints - 1), function(r) distances.matrix[r, r + 1]))
# The first point below the threshold
changebelowthreshold <- min(which(distancechange < threshold))
# Plot the traces with a vertical line at the settling point
dummy.data$Time <- 1:nrow(dummy.data)
thresholdtime <- dummy.data$Time[changebelowthreshold]
plotdata <- dummy.data %>%
  pivot_longer(cols = c(X.6, X.7, X.8, Y.6, Y.7, Y.8, Z.6, Z.7, Z.8))
gg <- ggplot(plotdata, aes(x = Time, y = value, colour = name)) +
  geom_line() +
  geom_vline(xintercept = thresholdtime)
gg
This makes the plot below. The vertical line marks the first point at which the distance between successive readings drops below the threshold.
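If the full pairwise distance matrix from dist() is too expensive for your data, a minimal sketch (assuming the same dummy.data column layout as above) computes only the successive distances with diff():
# Successive Euclidean distances only, avoiding the full n-by-n matrix
coords <- as.matrix(dummy.data[, c("X.6","X.7","X.8","Y.6","Y.7","Y.8","Z.6","Z.7","Z.8")])
distancechange <- c(NA, sqrt(rowSums(diff(coords)^2))) # diff() works row-wise on a matrix
changebelowthreshold <- min(which(distancechange < threshold))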

I think from your description you just want to identify the first row where the differences between the x, y and z values in consecutive rows all drop below a certain threshold:
threshold <- 0.001
# diff() returns n - 1 values; index i compares row i with row i + 1,
# and [1] takes the first row where all three coordinates change by less than the threshold
stop_row <- which(abs(diff(dummy.data$X.6)) < threshold &
                  abs(diff(dummy.data$Y.6)) < threshold &
                  abs(diff(dummy.data$Z.6)) < threshold)[1]
So now you can do:
result <- dummy.data$TimeInSec[stop_row] - dummy.data$TimeInSec[1]
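One caveat, as a sketch on top of the same idea: noise can dip below the threshold for a single step, so you may instead want the first row after which every later step also stays below it:
below <- abs(diff(dummy.data$X.6)) < threshold &
         abs(diff(dummy.data$Y.6)) < threshold &
         abs(diff(dummy.data$Z.6)) < threshold
# cumprod() over the reversed logicals is 1 exactly where all later steps stay below the threshold
stop_row <- which(rev(cumprod(rev(below))) == 1)[1] + 1 # + 1 converts a diff index to a row index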


Unable to make code run - Error in `lag()`: `n` must be a positive integer

I need help solving this error; I am not sure how. When I run "dp_stat" at the end, an error appears in my OLS model. Essentially, I try to find the cumulative difference between a benchmark and a given individual predictive model. Here dp is my independent variable and sg is my dependent variable; datanu is my Excel data. I'm not sure how to attach the data here, but here is a link to the Excel file and the code: https://drive.google.com/drive/folders/12BOuNBODURIP7CQIBZWMmHFc1d7zXHxN?usp=sharing If anyone has a fix it would mean the world!
"#Error in lag():! n must be a positive integer, not a double vector of length 1."
rm(list= ls()) # Clear global environment
invisible(gc()) # Free up unused R-occupied memory
cat("\014") # Clear console output: equivalent to ctrl + L
library("tseries")
library("readxl")
library("Metrics")
library("lubridate")
library("ggplot2")
library("data.table")
library("dyn")
library("reshape2")
# header = TRUE because the first row contains the variable names
datanu <- read_xlsx("~/Documents/6.semester/Bachelor/Data/datanu.xlsx",
                    na = "NaN",
                    sheet = "datax")
myts <- ts(datanu, start=c(1872, 1), end=c(2020, 12), frequency=12)
plot(myts[, c("dp", "dy", "ep", "de")])
get_statistics <- function(myts, dp, sg, h=1, start=1872, end=2020, est_periods_OOS = 20) {
  #### IS ANALYSIS
  # 1. Historical mean model for a portfolio
  avg <- mean(window(myts, start, end)[, sg], na.rm=TRUE)
  IS_error_N <- (window(myts, start, end)[, sg] - avg)
  # 2. OLS model
  #reg <- dyn$lm(sg ~ lag(as.numeric(dp), 1), data=window(myts, start, end))
  reg <- dyn$lm(eval(parse(text=sg)) ~ lag(eval(parse(text=dp)), -1),
                data=window(myts, start, end)) # Error in `lag()`: ! `n` must be a positive integer, not a double vector of length 1.
  IS_error_A <- reg$residuals
  #### OOS ANALYSIS
  OOS_error_N <- numeric(end - start - est_periods_OOS)
  OOS_error_A <- numeric(end - start - est_periods_OOS)
  # Only use information available up to the time the forecast is made
  j <- 0
  for (i in (start + est_periods_OOS):(end-1)) {
    j <- j + 1
    # Get the actual ERP that you want to predict
    actual_ERP <- as.numeric(window(myts, i+1, i+1)[, sg])
    # 1. Historical mean model
    OOS_error_N[j] <- actual_ERP - mean(window(myts, start, i)[, sg], na.rm=TRUE)
    # 2. OLS model
    reg_OOS <- dyn$lm(eval(parse(text=sg)) ~ lag(eval(parse(text=dp)), -1),
                      data=window(myts, start, i))
    # Compute the error
    df <- data.frame(x=as.numeric(window(myts, i, i)[, dp]))
    names(df) <- dp
    pred_ERP <- predict.lm(reg_OOS, newdata=df)
    OOS_error_A[j] <- pred_ERP - actual_ERP
  }
  #### Compute statistics
  MSE_N <- mean(OOS_error_N^2)
  MSE_A <- mean(OOS_error_A^2)
  T <- length(!is.na(myts[, sg]))
  OOS_R2 <- 1 - MSE_A/MSE_N
  # Is the -1 enough (maybe -2 needed because of lag)?
  OOS_oR2 <- OOS_R2 - (1-OOS_R2)*(reg$df.residual)/(T - 1)
  dRMSE <- sqrt(MSE_N) - sqrt(MSE_A)
  #### CREATE PLOT
  IS <- cumsum(IS_error_N[2:length(IS_error_N)]^2) - cumsum(IS_error_A^2)
  OOS <- cumsum(OOS_error_N^2) - cumsum(OOS_error_A^2)
  df <- data.frame(x=seq.int(from=start + 1 + est_periods_OOS, to=end),
                   IS=IS[(1 + est_periods_OOS):length(IS)],
                   OOS=OOS) # Because you lose one observation due to the lag
  # Shift IS errors vertically so that the IS line begins at zero
  # on the date of the first OOS prediction (see Goyal/Welch (2008, p. 1465))
  df$IS <- df$IS - df$IS[1]
  df <- melt(df, id.var="x")
  plotGG <- ggplot(df) +
    geom_line(aes(x=x, y=value, color=variable)) +
    geom_rect(data=data.frame(), # Needed by ggplot2, otherwise not transparent
              aes(xmin=2008, xmax=2010, ymin=-0.2, ymax=0.2),
              fill='red',
              alpha=0.1) +
    scale_y_continuous('Cumulative SSE Difference', limits=c(-0.2, 0.2)) +
    scale_x_continuous('Year')
  return(list(IS_error_N = IS_error_N,
              IS_error_A = reg$residuals,
              OOS_error_N = OOS_error_N,
              OOS_error_A = OOS_error_A,
              IS_R2 = summary(reg)$r.squared,
              IS_aR2 = summary(reg)$adj.r.squared,
              OOS_R2 = OOS_R2,
              OOS_oR2 = OOS_oR2,
              dRMSE = dRMSE,
              plotGG = plotGG))
}
dp_stat <- get_statistics(myts, "dp", "sg", start=1872)
dp_stat$plotGG
As the error message states, `n` must be a positive integer, not a double vector of length 1. The error comes from providing n = -1 (i.e., a negative number) as an argument. I assume the idea is to lag by a negative number of positions; however, the lag() being called here only accepts a positive number of lag positions. Instead of lag(), you should use lead() with n = 1 to achieve the desired result.
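Concretely, a minimal sketch of that change (keeping the rest of the call exactly as in the question):
# lead(x, 1) replaces lag(x, -1), as suggested above
reg <- dyn$lm(eval(parse(text=sg)) ~ lead(eval(parse(text=dp)), 1),
              data=window(myts, start, end))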

Filter datapoints in a scatterplot using a linear equation

I am writing a function to filter datapoints out of my plot based on a linear equation. I currently have the following function (with another function nested inside it):
MD_filter <- function(dataframe, mz_col){
  # In-function MD calculation
  MZ <- mz_col
  MZR <- trunc(mz_col, digits = 0) # Either floor() or trunc() can be used for this part.
  MD <- as.numeric(MZ - MZR)
  dataframe <- dataframe %>%
    dplyr::mutate(MD) %>%
    dplyr::select(MD, everything())
  # Fit data to m/z defect maxima equation
  f <- function(x){ # This could be problem 1, maybe resolved by leaving x....
    y <- 0.00112*x + 0.01953
    return(y)}
  fit <- as.data.frame(t(apply(dataframe, 1, f))) # t() transforms df to matrix...?
  filtered <- dataframe[which((dataframe$MD <= fit$MZ)),]
  # Keep rows in dataframe if MD is less than or equal to the fitted value (mz after equation)
  # As "fit" calculated the maximum MD value for each MZ value in the MZ column, we subset fit$MZ, as this contains the dataframe MZ values.
  # The MD calculated at the very start needs to be lower than the equivalent MZ value of the fitted dataframe.
  filtered <- write.table(filtered, "feature_list_mz_defect_filtered.txt", sep="\t", col.names=NA)
  # Now we have the pre-filter dataframe (dataframe) and the post-filter df (filtered)
  # 2 different plots: (highlight to be removed as well, so we need a 3rd eventually)
  MD_plot <- ggplot(dataframe, aes(x = MZ, y = MD)) +
    geom_point() + # THE FOLLOWING PART DOES NOT WORK YET
    ggtitle(paste("Unfiltered MD data - ", dataframe))
    # stat_smooth(method="lm", se=FALSE) -> for a linear line through the plot, but may not be necessary to show
  return(MD_plot) # While I do get a plot, I have not yet gotten the equation. I could use my earlier approach maybe.
  MD_plot_2 <- ggplot(filtered, aes(x = MZ, y = MD)) + # filtered is basically the second dataframe,
    # which subsets datapoints with a Y value (which is the MD) below the linear equation MD...
    geom_point() + # THE FOLLOWING PART DOES NOT WORK YET
    ggtitle(paste("Filtered MD data - ", dataframe))
    # stat_smooth(method="lm", se=FALSE) -> for a linear line through the plot, but may not be necessary to show
  return(MD_plot_2)
}
The function works as follows:
The argument inputs are a dataframe and a specific column inside that dataframe which I call the mz_col.
From this column a second column, the MD column, is generated.
From here on out I want to make two plots:
ggplot 1: A plot with the mz_col (MZ) values on the X axis and the MD values on the Y axis
ggplot 2: EXACTLY the same as ggplot 1, but with datapoints filtered out wherever MD exceeds the linear equation y = 0.00112*x + 0.01953 (as is visible in the code). This line is basically my maximum filter limit in the plot; everything above it I want gone.
I've tried many different solutions: I swapped the "x" argument with mz_col, and I tried plot() instead of ggplot(), among other things. Currently I'm getting no plot at all.
Basically my question is: how do I fix my function so that I get my two plots? The first plot is no real issue and already works, but the second plot just won't filter out datapoints based on my linear equation.
I'm quite new to SO and R, so I apologize if anything is unclear. Please let me know if any clarification is needed, and thanks in advance for all the help!
Reproducible sample data:
structure(list(mz = c(
  446.0394, 346.043, 199.0446, 199.0464, 97.057, 657.0595, 879.0606,
  978.0631, 199.0684, 199.0707, 199.0724, 86.0748, 199.0761, 196.0789,
  199.0819, 199.0852, 199.0878, 199.089, 908.0935, 147.0958, 199.0999,
  199.1299, 199.1322, 199.1384, 199.1398, 199.1434, 124.1475, 199.1513,
  187.156, 199.1686, 199.1766, 199.1797, 199.1879, 199.1924, 187.1959,
  479.1981, 169.1999, 109.2049, 399.2092, 299.2125, 159.2146, 199.2242,
  356.2405, 69.2423, 956.4337, 978.5537, 199.5695, 676.5769, 199.5851,
  500.6021, 260.6039, 270.6081, 200.6114, 200.6131, 200.6172, 200.6221,
  200.6315, 200.6402, 200.6476, 200.766, 200.8591, 200.8732, 200.8768,
  200.89, 200.8937, 200.8972, 200.9067, 200.9127, 200.9147, 200.9231,
  200.9253, 200.9288, 200.9324, 200.935, 200.9468, 200.9515, 200.9536,
  200.9557, 200.9568, 200.9594, 200.9661, 200.968, 200.9729, 200.9745,
  200.9819, 200.9837, 200.9858, 200.9937)),
  row.names = c(NA, -88L), class = c("tbl_df", "tbl", "data.frame"))
I got a bit lost trying to follow your code, but based on your description, does the following work for you?
library(dplyr)
library(ggplot2)
MD_filter <- function(dataframe, mz_col, a = 0.01953, b = 0.00112){
  # Rename the column so the rest of the function doesn't depend on the inputted column name
  dataframe[["MZ"]] <- dataframe[[mz_col]]
  # Compute the mass defect (MD) and the filter limit for each MZ
  dataframe <- dataframe %>%
    select(MZ) %>%
    mutate(MD = MZ - trunc(MZ),
           MD.limit = a + b*MZ)
  p1 <- ggplot(dataframe, aes(x = MZ, y = MD)) +
    geom_point() +
    geom_smooth(method = "lm", se = FALSE) +
    ggtitle("Unfiltered MD data")
  # %+% swaps in the filtered data while keeping all of p1's layers
  p2 <- p1 %+% filter(dataframe, MD <= MD.limit) +
    expand_limits(y = range(dataframe[["MD"]])) + # optional (if you want the same
                                                  # y-axis range for both plots)
    ggtitle("Filtered MD data")
  cowplot::plot_grid(p1, p2, nrow = 1) # needs the cowplot package
}
Data & usage
dd <- structure(list(mz = c(
446.0394, 346.043, 199.0446, 199.0464, 97.057, 657.0595, 879.0606,
978.0631, 199.0684, 199.0707, 199.0724, 86.0748, 199.0761, 196.0789,
199.0819, 199.0852, 199.0878, 199.089, 908.0935, 147.0958, 199.0999,
199.1299, 199.1322, 199.1384, 199.1398, 199.1434, 124.1475, 199.1513,
187.156, 199.1686, 199.1766, 199.1797, 199.1879, 199.1924, 187.1959,
479.1981, 169.1999, 109.2049, 399.2092, 299.2125, 159.2146, 199.2242,
356.2405, 69.2423, 956.4337, 978.5537, 199.5695, 676.5769, 199.5851,
500.6021, 260.6039, 270.6081, 200.6114, 200.6131, 200.6172, 200.6221,
200.6315, 200.6402, 200.6476, 200.766, 200.8591, 200.8732, 200.8768,
200.89, 200.8937, 200.8972, 200.9067, 200.9127, 200.9147, 200.9231,
200.9253, 200.9288, 200.9324, 200.935, 200.9468, 200.9515, 200.9536,
200.9557, 200.9568, 200.9594, 200.9661, 200.968, 200.9729, 200.9745,
200.9819, 200.9837, 200.9858, 200.9937)),
row.names = c(NA, -88L),
class = c("tbl_df", "tbl", "data.frame"))
MD_filter(dd, "mz")
# MD_filter(dd, "mz", a = 0.02, b = 0.001) # if you want to change the limit

paired data for a facet_wrap

Imagine I have the data foo below. Each row contains a measurement (y) on a species, and each species is paired with another (species.pair). So in the example below, species a is paired with e, b with f, and so on. The number of observations per species varies. I'd like to plot the density of each species's distribution along with its partner's distribution in its own facet. Below I hand-coded this with the column sppPairs. The species are all unique and each has a match in species.pair. I'm unsure how to build the grouping column sppPairs programmatically. I'm sure there is some clever way to do this with {dplyr} but I can't figure out what to do. Some kind of pasting species to species.pair, I imagine? Any help much appreciated.
library(ggplot2)
foo <- data.frame(species = rep(letters[1:8], each = 10),
                  species.pair = rep(letters[c(5:8, 1:4)], each = 10),
                  y = rnorm(80))
# species and species.pair match exactly
all(unique(foo$species) %in% unique(foo$species.pair))
# what I want
foo$sppPairs <- c(rep("a:e", 10),
                  rep("b:f", 10),
                  rep("c:g", 10),
                  rep("d:h", 10),
                  rep("a:e", 10),
                  rep("b:f", 10),
                  rep("c:g", 10),
                  rep("d:h", 10))
p1 <- ggplot(foo, aes(y, fill = species))
p1 <- p1 + geom_density(alpha = 0.5)
p1 <- p1 + facet_wrap(~sppPairs)
p1
Yes, you can use apply on the appropriate columns to paste the sorted elements together in the correct order (otherwise a:e is different from e:a and so on, and you end up with 8 groups instead of 4):
library(ggplot2)
foo <- data.frame(species = rep(letters[1:8], each = 10),
                  species.pair = rep(letters[c(5:8, 1:4)], each = 10),
                  y = rnorm(80))
foo$sppPairs <- apply(foo[c("species", "species.pair")], 1,
                      function(x) paste(sort(x), collapse = ":"))
ggplot(foo, aes(y, fill = species)) +
  geom_density(alpha = 0.5) +
  facet_wrap(~sppPairs)
Created on 2020-10-05 by the reprex package (v0.3.0)
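Since you mentioned {dplyr}: a vectorised sketch of the same idea, using pmin()/pmax() (which compare strings alphabetically, so each pair is sorted without a row-wise apply()):
library(dplyr)
foo <- foo %>%
  mutate(sppPairs = paste(pmin(species, species.pair),
                          pmax(species, species.pair),
                          sep = ":"))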

R parameterization of constants in time series

I have a fairly simple equation, in which I have direct measurements of the variables through time, and two different unknown parameters I need to solve for, but which I know can be considered constants over the time periods I'm studying.
Both of these "constants" have fairly narrow ranges of variability in nature. In principle, it seems like some kind of optimization procedure/function should be able to do this easily, by finding the pair of values that minimizes the standard deviation of each of the constant values across the time series.
However, I am new to optimization and parameter fitting. Any help figuring out how to use r code to find the pair (or pairs) of values in this situation would be greatly appreciated.
Below is a simplified form of the equation I'm dealing with:
A * x + B * z - B * d = c + e
A and B are the constants I need to solve for.
Possible real-world values of A are 0.4-0.8
Possible real-world values of B are 0.85-0.99
To create a reasonable mock data set, assuming perfect measurements of all variables, and known values of A and B:
### Generate mock data
### Variables all have a daily cycle and are strongly autocorrelated,
# and so can be approximated via a sin function,
# with unique noise added to each to simulate variability:
library(tidyverse) # for mutate(), gather() and ggplot() below
# Variability for each variable
n <- 1000 # number of data points
t <- seq(0, 4*pi, length.out = n)
a <- 3
b <- 2
x.unif <- runif(n)
z.norm <- rnorm(n)
c.unif <- runif(n)
d.norm <- rnorm(n)
d.unif <- runif(n)
e.norm <- rnorm(n)
amp <- 1
# Create reasonable values of mock variable data for all variables except e,
# which I will calculate from known fixed values for A and B.
x <- a*sin(b*t)+x.unif*amp + 10 # uniform error
z <- a*sin(b*t)+z.norm*amp + 10 # Gaussian/normal error
c <- ((a*sin(b*t)+c.unif*amp) + 10)/4
d <- ((a*sin(b*t)+d.norm*amp)+(a*sin(b*t)+d.unif*amp)+10)/2
# Put vectors in dataframe
dat <- data.frame("t" = t, "x" = x, "z" = z, "c" = c, "d" = d)
# Equation: A*x + B*z - B*d = c + e
# Solve for e:
# e = A*x + B*z - B*d - c
# Specify "true" values for A and B:
A = 0.6
B = 0.9
# Solve for e:
dat <- dat %>%
mutate(e = A*x + B*z - B*d - c)
# Gather data for easy visualizing of results for e:
dat_gathered <- dat %>%
gather(-t, value = "value", key = "key")
# Plot all variables
ggplot(dat_gathered, aes(x = t, y = value, color = key)) + geom_line()
# Add small error (to simulate measurement error) to all variables except A and B:
dat <- dat %>%
mutate(x_j = x + rnorm(x, sd=0.02)/(1/x)) %>%
mutate(z_j = z + rnorm(z, sd=0.02)/(1/z)) %>%
mutate(c_j = c + rnorm(c, sd=0.02)/(1/c)) %>%
mutate(d_j = d + rnorm(d, sd=0.02)/(1/d)) %>%
mutate(e_j = e + rnorm(e, sd=0.02)/(1/e))
The variables in dat with the _j suffix represent real-world data (since they have measurement error added). Knowing the constraints that:
A is within 0.4-0.8
B is within 0.85-0.99
is it possible to use the noisy "_j" data to optimize for the pair of constant values that minimizes the deviation of A and B across the entire time series?
A little bit of algebra, rearranging the equation as e + c = A*x + B*(z - d), sets this up as a linear regression problem with no intercept, which seems to work fine:
m1 <- lm(e_j+c_j ~ 0 + x_j + I(z_j-d_j), data=dat)
coef(m1) ## A = 0.6032, B = 0.8916
It doesn't do anything to constrain the solution, though.
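If you do want to impose the box constraints (A in 0.4-0.8, B in 0.85-0.99), one sketch, assuming the same dat with the _j columns, is to minimise the residual sum of squares directly with optim() and method = "L-BFGS-B", which accepts lower/upper bounds:
# Box-constrained least squares for A and B
rss <- function(par, data) {
  A <- par[1]; B <- par[2]
  sum(((data$e_j + data$c_j) - (A*data$x_j + B*(data$z_j - data$d_j)))^2)
}
fit <- optim(par = c(A = 0.6, B = 0.9), fn = rss, data = dat,
             method = "L-BFGS-B", lower = c(0.4, 0.85), upper = c(0.8, 0.99))
fit$par # estimates of A and B within the stated ranges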

How to enter data into ggplots

I'm trying to enter the below data into a data frame, to make a ggplot line graph.
#functions for the hh budget and utility functions
pqxf <- function(y)(1*y) # replace p with price of y
pqyf <- function(x)(-1.25*x)+20 # -1.25 is the wage rate
utilityf <- function(x)80*(1/(x)) # 80 is the utility provided
hours <- c(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,20)
# functions are turned into data frames
pqy <- data.frame("consumption" = pqyf(hours))
pqx <- data.frame("leisure" = pqxf(hours))
utility <- data.frame("utility" = utilityf(hours))
# each data frame is combined into a single data frame, used for tables and charts
hh <- data.frame(pqx, pqy, utility, hours)
print(hh)
#this shows the utility, and the cost of x and y, one data frame
library(ggplot2)
ggplot(hh, aes(x = pqx, y = hours)) +
  xlim(0, 20) + ylim(0, 20) + # limits set for the assignment
  labs(x = "leisure (hours)", y = "consumption (units)") +
  geom_line(aes(x = pqx, y = pqy)) +
  geom_line(aes(x = pqx, y = utility)) +
  geom_point(aes(x = 8, y = 10)) + # x and y values of the tangent point
  geom_hline(yintercept = 10, linetype = "dotted") + # y of tangent point
  geom_vline(xintercept = 8, linetype = "dotted") + # x of tangent point
  geom_text(label = "E", x = 8, y = 10, hjust = -1, size = 2) +
  geom_text(label = "-1.25(units/hour) = -w = MRS", x = 9, y = 2, hjust = .02, size = 2) +
  geom_text(label = "U=80", x = 4, y = 19, hjust = 1, size = 2)
When I run this, I get the following message:
Error in is.finite(x) : default method not implemented for type 'list'
Should I store the data in a different format than a data frame, format my data frame differently, or set up ggplot differently, so that it can handle lists?
Inside aes(), refer to the columns of hh by their names: replace pqx with leisure and pqy with consumption. As written, pqx and pqy resolve to the whole data frames in your environment (which are lists), which is what ggplot is complaining about.
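A sketch of the corrected call (only the aesthetic names change; the remaining layers from the question can stay as they are):
ggplot(hh, aes(x = leisure, y = consumption)) +
  xlim(0, 20) + ylim(0, 20) +
  labs(x = "leisure (hours)", y = "consumption (units)") +
  geom_line() + # budget line
  geom_line(aes(y = utility)) # utility curve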
