Find abrupt slopes in data - r

I have a high-resolution vertical profile of a pavement surface with X and Y coordinates and I'm looking for abrupt increases in Y which could be attributed to a trip hazard (classed as a 6 mm increase). I'm using the findpeaks command in pracma but it's not doing what I want (or I'm not using it properly). What I need to do is detect increases in Y of at least 6 mm over a specified distance of X, let's say 100 mm for this example, and to record the maximum value of Y over the increase. Essentially the highest point of the 'trip hazard'.
Here's the data (units are mm for X and Y):
x <- seq(0, 2080, by = 10)
y<- c(1.21, 1.67, 2.10, 2.50, 2.88, 3.24, 3.56, 3.85, 4.11, 4.33, 4.53, 4.70, 4.84, 4.94, 4.99, 4.98, 4.95, 4.95, 4.91, 4.82, 4.80, 4.95, 5.20, 5.39, 5.44, 5.44, 5.48, 5.58, 5.73,
5.93, 6.17, 6.60, 7.13, 7.45, 7.52, 7.53, 7.49, 7.11, 6.46, 6.03, 6.01, 6.16, 6.38, 6.57, 6.78, 7.05, 7.22, 7.14, 6.94, 6.82, 6.80, 6.79, 6.79, 6.86, 7.01, 7.17, 7.26, 7.26,
7.21, 7.14, 7.13, 7.13, 7.04, 6.89, 6.72, 6.43, 5.90, 5.17, 4.42, 3.80, 3.30, 2.81, 2.38, 2.01, 1.69, 1.45, 1.29, 1.20, 1.17, 1.25, 1.44, 1.65, 1.80, 1.94, 2.11, 2.24, 2.19,
2.04, 1.97, 2.05, 2.17, 2.29, 2.39, 2.50, 2.61, 2.70, 2.69, 2.62, 2.61, 2.71, 2.84, 2.97, 3.20, 3.50, 3.71, 3.79, 3.80, 3.77, 3.73, 3.67, 3.60, 3.52, 3.40, 3.24, 3.12, 3.10,
3.14, 3.13, 3.06, 2.96, 2.83, 2.65, 2.32, 1.90, 1.64, 1.62, 1.66, 1.71, 1.85, 2.11, 2.30, 2.37, 2.42, 2.47, 2.53, 2.56, 2.56, 2.59, 2.83, 3.19, 3.43, 3.43, 3.33, 3.19, 2.96,
2.64, 2.34, 2.18, 2.18, 2.22, 2.27, 2.46, 2.78, 2.96, 2.93, 2.83, 2.68, 2.43, 2.05, 1.65, 1.30, 0.98, 0.66, 0.41, 0.15, -0.11, -0.26, -0.28, -0.24, -0.09, 0.30, 0.88, 1.51,
2.06, 2.56, 3.06, 3.49, 3.65, 3.67, 3.92, 4.36, 4.83, 5.47, 6.52, 7.88, 9.30, 10.48, 11.40, 12.24, 13.03, 13.65, 14.12, 14.65, 15.24, 15.81, 16.43, 17.16, 17.97, 18.76,
19.45, 20.04, 20.59, 21.04, 21.39, 21.67, 21.86, 21.95, 21.95, 21.87)
data <- data.frame(x, y)
And here's the code I'm using at the moment:
library(pracma)
plot(x, y, ylim = c(0, 30), xlim = c(0, 2200), cex = 0.2, type = 'o')
grid()
## FROM LEFT TO RIGHT
peaks_1 <- data.frame(findpeaks(data$y, minpeakheight = 6, threshold = 0,
                                nups = 10, ndowns = 0, minpeakdistance = 1, sortstr = FALSE))
## FROM RIGHT TO LEFT
peaks_2 <- data.frame(findpeaks(data$y, minpeakheight = 6, threshold = 0,
                                nups = 0, ndowns = 10, minpeakdistance = 1, sortstr = FALSE))
peaks<-rbind(peaks_1, peaks_2)
colnames(peaks)<-c("y", "X2", "X3", "X4")
peak_points<- data.frame(merge(peaks, data, by='y'))
## NOTE: I HAVE ROUNDED THE RAW DATA FOR THIS EXAMPLE AND SO WHEN THE DATA ARE MERGED,
## IT PRODUCES THREE ADDITIONAL VALUES WHICH WE WILL MANUALLY REMOVE HERE
peak_points<- peak_points[-c(1, 2, 5),]
points(peak_points$x, peak_points$y,pch=19, cex=1,col='maroon')
The one on the right (21.95 mm) seems correct, and maybe the one in the middle (7.13 mm), but the one on the left (7.53 mm) doesn't. Is there a way, using pracma (or anything else), to specify the minimum increase alongside the nups argument?

You could locate the positions approximately by doing a stepwise regression over a number of polynomial terms and using the best fit to get a smooth estimate yhat.
fit <- step(lm(as.formula(paste("y ~ ", paste0("I(x^", 1:(length(x)/2.3), ")",
                                               collapse = " + ")))))
yhat <- fit$fitted.values
Now we look for sign changes in the first differences of yhat: a switch from falling to rising marks a local minimum, and a switch from rising to falling marks a local maximum.
lmin <- which(c(FALSE, diff(diff(yhat) > 0) > 0))
lmax <- which(c(FALSE, diff(diff(yhat) < 0) > 0))
lmax <- lmax[lmax > min(lmin)] ## delete lmax appearing before first lmin
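As a quick sanity check of the sign-change logic, here it is applied to a small made-up vector (illustration only, not part of the profile data):
v <- c(3, 1, 2, 4, 2)
which(c(FALSE, diff(diff(v) > 0) > 0))  # 2: local minimum at v[2]
which(c(FALSE, diff(diff(v) < 0) > 0))  # 4: local maximum at v[4]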
Now we take the rise from each local minimum to the following local maximum, yhat[lmax] - yhat[lmin]; where the rise exceeds 6 we have the positions (POS) we are looking for.
mp <- - mapply(`-`, yhat[lmin], yhat[lmax])
POS <- x[as.numeric(names(mp[mp > 6]))]
Looks like this:
plot(x, y, cex=0.2, type='o', main="Trip hazard")
grid()
lines(x, yhat, col=6, lty=2)
abline(v=x[lmin], lwd=1, lty=3, col=3)
abline(v=x[lmax], lwd=1, lty=3, col=4)
abline(v=POS, col="red", lwd=2)
legend("topleft", legend=c("y", "yhat", "lmin", "lmax", "POS"),
lwd=c(1, 1, 1, 1, 2), lty=c(1, 2, 3, 3, 1), col=c(1, 6, 3, 4, "red"))
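If you also want to record the height of each detected rise (as asked in the question), the fitted and observed heights can be read off at the lmax indices; a small sketch building on the objects above:
# Heights at the detected local maxima of the fitted profile
data.frame(x = x[lmax], y_fitted = round(yhat[lmax], 2), y_observed = y[lmax])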

Here's a simple brute-force approach; if your dataset isn't too big, it should be adequate.
# All the code below assumes that `data` is already sorted by x
# Flag every point within the range of the trip hazard
data$trip_hazard = FALSE
# Iterate over every pair of points
for (i in 1:(nrow(data) - 1)) {
  for (j in (i + 1):nrow(data)) {
    # Get the x-coordinates of the points
    x1 = data$x[i]
    x2 = data$x[j]
    # If the points are no more than 100 mm apart, check whether there's a trip
    # hazard between them
    if (x2 - x1 <= 100) {
      # Get the y-coordinates of the points
      y1 = data$y[i]
      y2 = data$y[j]
      # If there's a rise or fall of at least 6 mm, we have a trip hazard; flag
      # all the points in the range accordingly
      if (abs(y2 - y1) >= 6) {
        data$trip_hazard[i:j] = TRUE
      }
    }
    # If the points are more than 100 mm apart, we don't need to keep checking
    # points that are even further apart
    else {
      break
    }
  }
}
# Get the maximum y-value within each trip hazard
library(dplyr)
library(tidyr)
data = data %>%
  mutate(range_id = ifelse(trip_hazard != coalesce(lag(trip_hazard),
                                                   !trip_hazard),
                           x, NA)) %>%
  fill(range_id) %>%
  group_by(range_id) %>%
  mutate(peak = trip_hazard & y == max(y)) %>%
  ungroup() %>%
  dplyr::select(-range_id)
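To list just the highest point of each hazard as a table rather than plotting it, one can filter on the peak flag created above; a one-line sketch:
# Highest point of each flagged trip hazard (x and y in mm)
data %>% filter(peak) %>% dplyr::select(x, y)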
# Plot the sidewalk (repeated from question)
plot(x, y, ylim = c(0, 30), xlim = c(0, 2200), cex = 0.2, type = "o")
# Plot the trip hazards in red
points(data$x[data$trip_hazard], data$y[data$trip_hazard],
lwd = 4, col = "red", type = "l")
# Plot the highest point within each trip hazard
points(data$x[data$peak], data$y[data$peak], pch = 19, cex = 2, col = "red")
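If the double loop becomes slow on a longer profile, the same check (a rise or fall of at least 6 mm within any 100 mm stretch) can be vectorised with a rolling range. This is a sketch assuming the regular 10 mm spacing of this example, so that a 100 mm window covers 11 consecutive points, and it uses zoo::rollapply (zoo is not used elsewhere in this post):
library(zoo)
window_pts <- 11  # 100 mm at the assumed 10 mm spacing
rng <- rollapply(data$y, width = window_pts,
                 FUN = function(v) diff(range(v)),
                 align = "left", fill = NA)
hazard_start_x <- data$x[which(rng >= 6)]  # left edge of each offending 100 mm window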

I wrote a program whose output is the start and end points of trip hazards. It takes three arguments: the increment (how many x data points are in your interval of interest), the elevation threshold, and the data set. It prints every interval in which the elevation change is greater than allowed, and the order of the reported endpoints shows the direction in which the change was detected.
get.vector.right <- function(i, increment, data){
  return(data$y[i:(i + increment)])
}
get.vector.left <- function(i, increment, data){
  return(data$y[(i - increment):i])
}
get.vector.right.abridged <- function(i, increment, data){
  return(data$y[i:nrow(data)])
}
get.vector.left.abridged <- function(i, increment, data){
  return(data$y[1:i])
}
print.warning <- function(data, i, increment, direction){
  if(direction == "right"){
    print(paste0("Steep change in vertical distance noted between ", data$x[i], " and ", data$x[(i + increment)]))
  } else if(direction == "left"){
    print(paste0("Steep change in vertical distance noted between ", data$x[i], " and ", data$x[(i - increment)]))
  }
}
check.right.up <- function(vector, increment, vertical.distance, data, i){
  if(max(vector) - vector[1] >= vertical.distance){
    print.warning(data, i, increment, "right")
  }
}
check.right.down <- function(vector, increment, vertical.distance, data, i){
  if(vector[1] - min(vector) >= vertical.distance){
    print.warning(data, i, increment, "right")
  }
}
check.left.up <- function(vector, increment, vertical.distance, data, i){
  if(max(vector) - vector[length(vector)] >= vertical.distance){
    print.warning(data, i, increment, "left")
  }
}
check.left.down <- function(vector, increment, vertical.distance, data, i){
  if(vector[length(vector)] - min(vector) >= vertical.distance){
    print.warning(data, i, increment, "left")
  }
}
check.function <- function(left.vector, right.vector, increment, vertical.distance, data, i){
  check.left.up(left.vector, increment, vertical.distance, data, i)
  check.left.down(left.vector, increment, vertical.distance, data, i)
  check.right.up(right.vector, increment, vertical.distance, data, i)
  check.right.down(right.vector, increment, vertical.distance, data, i)
}
trip.function <- function(increment, vertical.distance, data){
  for(i in 1:nrow(data)){
    if(data$x[i] == min(data$x)){
      right.vector <- get.vector.right(i, increment, data)
      check.right.up(right.vector, increment, vertical.distance, data, i)
      check.right.down(right.vector, increment, vertical.distance, data, i)
    } else if(data$x[i] == max(data$x)){
      left.vector <- get.vector.left(i, increment, data)
      check.left.up(left.vector, increment, vertical.distance, data, i)
      check.left.down(left.vector, increment, vertical.distance, data, i)
    } else {
      if(nrow(data[1:i, ]) <= increment){
        left.abridged.vector <- get.vector.left.abridged(i, increment, data)
        right.vector <- get.vector.right(i, increment, data)
        check.function(left.abridged.vector, right.vector, increment, vertical.distance, data, i)
      } else if(nrow(data[i:nrow(data), ]) <= increment){
        right.abridged.vector <- get.vector.right.abridged(i, increment, data)
        left.vector <- get.vector.left(i, increment, data)
        check.function(left.vector, right.abridged.vector, increment, vertical.distance, data, i)
      } else {
        left.vector <- get.vector.left(i, increment, data)
        right.vector <- get.vector.right(i, increment, data)
        check.function(left.vector, right.vector, increment, vertical.distance, data, i)
      }
    }
    rm(right.vector, left.vector, left.abridged.vector, right.abridged.vector)
  }
}
Thus, if you wanted to know whether there were any 6 mm changes within 100 mm, you would type (assuming 10 data points on the x-axis represent 100 mm and the y-axis is recorded in mm):
trip.function(10, 6, data)
and the output would be:
[1] "Steep change in vertical distance noted between 1750 and 1850"
[1] "Steep change in vertical distance noted between 1760 and 1860"
[1] "Steep change in vertical distance noted between 1770 and 1870"
[1] "Steep change in vertical distance noted between 1780 and 1880"
[1] "Steep change in vertical distance noted between 1790 and 1890"
[1] "Steep change in vertical distance noted between 1800 and 1900"
[1] "Steep change in vertical distance noted between 1810 and 1910"
[1] "Steep change in vertical distance noted between 1820 and 1920"
[1] "Steep change in vertical distance noted between 1830 and 1930"
[1] "Steep change in vertical distance noted between 1840 and 1940"
[1] "Steep change in vertical distance noted between 1850 and 1750"
[1] "Steep change in vertical distance noted between 1850 and 1950"
[1] "Steep change in vertical distance noted between 1860 and 1760"
[1] "Steep change in vertical distance noted between 1860 and 1960"
[1] "Steep change in vertical distance noted between 1870 and 1770"
[1] "Steep change in vertical distance noted between 1870 and 1970"
[1] "Steep change in vertical distance noted between 1880 and 1780"
[1] "Steep change in vertical distance noted between 1880 and 1980"
[1] "Steep change in vertical distance noted between 1890 and 1790"
[1] "Steep change in vertical distance noted between 1890 and 1990"
[1] "Steep change in vertical distance noted between 1900 and 1800"
[1] "Steep change in vertical distance noted between 1900 and 2000"
[1] "Steep change in vertical distance noted between 1910 and 1810"
[1] "Steep change in vertical distance noted between 1910 and 2010"
[1] "Steep change in vertical distance noted between 1920 and 1820"
[1] "Steep change in vertical distance noted between 1920 and 2020"
[1] "Steep change in vertical distance noted between 1930 and 1830"
[1] "Steep change in vertical distance noted between 1930 and 2030"
[1] "Steep change in vertical distance noted between 1940 and 1840"
[1] "Steep change in vertical distance noted between 1950 and 1850"
[1] "Steep change in vertical distance noted between 1960 and 1860"
[1] "Steep change in vertical distance noted between 1970 and 1870"
[1] "Steep change in vertical distance noted between 1980 and 1880"
[1] "Steep change in vertical distance noted between 1990 and 1890"
[1] "Steep change in vertical distance noted between 2000 and 1900"
[1] "Steep change in vertical distance noted between 2010 and 1910"
[1] "Steep change in vertical distance noted between 2020 and 1920"
[1] "Steep change in vertical distance noted between 2030 and 1930"
The order of the numbers indicates the direction: "2030 and 1930" indicates movement from x = 2030 to x = 1930 (leftward), and vice versa.
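The question also asks for the highest point of each hazard; once an interval is reported (for example the first one, 1750 to 1850), that is a one-liner on top of the output above (rng is a name introduced here):
rng <- data[data$x >= 1750 & data$x <= 1850, ]
rng[which.max(rng$y), c("x", "y")]  # highest point within that interval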

Related

How can I make a plot of first derivative of loglikelihood of Cauchy Distribution for different thetas in R

I have a set of observations from a Cauchy(theta, 1) distribution, and I have a plot of the log-likelihood against a range of test values:
obs=c(1.77, -0.23, 2.76, 3.80, 3.47, 56.75, -1.34, 4.24, -2.44, 3.29, 3.71, -2.40, 4.53, -0.07, -1.05, -13.87, -2.53, -1.75, 0.27, 43.21)
ll_c = function(theta, obs){  # define log-likelihood function for Cauchy(θ, 1) distribution
  logl = sum(dcauchy(obs, location = theta, scale = 1, log = TRUE))
  return(logl)
}
x = seq(from = -10, to = 10, by = 0.1)  # create test values
ll = NULL
for (i in x){
  ll = c(ll, ll_c(i, obs))  # evaluate ll_c for all test values and store
}
plot(x, ll)
I also need to plot the first derivative of the log-likelihood function against the same test values, and I cannot figure out how to do so.
fdll_c = function(theta, obs){
  Dlogl = D(sum(dcauchy(obs, location = theta, scale = 1, log = TRUE)), 'theta')
  return(Dlogl)
}
fdll = NULL
for (j in x){
  fdll = c(fdll, fdll_c(j, obs))
}
plot(x, fdll)
I have tried different variations on this code, but every time it has come back with an error or with a derivative of 0 at all points.
Maybe the following answers the question.
It uses an explicit log-likelihood partial derivative function and then applies it to a vector around 0.
obs <- c(1.77, -0.23, 2.76, 3.80, 3.47, 56.75, -1.34, 4.24, -2.44, 3.29, 3.71, -2.40, 4.53, -0.07, -1.05, -13.87, -2.53, -1.75, 0.27, 43.21)
dll_theta <- function(x, theta, scale){
  cc <- (x - theta)/scale
  -2*sum(1/cc)/scale
}
x <- seq(from = -10, to = 10, by = 0.001)
y <- sapply(x, function(.x) dll_theta(obs, theta = .x, scale = 1))
i <- which(abs(y) > 1e15)
plot(x[-i], y[-i], pch = ".")
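For reference, the partial derivative of the Cauchy(theta, 1) log-likelihood can also be written in the textbook closed form, the sum over i of 2*(x_i - theta)/(scale^2 + (x_i - theta)^2), which stays finite for every theta. A hedged sketch (dll_theta2 is a name introduced here, not part of the answer above):
dll_theta2 <- function(x, theta, scale = 1){
  2 * sum((x - theta) / (scale^2 + (x - theta)^2))
}
y2 <- sapply(x, function(.t) dll_theta2(obs, theta = .t))  # x is the grid of test values above
plot(x, y2, type = "l")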

Gaussian kernel density estimation in R

I am having trouble understanding how to implement a Gaussian kernel density estimate of the following dataset in R, and I would appreciate help understanding the mechanics of it. I am currently trying to get a formula for the bell shaped curves at the bottom of the following picture. As you can see, there is one bell shaped curve for each data point. (Note that the picture does not represent the data I am using.)
This is my data:
x<-c(4.09, 4.46, 4.61, 4.30, 4.03, 5.22, 4.21, 4.07, 4.02, 4.58, 4.66, 4.05, 4.23, 5.51, 4.03, 4.72, 4.47, 4.50, 5.80, 4.30, 4.09, 4.78, 4.18, 4.45, 4.40, 5.60, 4.37, 4.42, 4.88, 4.20, 4.45, 4.10, 4.43, 4.58, 4.40, 4.38)
(x has 36 elements)
This is the kernel density estimator (the formula image is from http://sfb649.wiwi.hu-berlin.de/fedc_homepage/xplore/tutorials/xlghtmlnode33.html):
f_hat(x) = (1/(n*h)) * sum_{i=1}^{n} K((x - x_i)/h),
where K(u) = (1/sqrt(2*pi)) * exp(-u^2/2) is the Gaussian kernel function and h = 0.1516 is the bandwidth selected by Scott.
So, plugging in, we get f_hat(x) = (1/(36*0.1516)) * (1/sqrt(2*pi)) * [exp(-1/2*((4.09-x)/0.1516)^2) + exp(-1/2*((4.46-x)/0.1516)^2) + ... + exp(-1/2*((4.38-x)/0.1516)^2)].
OK. So we have a function of x. But how do we get the equation of each of the bell shaped curves in the above diagram? If we plug in, for example, 4.09 into f_hat(x), we get a number, not a curve/function/distribution. Can someone help me understand the procedure to find the equation for each bell shaped curve / kernel density estimate?
Here's a function that will return your fhat function given your x values and h value
get_fhat <- function(x, h) {
  Vectorize(function(z) 1/length(x)/h * sum(dnorm((x - z)/h)))
}
This function returns a function that we can use to get values. We Vectorize it so we can pass in multiple values at once to the function.
We can get a single value or plot it with
fhat <- get_fhat(x, .1516)
fhat(4.09)
# [1] 0.9121099
curve(fhat, from=min(x), to=max(x))
Graph
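Each bell shaped curve in the question's picture is just one term of that sum, (1/(n*h)) * dnorm((z - x_i)/h), plotted as a function of z. A minimal sketch of one such curve (zs and one_bell are names introduced here, not part of the answer above):
h  <- 0.1516
zs <- seq(min(x) - 0.5, max(x) + 0.5, by = 0.01)
one_bell <- function(xi) (1/(length(x)*h)) * dnorm((zs - xi)/h)
plot(zs, one_bell(x[1]), type = "l")  # the bell centred on the first data point, x[1] = 4.09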
## Given data
x <- c(4.09, 4.46, 4.61, 4.30, 4.03, 5.22, 4.21, 4.07, 4.02, 4.58, 4.66, 4.05,
4.23, 5.51, 4.03, 4.72, 4.47, 4.50, 5.80, 4.30, 4.09, 4.78, 4.18, 4.45,
4.40, 5.60, 4.37, 4.42, 4.88, 4.20, 4.45, 4.10, 4.43, 4.58, 4.40, 4.38)
h <- 0.1516
# GaussianKernel
GK <- function(u) {(1/sqrt(2*pi))*exp(-(u^2)/2)} # or dnorm(u)
The following function gives a similar plot.
DensityGraph <- function(x, h){
  n <- length(x)
  xi <- seq(min(x) - sd(x), max(x) + sd(x), length.out = 512)
  # fhat without the sum, since we are interested in the individual bell shaped curves
  fhat <- sapply(x, function(y){(1/(n*h))*GK((xi - y)/h)})
  # histogram of x
  hist(x, freq = FALSE, nclass = 15, main = "Kernel density with histogram",
       xlab = paste("N = ", n, " ", "Bandwidth = ", h))
  # add fhat with the sum
  lines(xi, rowSums(fhat), lwd = 2)
  # add the bell shaped curves
  apply(fhat, 2, function(j) lines(xi, j, col = 4))
  # show data points
  rug(x, lwd = 2, col = 2)
}
DensityGraph(x = x, h = 0.05)
Blue bell shaped curves represent each data point of x
DensityGraph(x = x, h = 0.1516)
Compare with the built-in density function in R:
lines(density(x = x, bw = 0.1516), col = 3, lwd = 2)
fhat for each x
This function gives the value of fhat given a specific x
fhat <- function(x, h, specific_x){
  n <- length(x)
  xi <- seq(min(x) - sd(x), max(x) + sd(x), length.out = 512)
  f <- rowSums(sapply(x, function(y){(1/(n*h))*GK((xi - y)/h)}))
  kde <- data.frame(xi, fhat = f)
  indx <- which.min(abs(xi - specific_x))
  fx <- kde[indx, "fhat"]
  list(fx = fx, kde = kde)
}
KernelDensity <- fhat(x = x, h = 0.1516, specific_x = 4.09)
KernelDensity$fx
# [1] 0.9114677
plot(KernelDensity$kde, type = "l", lwd = 2, xlab = "")
title(xlab = paste("N = ", length(x), " Bandwidth = ", h))
rug(x, lwd = 2, col = 2)
Compare with the built-in density function:
lines(density(x, bw = 0.1516), col = 5)

Creating a 2D-grid or raster in R comparing all respondents with all variables

A reproducible example of my data:
df_1 <- data.frame(cbind("Thriving" = c(2.33, 4.21, 6.37, 5.28, 4.87, 3.92, 4.16, 5.53), "Satisfaction" = c(3.45, 4.53, 6.01, 3.87, 2.92, 4.50, 5.89, 4.72), "Wellbeing" = c(2.82, 3.45, 5.23, 3.93, 6.18, 4.22, 3.68, 4.74), "id" = c(1:8)))
As you can see, it includes three psychological measures and an id identifying each respondent.
Now, my aim is to create a 2D grid giving a nice overview of all the values for all respondents on each of the variables. On the x-axis I would have the id of each respondent and on the y-axis all variables, with the colour of each field depending on the value: 1 to 3 in red, 3 to 5 in yellow and 5 to 7 in green. The style of the grid should be like this image.
All I have achieved so far is the following code, which gathers all the variables/items into one column so they can be shown together on the y-axis; the id is of course kept in its own column, as are the values:
df_1 %>%
  select("Thr" = Thriving, "Stf" = Satisfaction, "Wb" = Wellbeing, "id" = id) %>%
  na.omit %>%
  gather(key = "variable", value = "value", -id)
I am looking for a solution that works without storing the data in a new frame.
Also, I am looking for a solution that would be useful for 100 or more respondents and up to about 40 variables. It would not matter if individual rectangles ended up very small; I just want a nice play of colour that gives a sense of where an organisation scores low or high, and how it is doing in general.
Thanks for reading, very grateful for any help!
There is probably a better graphics-oriented approach, but you can do this with base plot by treating your data as a raster:
library(raster)
df_1 <- cbind("Thriving" = c(2.33, 4.21, 6.37, 5.28, 4.87, 3.92, 4.16, 5.53), "Satisfaction" = c(3.45, 4.53, 6.01, 3.87, 2.92, 4.50, 5.89, 4.72), "Wellbeing" = c(2.82, 3.45, 5.23, 3.93, 6.18, 4.22, 3.68, 4.74), "id" = c(1:8))
r <- raster(ncol=nrow(df_1), nrow=3, xmn=0, xmx=8, ymn=0, ymx=3)
values(r) <- as.vector(as.matrix(df_1[,1:3]))
plot(r, axes=F, box=F, asp=NA)
axis(1, at=seq(-0.5, 8.5, 1), 0:9)
axis(2, at=seq(-0.5, 3.5, 1), c("", colnames(df_1)), las=1)
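If you prefer to stay in the tidyverse pipeline you already started, a geom_tile sketch along the same lines might look like the following (this assumes ggplot2 is available and uses the data.frame version of df_1 from the question; none of this comes from the answer above):
library(dplyr)
library(tidyr)
library(ggplot2)
data.frame(df_1) %>%                 # data.frame version, as defined in the question
  pivot_longer(-id, names_to = "variable", values_to = "value") %>%
  ggplot(aes(factor(id), variable, fill = value)) +
  geom_tile(colour = "white") +
  scale_fill_gradientn(colours = c("red", "yellow", "green"), limits = c(1, 7)) +
  labs(x = "id", y = NULL)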

How to fit a linear regression in R with a fixed negative intercept?

Background: Species richness scales with body weight to a power of -0.75. However, when I fit my data, I get a value of 0.57. A friend told me that summary(lm) prints the best-fit slope of the data. Nevertheless, I'm wondering if I can create a regression plot in which I force the slope to be -0.75, as in the literature.
My code is:
# y value
log.nspecies.dec = c(3.05, 2.95, 2.97, 2.98, 2.84, 2.85, 2.83, 2.71, 2.64, 2.62, 2.58, 2.37,
                     2.26, 2.17, 2.00, 1.88, 1.75, 1.62, 1.36, 1.30, 1.08, 1.20, 0.90, 0.30, 0.70,
                     0.30, 0.48, 0.00, 0.30, 0.00)
# x value
logbio.dec = c(2.1, 2.3, 2.5, 2.7, 2.9, 3.1, 3.3, 3.7, 3.9, 4.1, 4.3, 4.5, 4.7, 4.9, 5.1,
               5.3, 5.5, 5.7, 5.9, 6.1, 6.3, 6.5, 6.7, 6.9, 7.1, 7.3, 7.5, 7.7, 7.9)
# plot a barplot and superimpose a regression line
# name the y variables with the x
names(log.nspecies.dec) = logbio.dec
# order the y variables
log.nspecies.dec = log.nspecies.dec[order(as.numeric(names(log.nspecies.dec)))]
# do the barplot
xpos = barplot(log.nspecies.dec, las = 2, space = 0)
lm.fit = lm(log.nspecies.dec ~ as.numeric(names(log.nspecies.dec)))
summary(lm.fit)
y.init = lm.fit$coefficients[2] * as.numeric(names(log.nspecies.dec))[1] +
  lm.fit$coefficients[1]
y.end = lm.fit$coefficients[2] * as.numeric(names(log.nspecies.dec))[length(log.nspecies.dec)] +
  lm.fit$coefficients[1]
segments(xpos[1], y.init, xpos[length(xpos)], y.end, lwd = 2, col = 'red')
title(main = 'ln Number of species ~ ln Weight')
coef(lm.fit)
gives a result wherein the slope is 0.57. How do I force the slope to -0.75?
You can use offset to fix the y-intercept at a negative value. For example
## Example data
x = 1:10
y = -2 + 2* x
# Fit the model
(m = lm(y ~ 0 + x, offset = rep(-2, length(y))))
#Call:
#lm(formula = y ~ 0 + x, offset = rep(-2, length(y)))
#Coefficients:
#x
#2
The output correctly identifies the gradient as 2.
One caveat: if you then draw the fitted line with abline(m), it will be wrong. Looking at ?abline, it draws the line from coef(MODEL), and when you use offset, coef() does not return the y-intercept (it is absorbed into the offset).
R> coef(m)
x
2
Hence abline draws the wrong line.
If the intercept is changed, the code still works
x = 1:10
y = 2 + 2*x
lm(y ~ 0 + x, offset = rep(2, length(y)))
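If what you actually want, as in the question body, is to hold the slope at -0.75 and estimate only the intercept, the same offset idea works with the roles swapped. A sketch using the question's variables (xv and fit.fixed are names introduced here):
xv <- as.numeric(names(log.nspecies.dec))
fit.fixed <- lm(log.nspecies.dec ~ 1, offset = -0.75 * xv)
coef(fit.fixed)  # only the intercept is estimated; the slope is held at -0.75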

Negative exponential fit: curve looks too high

I am trying to fit a negative exponential to some data in R, but the fitted line looks too high compared to the data, whereas the fit I get using Excel's built-in power fit looks more believable. Can someone tell me why? I've tried using the nls() function and also optim() and get similar parameters from both of those methods, but the fits for both look high.
x <- c(5.96, 12.86, 8.40, 2.03, 12.84, 21.44, 21.45, 19.97, 8.92, 25.00, 19.90, 20.00, 20.70, 16.68, 14.90, 26.00, 22.00, 22.00, 10.00, 5.70, 5.40, 3.20, 7.60, 0.59, 0.14, 0.85, 9.20, 0.79, 1.40, 2.68, 1.91)
y <- c(5.35, 2.38, 1.77, 1.87, 1.47, 3.27, 2.01, 0.52, 2.72, 0.85, 1.60, 1.37, 1.48, 0.39, 2.39, 1.83, 0.71, 1.24, 3.14, 2.16, 2.22, 11.50, 8.32, 38.98, 16.78, 32.66, 3.89, 1.89, 8.71, 9.74, 23.14)
xy.frame <- data.frame(x,y)
nl.fit <- nls(formula=(y ~ a * x^b), data=xy.frame, start = c(a=10, b=-0.7))
a.est <- coef(nl.fit)[1]
b.est <- coef(nl.fit)[2]
plot(x=xy.frame$x,y=xy.frame$y)
# curve looks too high
curve(a.est * x^b.est , add=T)
# these parameters from Excel seem to fit better
curve(10.495 * x^-0.655, add=T)
# alternatively use optim()
theta.init <- c(1000, -0.5, 50)
exp.nll <- function(theta, data){
  a <- theta[1]
  b <- theta[2]
  sigma <- theta[3]
  obs.y <- data$y
  x <- data$x
  pred.y <- a*x^b
  nll <- -sum(dnorm(x = obs.y, mean = pred.y, sd = sigma, log = TRUE))
  nll
}
fit.optim <- optim(par = theta.init, fn = exp.nll, method = "BFGS", data = xy.frame)
plot(x=xy.frame$x,y=xy.frame$y)
# still looks too high
curve(a.est * x^b.est, add=T)
The reason you're seeing the unexpected behavior is that the curves that look "too high" actually have a much lower sum of squared errors than the curve from Excel:
# Fit from nls
sum((y - a.est*x^b.est)^2)
# [1] 1588.313
# Fit from excel
sum((y - 10.495*x^ -0.655)^2)
# [1] 1981.561
The reason nls favors the higher curve is that it is working to avoid huge errors at small x values at the cost of slightly larger errors with large x values. One way to address this might be to apply a log-log transformation:
mod <- lm(log(y)~log(x))
(a.est2 <- exp(coef(mod)["(Intercept)"]))
# (Intercept)
# 10.45614
(b.est2 <- coef(mod)["log(x)"])
# log(x)
# -0.6529741
These are quite close to the coefficients from Excel, and they yield a more visually appealing fit (despite the worse performance on the sum-of-squared-errors metric).
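To see the difference on the original scale, one could overlay both fits on the scatterplot (a short sketch using the objects defined above):
plot(x = xy.frame$x, y = xy.frame$y)
curve(a.est * x^b.est, add = TRUE, col = "red", lty = 2)  # nls fit (sits higher)
curve(a.est2 * x^b.est2, add = TRUE, col = "blue")        # log-log fit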
