I have a sample data set
ID Depth Salinity Temperature Time fluorescence
1 0 1.3 29.2 13:44:23 152
2 3.1 1.4 29.2 13:44:26 175
3 3.5 2 29.2 13:44:30 149
4 4.3 2.6 29.2 13:44:34 192
5 7.5 2.9 29.4 13:44:37 174
6 8.2 2.1 29.1 13:44:41 154
7 10 2.6 29.1 13:44:44 147
8 9.1 2.6 29.1 13:44:48 150
9 7.3 2.7 28.9 13:44:52 147
10 5.2 3.2 29.0 13:44:55 180
11 4.5 2 29.0 13:44:59 167
12 3.3 2.3 29.1 13:45:03 154
13 2.5 1.8 29.1 13:45:06 106
14 0 1.5 29.1 13:45:10 136
I want two profiles, Up and Down, i.e. from depth 0-10 and 10-0, in the same plot. I used the code below to generate a plot
# Tag the first seven rows as "UP" and the last seven as "DOWN".
meltdf <- mutate(meltdf, trend = rep(c("UP", "DOWN"), each = 7))
# Build the plot. The chain must end at geom_line(): a trailing `+` would
# try to add `p` to the plot before `p` exists.
p <- ggplot(meltdf, aes(x = Temperature, y = Depth, color = trend)) +
  geom_line()
p
I get the plot with this. However, what I want is Depth on the y axis and Salinity, Temperature and fluorescence on multiple x axes in the same graph. As they have varying ranges, I don't know how I should set this up.
Also, the data I have is quite big, and when I plot it I don't get a smooth curve (pic R plot) in my result. Is there a way to avoid those spikes?
You might be looking for something like this
Your data
df <- structure(list(ID = 1:14, Depth = c(0, 3.1, 3.5, 4.3, 7.5, 8.2,
10, 9.1, 7.3, 5.2, 4.5, 3.3, 2.5, 0), Salinity = c(1.3, 1.4,
2, 2.6, 2.9, 2.1, 2.6, 2.6, 2.7, 3.2, 2, 2.3, 1.8, 1.5), Temperature = c(29.2,
29.2, 29.2, 29.2, 29.4, 29.1, 29.1, 29.1, 28.9, 29, 29, 29.1,
29.1, 29.1), Time = c("13:44:23", "13:44:26", "13:44:30", "13:44:34",
"13:44:37", "13:44:41", "13:44:44", "13:44:48", "13:44:52", "13:44:55",
"13:44:59", "13:45:03", "13:45:06", "13:45:10"), fluorescence = c(152L,
175L, 149L, 192L, 174L, 154L, 147L, 150L, 147L, 180L, 167L, 154L,
106L, 136L)), .Names = c("ID", "Depth", "Salinity", "Temperature",
"Time", "fluorescence"), row.names = c(NA, -14L), class = c("data.table",
"data.frame"))
library(tidyverse)
meltdf <- mutate(df, trend = c(rep("UP",7), rep("DOWN",7)))
solution
Starting with meltdf, gather relevant x-axis variables
# Reshape to long form: one row per (observation, variable), so that each
# x-axis variable can get its own facet.
# pivot_longer() replaces the superseded gather().
moremelt <- meltdf %>%
  pivot_longer(
    cols = c(Salinity, Temperature, fluorescence),
    names_to = "key",
    values_to = "value"
  )
ggplot with facet_wrap using options nrow = 3 and scales = "free"
# One facet per variable, each with its own free axes; colour separates the
# UP and DOWN casts within each variable.
# geom_path() joins points in row order, so each profile follows the cast
# through depth; geom_line() would re-sort the points by their x value.
ggplot(moremelt, aes(x = value, y = Depth, color = interaction(trend, key))) +
  geom_path(lwd = 2) +
  scale_colour_manual(
    values = c("orange", "red", "blue", "cyan", "black", "grey")
  ) +
  facet_wrap(~key, nrow = 3, scales = "free")
Related
data <- structure(list(A_w = c(0, 0.69, 1.41, 2.89, 6.42, 13.3, 25.5,
36.7, 44.3, 46.4), E_w = c(1.2, 1.2, 1.5, 1.6, 1.9, 2.3, 3.4,
4.4, 10.6, 16.5), A_e = c(0, 0.18, 0.37, 0.79, 1.93, 4.82, 11.4,
21.6, 31.1, 36.2), E_e = c(99.4, 99.3, 98.9, 98.4, 97.1, 93.3,
84.7, 71.5, 58.1, 48.7)), row.names = c(NA, -10L), class = "data.frame")
data
#> A_w E_w A_e E_e
#> 1 0.00 1.2 0.00 99.4
#> 2 0.69 1.2 0.18 99.3
#> 3 1.41 1.5 0.37 98.9
#> 4 2.89 1.6 0.79 98.4
#> 5 6.42 1.9 1.93 97.1
#> 6 13.30 2.3 4.82 93.3
#> 7 25.50 3.4 11.40 84.7
#> 8 36.70 4.4 21.60 71.5
#> 9 44.30 10.6 31.10 58.1
#> 10 46.40 16.5 36.20 48.7
Created on 2021-05-31 by the reprex package (v2.0.0)
I am trying to plot this data with all A values as X and Es as Y. How can I put either a) both of these columns plotted on a ggplot2, or b) rearrange this dataframe to combine the A columns and E columns into a final dataframe with only two columns with 2x as many rows as pictured?
Thanks for any help, I am a beginner (obviously)
Edit for Clarity: It's important that the A_e & E_e values remain as pairs, similar to how the A_w and E_w values remain as pairs. The end result plot should resemble the ORANGE and BLUE lines of this image, but I am trying to replicate this while learning R.
Currently I am capable of plotting each separately when dividing into two dataframes of 2x10
A_w E_w
1 0.00 1.2
2 0.69 1.2
3 1.41 1.5
4 2.89 1.6
5 6.42 1.9
6 13.30 2.3
7 25.50 3.4
8 36.70 4.4
9 44.30 10.6
10 46.40 16.5
and the second plot
# A tibble: 10 x 2
A_e E_e
<dbl> <dbl>
1 0 99.4
2 0.18 99.3
3 0.37 98.9
4 0.79 98.4
5 1.93 97.1
6 4.82 93.3
7 11.4 84.7
8 21.6 71.5
9 31.1 58.1
10 36.2 48.7
But my end goal is to have them both on the same plot, like in the Excel graph (orange + blue graph) above.
Here is a try
library(dplyr)
library(ggplot2)
# Segment table for the (A_w, E_w) series: each row holds a point plus the
# next point (xend, yend) taken with lead(). The last row has no successor,
# so its xend is NA and it is filtered out.
line_1_data <- data %>%
select(A_w, E_w) %>%
mutate(xend = lead(A_w), yend = lead(E_w)) %>%
filter(!is.na(xend))
# Same construction for the (A_e, E_e) series.
line_2_data <- data %>%
select(A_e, E_e) %>%
mutate(xend = lead(A_e), yend = lead(E_e)) %>%
filter(!is.na(xend))
# Plot both column pairs on one set of axes, using a different geom per layer
ggplot(data = data) +
# The blue line
geom_point(aes(x = A_w, y = E_w), color = "blue") +
geom_curve(data = line_1_data, aes(x = A_w, y = E_w, xend = xend,
yend = yend), color = "blue",
curvature = 0.02) +
# The orange line
geom_point(aes(x = A_e, y = E_e), color = "orange") +
geom_curve(data = line_2_data,
aes(x = A_e, y = E_e, xend = xend, yend = yend), color = "orange",
curvature = -0.02) +
# The red connection between the last points of the two lines
geom_curve(data = tail(data, 1),
aes(x = A_w, y = E_w, xend = A_e, yend = E_e), curvature = 0.1,
color = "red") +
# Black straight segments (curvature = 0) joining each w point to its e pair
geom_curve(
aes(x = A_w, y = E_w, xend = A_e, yend = E_e), curvature = 0,
color = "black")
Created on 2021-05-31 by the reprex package (v2.0.0)
You may try from this
# Same values as the data set posted in the question (E_e[8] is 71.5).
data <- data.frame(
  A_w = c(0, 0.69, 1.41, 2.89, 6.42, 13.3, 25.5, 36.7, 44.3, 46.4),
  E_w = c(1.2, 1.2, 1.5, 1.6, 1.9, 2.3, 3.4, 4.4, 10.6, 16.5),
  A_e = c(0, 0.18, 0.37, 0.79, 1.93, 4.82, 11.4, 21.6, 31.1, 36.2),
  E_e = c(99.4, 99.3, 98.9, 98.4, 97.1, 93.3, 84.7, 71.5, 58.1, 48.7)
)
library(tidyverse)
data %>% pivot_longer(everything(), names_sep = '_', names_to = c('.value', 'type')) %>%
ggplot(aes(x = A, y = E, color = type)) +
geom_point() +
geom_line()
Created on 2021-05-31 by the reprex package (v2.0.0)
Doing it "by hand":
# dummy data:
df <- data.frame(
  A_w = rnorm(10), E_w = rnorm(10),
  A_e = rnorm(10), E_e = rnorm(10)
)
# Stack the w pairs on top of the e pairs. The second half of E must come
# from E_e (not A_e) so that every A value stays paired with its own E value.
df2 <- data.frame(A = c(df$A_w, df$A_e), E = c(df$E_w, df$E_e))
Output:
> df2
A E
1 1.25522468 -0.2441768
2 -0.50585191 -0.1383637
3 0.42374270 -0.9664189
4 -0.39858532 -0.3442157
5 -1.05665363 -1.3574362
6 0.79191788 -0.8202841
7 -1.31349592 0.7280619
8 -0.05609851 0.6365495
9 1.01068811 2.0222241
10 -1.15572972 -0.2190794
11 0.15579931 0.1557993
12 1.58834329 1.5883433
13 1.24933622 1.2493362
14 -0.28197439 -0.2819744
15 0.30593184 0.3059318
16 0.75486103 0.7548610
17 1.19394302 1.1939430
18 -1.79955846 -1.7995585
19 0.59688655 0.5968865
20 0.71519048 0.7151905
And for the plot: ggplot(df2, aes(x=A, y=E)) + geom_point()
Output:
There are ways to do this without having to join the columns by listing their names - with the tidyr package - but I think that this solution is easier to understand from a beginner's point of view.
Trying to run a OLS regression model in R.
data = read.csv("C:/.../VOLATILITY.csv")
head(data)
volt LfquantBS HfquantBS LfbankVOL HfbankMM HfnonbankMM HfindMM
1 18.23 3.7 9.2 3.2 2.6 35.3 7.9
2 16.09 4.1 11.4 3.2 2.7 35.3 8.2
3 16.79 4.1 11.4 3.2 2.7 35.3 8.2
4 17.01 4.1 11.4 3.2 2.7 35.3 8.2
5 16.09 4.1 11.4 3.2 2.7 35.3 8.2
6 19.66 6.2 10.5 4.2 1.8 30.7 8.6
model <- lm(volt ~ lfquantBS + HfquantBs + LfbankVOL + HfbankMM + HfnonbankMM
+ HfindMM)
Error in eval(predvars, data, env) : object 'volt' not found
Have done this before without any problem. Any help appreciated.
The call needs a `data` argument, because the columns volt, LfquantBS, etc. exist only within the data.frame object named 'data'. In addition, case is important: the formula uses lfquantBS and HfquantBs, while the dataset columns are named LfquantBS and HfquantBS
lm(volt ~ LfquantBS + HfquantBS + LfbankVOL + HfbankMM +
HfnonbankMM + HfindMM, data = data)
-output
Call:
lm(formula = volt ~ LfquantBS + HfquantBS + LfbankVOL + HfbankMM +
HfnonbankMM + HfindMM, data = data)
Coefficients:
(Intercept) LfquantBS HfquantBS LfbankVOL HfbankMM HfnonbankMM HfindMM
23.2866 1.0846 -0.9858 NA NA NA NA
Regarding the comment Have done this before without any problem. It is possible that the OP may have attach(data) in the past to create those columns as objects in the global env or have created those as vector objects first before constructing the data.frame
data
data <- structure(list(volt = c(18.23, 16.09, 16.79, 17.01, 16.09, 19.66
), LfquantBS = c(3.7, 4.1, 4.1, 4.1, 4.1, 6.2), HfquantBS = c(9.2,
11.4, 11.4, 11.4, 11.4, 10.5), LfbankVOL = c(3.2, 3.2, 3.2, 3.2,
3.2, 4.2), HfbankMM = c(2.6, 2.7, 2.7, 2.7, 2.7, 1.8), HfnonbankMM = c(35.3,
35.3, 35.3, 35.3, 35.3, 30.7), HfindMM = c(7.9, 8.2, 8.2, 8.2,
8.2, 8.6)), class = "data.frame", row.names = c("1", "2", "3",
"4", "5", "6"))
I want to calculate the correlation between 'y' column and each column in 'col_df' dataframe.
For each calculation I want to save only the columns name with significant p_value (p_value<0.05).
y is a vector 64X1 of 0 and 1.
Example of the col_df- 60X12000
a b c d e
7.6 4.9 8.9 6.0 4.2
25.0 6.5 4.6 13.2 3.0
col_df <- as.matrix(df)
test <- col_df[, apply(col_df, MARGIN = 2, FUN = function(x)
(cor.test(y, col_df[,x], method = "pearson")$p.value <0.05))]
This is the error:
Error in col_df[, x] : subscript out of bounds
Is this the way to do that?
This is a working solution:
df <- structure(
  list(
    a = c(7.6, 7.6, 25, 25, 25, 25, 7.6, 7.6, 7.6, 25),
    b = c(4.9, 4.9, 6.5, 6.5, 4.9, 6.5, 4.9, 4.9, 6.5, 6.5),
    c = c(8.9, 4.6, 8.9, 8.9, 8.9, 4.6, 4.6, 8.9, 8.9, 4.6),
    d = c(13.2, 13.2, 6, 6, 6, 6, 6, 13.2, 13.2, 13.2),
    e = c(3, 4.2, 3, 4.2, 3, 3, 3, 4.2, 4.2, 4.2)
  ),
  class = "data.frame", row.names = c(NA, -10L)
)
y <- c(1L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L)
# apply() hands each column to FUN as the vector x, so x is tested directly.
# drop = FALSE keeps the result a data frame (and so keeps the column name)
# even when only one column turns out to be significant.
test <- df[, apply(df, MARGIN = 2, FUN = function(x)
  (cor.test(y, x, method = "pearson")$p.value < 0.05)), drop = FALSE]
test
#> a b
#> 1 7.6 4.9
#> 2 7.6 4.9
#> 3 25.0 6.5
#> 4 25.0 6.5
#> 5 25.0 4.9
#> 6 25.0 6.5
#> 7 7.6 4.9
#> 8 7.6 4.9
#> 9 7.6 6.5
#> 10 25.0 6.5
The difference from your solution is that apply() gives you the column itself as x and
not an index. Hence, all you have to do is replace col_df[,x] in your solution with
just x.
You can simplify it a little with sapply(). I also recommend not to put everything into
a single line. It is hard to read and harder to debug.
# vapply() is the type-stable form of sapply(): exactly one logical flag per
# column, TRUE when the Pearson correlation with y has p < 0.05.
Columns <- vapply(
  df,
  FUN = function(x) cor.test(y, x, method = "pearson")$p.value < 0.05,
  FUN.VALUE = logical(1)
)
# drop = FALSE keeps a data frame even if only one column is selected.
test <- df[, Columns, drop = FALSE]
test
#> a b
#> 1 7.6 4.9
#> 2 7.6 4.9
#> 3 25.0 6.5
#> 4 25.0 6.5
#> 5 25.0 4.9
#> 6 25.0 6.5
#> 7 7.6 4.9
#> 8 7.6 4.9
#> 9 7.6 6.5
#> 10 25.0 6.5
Created on 2020-07-22 by the reprex package (v0.3.0)
Is there a way in R to carry out an ANOVA test from a table of data that looks as follows:
Trees Avg_number_1m Avg_number_2m Avg_number_3m Avg_number_4m
1 Tree_1 15.2 15.0 15.2 12.0
2 Tree_2 16.2 15.4 14.2 15.4
3 Tree_3 14.4 9.2 3.2 1.6
4 Tree_4 14.6 5.6 10.4 9.2
5 Tree_5 15.2 13.0 7.4 3.0
6 Tree_6 14.0 12.0 13.0 11.2
7 Tree_7 13.8 7.8 7.2 2.0
8 Tree_8 10.8 5.8 4.4 2.4
9 Tree_9 12.4 9.6 6.8 2.6
10 Tree_10 15.6 11.0 7.2 1.8
11 Tree_11 7.6 7.4 9.0 1.8
12 Tree_12 13.8 7.8 7.2 2.0
13 Tree_13 10.8 5.8 4.4 1.6
14 Tree_14 15.2 15.0 15.2 12.0
15 Tree_15 16.2 15.4 14.2 15.0
16 Tree_16 12.4 9.2 3.2 1.6
17 Tree_17 14.6 5.6 10.4 9.2
18 Tree_18 15.2 13.0 7.4 3.0
19 Tree_19 14.0 14.4 13.2 13.8
20 Tree_20 11.0 5.2 4.4 0.8
I've tried to find tutorials on how to do this but the fact that the aov command requires one x and one y variable has been throwing me off. Any help is much appreciated.
So this is your data:
x = structure(list(Trees = structure(c(1L, 12L, 14L, 15L, 16L, 17L,
18L, 19L, 20L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 13L), .Label = c("Tree_1",
"Tree_10", "Tree_11", "Tree_12", "Tree_13", "Tree_14", "Tree_15",
"Tree_16", "Tree_17", "Tree_18", "Tree_19", "Tree_2", "Tree_20",
"Tree_3", "Tree_4", "Tree_5", "Tree_6", "Tree_7", "Tree_8", "Tree_9"
), class = "factor"), Avg_number_1m = c(15.2, 16.2, 14.4, 14.6,
15.2, 14, 13.8, 10.8, 12.4, 15.6, 7.6, 13.8, 10.8, 15.2, 16.2,
12.4, 14.6, 15.2, 14, 11), Avg_number_2m = c(15, 15.4, 9.2, 5.6,
13, 12, 7.8, 5.8, 9.6, 11, 7.4, 7.8, 5.8, 15, 15.4, 9.2, 5.6,
13, 14.4, 5.2), Avg_number_3m = c(15.2, 14.2, 3.2, 10.4, 7.4,
13, 7.2, 4.4, 6.8, 7.2, 9, 7.2, 4.4, 15.2, 14.2, 3.2, 10.4, 7.4,
13.2, 4.4), Avg_number_4m = c(12, 15.4, 1.6, 9.2, 3, 11.2, 2,
2.4, 2.6, 1.8, 1.8, 2, 1.6, 12, 15, 1.6, 9.2, 3, 13.8, 0.8)), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15", "16", "17", "18", "19", "20"))
We can very quickly visualize your data using boxplot, and it shows that there are fewer spines at greater heights:
So we load a few libraries to get the data in the correct shape:
library(ggplot2)
library(tidyr)
# Reshape to "long" format: every column except Trees is stacked, with its
# column name stored in Height_levels.
df <- pivot_longer(x, cols = -Trees, names_to = "Height_levels")
Now we visualize for each individual tree how it looks like:
ggplot(df,aes(x=Height_levels,y=value,col=Trees)) + geom_point() +
geom_line(aes(group=Trees)) + theme(legend.position="top")
This tells us two things: we need to adjust for the tree, and then test whether there are differences between the heights. The most straightforward way is to use an ANOVA:
# Additive model: Trees adjusts for tree-to-tree differences, Height_levels
# is the effect being tested.
aovfit <- aov(value ~ Trees + Height_levels, data = df)
summary(aovfit)
Df Sum Sq Mean Sq F value Pr(>F)
Trees 19 877.9 46.20 7.692 8.98e-10 ***
Height_levels 3 588.9 196.31 32.682 2.02e-12 ***
Residuals 57 342.4 6.01
And post-hoc with Tukey:
# Tukey HSD on the fitted model; show only the comparisons between heights.
posthoc <- TukeyHSD(aovfit)
posthoc[["Height_levels"]]
diff lwr upr p adj
Avg_number_2m-Avg_number_1m -3.49 -5.54109 -1.4389103 1.930647e-04
Avg_number_3m-Avg_number_1m -4.77 -6.82109 -2.7189103 4.752523e-07
Avg_number_4m-Avg_number_1m -7.55 -9.60109 -5.4989103 1.182687e-11
Avg_number_3m-Avg_number_2m -1.28 -3.33109 0.7710897 3.586375e-01
Avg_number_4m-Avg_number_2m -4.06 -6.11109 -2.0089103 1.429319e-05
Avg_number_4m-Avg_number_3m -2.78 -4.83109 -0.7289103 3.779450e-03
If you would like, you can also fit a linear model, where the height is a continuous variable, and test it with an anova:
# Strip every non-digit from labels such as "Avg_number_3m" to get the height
# as a number, then fit it as a single continuous term next to Trees.
df$Height <- as.numeric(gsub("[^0-9]", "", as.character(df$Height_levels)))
aov_continuous <- aov(value ~ Trees + Height, data = df)
summary(aov_continuous)
Df Sum Sq Mean Sq F value Pr(>F)
Trees 19 877.9 46.2 7.601 7.74e-10 ***
Height 1 572.6 572.6 94.199 7.78e-14 ***
Residuals 59 358.7 6.1
The coefficients tell you how many fewer spines you get on average by going up 1 m. In this case, the change is about -2.39 per metre:
aov_continuous$coefficients
[...]
Height
-2.393000e+00
I am trying to bootstrap my data , sample of it is below
AveOn AveOff AveLd DWELL_SEC
0.3 0.1 5.9 14
0.3 0.1 5.9 17
0.3 0.1 5.9 9
1.1 1.5 25.3 21
1.1 1.5 25.3 159
1.1 1.5 25.3 14
1.1 1.5 25.3 13
1.1 1.5 25.3 18
1.1 1.5 25.3 26
1.1 1.5 25.3 19
1.1 1.5 25.3 17
1.1 1.5 25.3 24
1.1 1.5 25.3 27
I wrote the following code
library(xlsx)
library(bootstrap)
rawData <- read.xlsx("9660.xlsx")
load<-function(AveLd,AveOff,AveOn,DWELL_SEC)sum((AveLd-AveOff)+AveOn)
bootstrap(rawData,load,10000,replace=true)
I kept Getting this Error
Error in n * nboot : non-numeric argument to binary operator
is there a way to solve it
appreciated your time and help
You are messing up the arguments...
# bootstrap(x, nboot, theta, ..., func = NULL): the statistic goes in `theta`
# and x must be a vector. So resample the row indices and look the rows up
# inside the statistic; `data` is forwarded to it through `...`.
load_stat <- function(idx, data) {
  with(data[idx, ], sum((AveLd - AveOff) + AveOn))
}
bootstrap(seq_len(nrow(rawData)), nboot = 10000, theta = load_stat,
          data = rawData)
For further information have a look at the function help
?bootstrap
This is your data:
rawData = structure(list(AveOn = c(0.3, 0.3, 0.3, 1.1, 1.1, 1.1, 1.1, 1.1,
1.1, 1.1, 1.1, 1.1, 1.1), AveOff = c(0.1, 0.1, 0.1, 1.5, 1.5,
1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5), AveLd = c(5.9, 5.9,
5.9, 25.3, 25.3, 25.3, 25.3, 25.3, 25.3, 25.3, 25.3, 25.3, 25.3
), DWELL_SEC = c(14L, 17L, 9L, 21L, 159L, 14L, 13L, 18L, 26L,
19L, 17L, 24L, 27L)), class = "data.frame", row.names = c(NA,
-13L))
If you want to use the package bootstrap, then you bootstrap the indices of the data frame, and provide the data frame as an additional argument:
# Statistic for one bootstrap replicate: x holds the resampled row indices,
# data is the full data frame. Returns the sum of AveLd - AveOff + AveOn over
# the selected rows.
func <- function(x, data) {
  resampled <- data[x, ]
  with(resampled, sum((AveLd - AveOff) + AveOn))
}
# Bootstrap the row indices; seq_len() stays empty for a zero-row data frame,
# where 1:nrow() would give c(1, 0).
bootstrap(seq_len(nrow(rawData)), nboot = 1000, theta = func, data = rawData)
library(xlsx)
library(bootstrap)
rawData <- read.xlsx("C:\\Users\\TAQWA\\Downloads\\9660.xlsx",1)
#load<-function(AveLd,AveOff,AveOn,DWELL_SEC)
# + + sum((AveLd-AveOff)+AveOn)
#bootstrap(rawData,10000,load())
# Hand-rolled bootstrap: slice [, , i] holds one resample (with replacement)
# of the rows of rawData, for 20 replicates.
# The array is sized from rawData instead of hard-coding 270 x 6, so the
# slice assignment cannot fail on a dimension mismatch.
three_d_array <- array(0, dim = c(nrow(rawData), ncol(rawData), 20))
# The row-index pool is the same for every replicate, so build it once.
candy <- seq_len(nrow(rawData))
for (i in seq_len(dim(three_d_array)[3])) {
  B <- sample(candy, nrow(rawData), replace = TRUE)
  a <- rawData[B, ]
  three_d_array[, , i] <- as.matrix(a)
}