How do I make segments (of my probabilities?) - r

I was wondering if there is a function that can help me with segmentation. Using mixtools (logisregmixEM), I found an optimum of 3 segments with sizes of 2.5%, 40.3% and 57.2%, and I also obtained posterior probabilities. Is there a way to assign each observation to one of three separate segments based on these probabilities, so that I end up with 3 segments of the sizes mentioned above?
For what it's worth, here is some background information on my coefficients and probabilities:
> dput(head(betas))
structure(list(comp1 = c(4.57, 0.08, 0.91, -0.11, 0.09, 0.07),
comp2 = c(2.04, -0.22, 0.19, 0.34, -0.34, -0.01), comp3 = c(0.88,
0.03, 0.42, -0.02, -0.17, -0.01)), row.names = c("beta.0",
"beta.1", "beta.2", "beta.3", "beta.4", "beta.5"), class = "data.frame")
> dput(head(posteriorp))
structure(c(0.06, 0.03, 0, 0.03, 0, 0, 0.61, 0.42, 0.07, 0.41,
0.31, 0.41, 0.33, 0.56, 0.93, 0.56, 0.69, 0.59), .Dim = c(6L,
3L), .Dimnames = list(NULL, c("comp.1", "comp.2", "comp.3")))

Related

How to change the x axis to a time series in ggplot2

I'm trying to replicate the graph provided at https://www.chicagofed.org/research/data/cfnai/current-data since I will be needing graphs for data sets soon that look like this. I'm almost there, I can't seem to figure out how to change the x axis to the dates when using ggplot2. Specifically, I would like to change it to the dates in the Date column. I tried about a dozen ways and nothing is working. The data for this graph is under indexes on the website. Here's my code and the graph where dataSet is the data from the website:
# Question code, reproduced as posted: attempt to replicate the CFNAI chart.
library(ggplot2)
library(reshape2)
library(tidyverse)
library(lubridate)
# Build a long-format data frame: `time` holds index(dataSet) and the remaining
# columns come from melt() applied to the data.
# NOTE(review): index() is presumably zoo::index()/xts — no package loaded
# above is known to provide it; confirm.
df = data.frame(time = index(dataSet), melt(as.data.frame(dataSet)))
df
str(df)
# Convert the data1.Date column to class Date.
# NOTE(review): assumes the melted frame really has a column with this name —
# check the str(df) output.
df$data1.Date = as.Date(as.character(df$data1.Date))
str(df)
# Stacked areas per variable plus a line through the per-x sum of the values.
# The x aesthetic is `time` (the index), not the converted date column —
# presumably why the axis does not show dates.
replicaPlot1 = ggplot(df, aes(x = time, y = value)) +
geom_area(aes(colour = variable, fill = variable)) +
stat_summary(fun = sum, geom = "line", size = 0.4) +
labs(title = "Chicago Fed National Activity Index (CFNAI) Current Data")
# The result of adding the scale is not assigned, so replicaPlot1 itself is
# unchanged; the final line prints the plot without this scale.
replicaPlot1 + scale_x_continuous(name = "time", breaks = waiver(), labels = waiver(), limits =
df$data1.Date)
replicaPlot1
Any sort of help on this would be very much appreciated!
G:\BOS\Common\R-Projects\Graphs\Replica of Chicago Fed National Acitivty index (PCA)\dataSet
Not sure what's your intention with data.frame(time = index(dataSet), melt(as.data.frame(dataSet))). When I download the data and read via readxl::read_excel I got a nice tibble with a date(time) column which after reshaping via tidyr::pivot_longer could easily be plotted and by making use of scale_x_datetime has a nicely formatted date axis:
Using just the first 20 rows of data try this:
# Reprex: `df` must already exist; it is defined in the DATA section below.
library(ggplot2)
library(readxl)
library(tidyr)
# Reshape to long format: every column except Date is stacked, with the former
# column names stored in `variable` and the numbers in the default `value`
# column used by aes() below.
df <- pivot_longer(df, -Date, names_to = "variable")
# Stacked areas per variable plus a line through the per-date sum of values.
ggplot(df, aes(x = Date, y = value)) +
geom_area(aes(colour = variable, fill = variable)) +
stat_summary(fun = sum, geom = "line", size = 0.4) +
labs(title = "Chicago Fed National Activity Index (CFNAI) Current Data") +
# Date is POSIXct in the data below, so the datetime scale is the matching one.
scale_x_datetime(name = "time")
#> Warning: Removed 4 rows containing non-finite values (stat_summary).
#> Warning: Removed 4 rows containing missing values (position_stack).
Created on 2021-01-28 by the reprex package (v1.0.0)
DATA
# Data downloaded from https://www.chicagofed.org/~/media/publications/cfnai/cfnai-data-series-xlsx.xlsx?la=en
# df <- readxl::read_excel("cfnai-data-series-xlsx.xlsx")
# dput(head(df, 20))
df <- structure(list(Date = structure(c(
-87004800, -84412800, -81734400,
-79142400, -76464000, -73785600, -71193600, -68515200, -65923200,
-63244800, -60566400, -58060800, -55382400, -52790400, -50112000,
-47520000, -44841600, -42163200, -39571200, -36892800
), tzone = "UTC", class = c(
"POSIXct",
"POSIXt"
)), P_I = c(
-0.26, 0.16, -0.43, -0.09, -0.19, 0.58, -0.05,
0.21, 0.51, 0.33, -0.1, 0.12, 0.07, 0.04, 0.35, 0.04, -0.1, 0.14,
0.05, 0.11
), EU_H = c(
-0.06, -0.09, 0.01, 0.04, 0.1, 0.22, -0.04,
0, 0.32, 0.16, -0.2, 0.34, 0.06, 0.17, 0.17, 0.07, 0.12, 0.12,
0.15, 0.18
), C_H = c(
-0.01, 0.01, -0.05, 0.08, -0.07, -0.01,
0.12, -0.11, 0.1, 0.15, -0.04, 0.04, 0.17, -0.03, 0.05, 0.08,
0.09, 0.05, -0.06, 0.09
), SO_I = c(
-0.01, -0.07, -0.08, 0.02,
-0.16, 0.22, -0.08, -0.07, 0.38, 0.34, -0.13, -0.1, 0.08, -0.07,
0.06, 0.07, 0.12, -0.3, 0.35, 0.14
), CFNAI = c(
-0.34, 0.02, -0.55,
0.04, -0.32, 1, -0.05, 0.03, 1.32, 0.97, -0.46, 0.39, 0.38, 0.11,
0.63, 0.25, 0.22, 0.01, 0.49, 0.52
), CFNAI_MA3 = c(
NA, NA, -0.29,
-0.17, -0.28, 0.24, 0.21, 0.33, 0.43, 0.77, 0.61, 0.3, 0.1, 0.29,
0.37, 0.33, 0.37, 0.16, 0.24, 0.34
), DIFFUSION = c(
NA, NA, -0.17,
-0.14, -0.21, 0.16, 0.11, 0.17, 0.2, 0.5, 0.41, 0.28, 0.2, 0.32,
0.36, 0.32, 0.33, 0.25, 0.31, 0.47
)), row.names = c(NA, -20L), class = c(
"tbl_df",
"tbl", "data.frame"
))

Plot in R with different pch's

This is my data, and I need to plot:
data=structure(c(0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09,
0.1, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2,
0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3, 0.31,
0.32, 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4, 0.41, 0.42,
0.43, 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5, 0.51, 0.52, 0.53,
0.54, 0.55, 0.56, 0.57, 0.58, 0.59, 0.6, 0.61, 0.62, 0.63, 0.64,
0.65, 0.66, 0.67, 0.68, 0.69, 0.7, 0.71, 0.72, 0.73, 0.74, 0.75,
0.76, 0.77, 0.78, 0.79, 0.8, 0.81, 0.82, 0.83, 0.84, 0.85, 0.86,
0.87, 0.88, 0.89, 0.9, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97,
0.98, 0.99, -4.29168871465397, -3.11699074587972, 1.09152409255126,
1.55755175826356, -0.172913268677486, 0.138305902738217, -0.38707713636532,
0.0638896647028127, 0.838910810102289, 0.943154102106711, 1.10825647675154,
1.26151733689579, 0.95610404139547, 1.13671597066802, 1.06145162449853,
1.22015975232484, 1.47211564748976, 1.43575780356999, 1.84397139393396,
1.76431139003358, 1.59262327273733, 1.74799121927712, 1.60092115463811,
1.91302749514369, 1.69691050471565, 1.73871696181996, 1.70008388736007,
1.62139419455853, 2.03803222390097, 1.95654400666235, 2.14213709053145,
2.20797610828818, 2.43019994960532, 2.43201814098108, 1.80396697393168,
2.22800019319471, 2.07590961781243, 1.93938306553876, 1.95940985069043,
2.01357121475676, 1.97530323680977, 1.80327169854223, 2.36734705989908,
2.44766094824079, 2.75792381459726, 2.77274665368527, 2.49888229303308,
2.31540449224314, 2.6409962540336, 2.43729957198807, 2.63155885389867,
2.53653088267223, 2.36871141172942, 2.54858578120089, 2.69802567434559,
3.09606341962321, 3.08856133175863, 3.18997559061186, 3.36005160648579,
3.56895022380044, 3.73753226001724, 3.74662085372188, 4.01296134301718,
4.07267448537225, 3.88165588983999, 3.7369314477271, 3.23912007937852,
3.31721703890831, 3.21894991022748, 3.48377059081018, 3.32624243338278,
3.31970136033168, 3.33053692253337, 3.34467916673038, 3.236168836409,
2.93429043790414, 2.9303837626847, 3.15769722112212, 3.75496410153913,
3.60526854720219, 3.82913260531081, 4.12105540857576, 4.00407286724511,
3.86329120505831, 4.01282715673454, 4.27078090625557, 3.57982245847814,
3.42938648057264, 3.04047099021105, 3.22396221972667, 4.4317374989557,
4.55399628631069, 4.51384672365535, 5.19575483872483, 4.77975901314362,
3.67143455937258, 4.83321942758713, 5.82353153779422, 5.4721995802281,
0.209205679527393, 0.36810747913542, 0.767214115569449, 0.631134464438132,
0.950471080949761, 0.955883872576242, 0.861939569072133, 0.978322788509546,
0.650739708163536, 0.609454620741533, 0.416316714902356, 0.424390227854642,
0.509471258981771, 0.45111061569788, 0.482703338045896, 0.415503380452312,
0.281397009944395, 0.312633722543431, 0.172403050166603, 0.157569155616774,
0.223315461391016, 0.134712102225702, 0.187843250166637, 0.109294406499708,
0.115163596824693, 0.138462578171918, 0.119131458337016, 0.174760537513378,
0.060100726330413, 0.0724953102167094, 0.0727020992861007, 0.0538763524104828,
0.0305519665256373, 0.0458544145004334, 0.13222239331969, 0.062914362547982,
0.0997526784831062, 0.11462977656091, 0.116582141802293, 0.0986337165111772,
0.136226138825677, 0.168342590268618, 0.0716128991576213, 0.0676036354494944,
0.0357838762803169, 0.0334279079582225, 0.0610644117339305, 0.0616823286482187,
0.0660736255131733, 0.104368782129991, 0.0705141118177286, 0.0778176025258217,
0.108146014569371, 0.125671355892738, 0.0590267483041353, 0.0294699796128093,
0.0338205013760269, 0.0269159737669502, 0.0134643988629253, 0.00867709725404753,
0.00493722923021656, 0.00323813401160211, 0.000497278521965683,
0.000424360028534299, 0.000603507667276793, 0.00192008642195063,
0.00578745302404915, 0.00632637091749721, 0.0036673526900235,
0.00322317560117313, 0.00315464572099522, 0.00890662685249866,
0.00630278028858244, 0.00172069402847441, 0.00297661131713389,
0.00907593497087, 0.00794661797866469, 0.00360198056893646, 0.000913572843050492,
0.000952621690864408, 0.000214234772719202, 4.55598611162067e-05,
2.0600933563486e-05, 0.00014372066333701, 3.00102200614383e-05,
1.97046007623936e-05, 0.000349337120439941, 0.00580915934418336,
0.0186446024343607, 0.0455194395151208, 0.0067650312952201, 0.00903110379061256,
0.0210099376843247, 0.0126330025977033, 0.0735408204027586, 0.158374400655879,
0.0970807294810527, 0.0643407704341705, 0.408677400389109), .Dim = c(99L,
3L), .Dimnames = list(NULL, c("betas.position", "coef", "pvalue"
)))
I need to plot a graph like this: plot(data[,1],data[,2], pch=8)
When the p-value (data[,3]) is bigger than 0.10, pch should be empty(a line).
I believe that I have to construct some rule, but I am not able to do this so far.
Use an ifelse, which returns a vector which here is either 1 or 2 depending on the value of data[,3]:
plot(data[,1],data[,2],pch=ifelse(data[,3]>0.10,1,2))
so pch=1 for data[,3]>0.10 and pch=2 otherwise. Adjust these for whichever symbols you want, or use NA for nothing. You can use similar logic for setting the symbol size with the cex= parameter.
The below will remove the points you don't want from your chart:
# Convert the matrix to a data frame so columns can be addressed by name.
data <- as.data.frame(data)
# Keep only the points that should be drawn. The question asks for no symbol
# when the p-value is bigger than 0.10, so the rows to plot are those with
# pvalue <= 0.10 (filtering on `pvalue > 0.1` would keep exactly the rows that
# are meant to be hidden).
keep <- data$pvalue <= 0.1
# Column 1 is the x position and column 2 the coefficient; label the axes with
# the column names instead of the deparsed subsetting expressions.
plot(data[keep, 1], data[keep, 2], pch = 8,
     xlab = names(data)[1], ylab = names(data)[2])
I'm not sure what you mean by "empty (a line)". If you want to overlay different plot types you should consider ggplot2. It has far more functionality than the Base R plots.

Understanding and implementing numerical integration with a quantile function in R

I need to calculate this integral below, using R:
The q_theta(x) function I managed to do in R with quantile regression (package: quantreg).
matrix=structure(c(0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09,
0.1, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2,
0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3, 0.31,
0.32, 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4, 0.41, 0.42,
0.43, 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5, 0.51, 0.52, 0.53,
0.54, 0.55, 0.56, 0.57, 0.58, 0.59, 0.6, 0.61, 0.62, 0.63, 0.64,
0.65, 0.66, 0.67, 0.68, 0.69, 0.7, 0.71, 0.72, 0.73, 0.74, 0.75,
0.76, 0.77, 0.78, 0.79, 0.8, 0.81, 0.82, 0.83, 0.84, 0.85, 0.86,
0.87, 0.88, 0.89, 0.9, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97,
0.98, 0.99, -22.2830664155772, -22.2830664155772, -19.9298291765612,
-18.2066426767652, -15.2657135034479, -14.921522915965, -13.5035945028536,
-13.1557269916064, -12.9495709618481, -11.6168348488161, -11.3999095021713,
-10.6962766764396, -10.0588239375837, -9.12944363439522, -8.15648778610587,
-8.04133299299019, -7.66558386420434, -7.50906566627427, -6.95626096568998,
-6.90630556403136, -6.53374879831376, -6.39324677042686, -6.20705804899049,
-6.09754765999465, -5.91272058217526, -5.75771166206242, -5.3770131257001,
-5.20892464393192, -5.07372162687422, -4.96706814289334, -4.64404095131293,
-4.1567394053577, -4.13209444755342, -3.85483644113723, -3.64855238293205,
-3.53054113507559, -3.46035383338799, -3.03155417364444, -2.93100183005178,
-2.90491824855193, -2.64056616049773, -2.51857727614607, -2.25163805172486,
-2.00934783937474, -1.89925824841417, -1.71405007411747, -1.65905834683964,
-1.47502511311988, -1.42755073292529, -1.20464216637298, -1.08574103345057,
-0.701134735371922, -0.590656010656201, -0.290335898959635, -0.0575062007348038,
0.0778328375033378, 0.165234593185889, 0.230651883848336, 0.316817885358695,
0.34841775605248, 0.516869604496075, 0.59743162507581, 0.857843937404964,
0.939734010162078, 1.12533017928147, 1.27037182428776, 1.52040854525927,
1.76577933448152, 2.07456447851822, 2.17389787235523, 2.27567786362425,
2.3850323163509, 2.55365596853891, 2.61208242890655, 2.77359226593771,
2.93275094039929, 3.07968072488942, 3.0822647851901, 3.26452177629061,
3.46223321951649, 3.66011832966054, 3.85710605543097, 4.05385887531972,
4.83943843494744, 5.05864734149161, 5.25501778319145, 5.38941130574907,
5.88571117751377, 6.5116611852713, 6.98632496342285, 7.21816245728101,
7.73244825971004, 7.80401007592906, 8.34648625541999, 9.83184090479964,
10.8324874884172, 11.3060100107816, 12.3048113953808, 13.1300123358331
), .Dim = c(99L, 2L), .Dimnames = list(NULL, c("Theta", "q(x)_(Theta)"
)))
This is the q_theta(x) function that I estimated in R. My questions are:
a> If x is a standard normal distribution this integral is zero; Right?
b> Otherwise, as in my case, the integral is not zero. How do I treat q_{1-Theta}(x)? Is it simply sort(matrix[,"q(x)_(Theta)"],decreasing=TRUE)?
And the integration would be:
sintegral(thau[1:50], (matrix[,"q(x)_(Theta)"][1:50] - sort(matrix[,"q(x)_(Theta)"],TRUE)[1:50])[1:50])$value
The median would be a common point of these two functions. Right?
Thanks.
Recall your previous post Building a function by defining X and Y and then Integrating in R, we build a linear interpolation function
## note `rule = 2` to enable "extrapolation";
## otherwise `rule = 1` gives `NA` outside [0.01, 0.5]
integrand <- approxfun(mat[, 1], y, rule = 2)
Then we can perform numeric integration on [0, 0.5]:
integrate(integrand, lower = 0, upper = 0.5)
# -5.594405 with absolute error < 4e-04
Now for a>, let's have a proof first.
Note, your quantile function is not for normal distribution, so this result does not hold. You can actually verify this
quant <- approxfun(mat[, 1], mat[, 2], rule = 2)
integrate(quant, lower = 0, upper = 0.5)
# -3.737973 with absolute error < 0.00029
Compared with previous integration result -5.594405, the difference is not a factor of 2.

Building a function by defining X and Y and then Integrating in R

I need to construct a function with x values coming from the first column of this matrix below and y values coming from the second column from the same matrix, with the purpose of later calculating the integral in the desired range.:
matrix=structure(c(0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09,
0.1, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2,
0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3, 0.31,
0.32, 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4, 0.41, 0.42,
0.43, 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5, 0.51, 0.52, 0.53,
0.54, 0.55, 0.56, 0.57, 0.58, 0.59, 0.6, 0.61, 0.62, 0.63, 0.64,
0.65, 0.66, 0.67, 0.68, 0.69, 0.7, 0.71, 0.72, 0.73, 0.74, 0.75,
0.76, 0.77, 0.78, 0.79, 0.8, 0.81, 0.82, 0.83, 0.84, 0.85, 0.86,
0.87, 0.88, 0.89, 0.9, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97,
0.98, 0.99, -7.38512004893287, -7.38512004893287, -6.4788834441613,
-5.63088940915783, -4.83466644123448, -4.68738146949482, -4.28638930290018,
-4.22411786604579, -3.59136848943044, -3.51706359680799, -3.39972014575003,
-3.28609348968074, -3.08569873266253, -2.99764447889508, -2.89470597729108,
-2.77488515429677, -2.67019029728821, -2.54646363628509, -2.48474483938047,
-2.30542896070156, -2.22485510301423, -2.16689229344011, -2.10316315192181,
-2.05135466960309, -1.90942757945567, -1.87863626704201, -1.82507998490407,
-1.75875817642096, -1.6919717645629, -1.62396997031953, -1.56159595204983,
-1.52152738173419, -1.46478394989911, -1.4590555309334, -1.21744398902807,
-1.21731951113139, -1.15003007559406, -1.07321513324935, -0.993364510081357,
-0.924402354306976, -0.885939210442384, -0.831155619244629, -0.80947326709303,
-0.786842719842383, -0.743834513319968, -0.721194178931262, -0.593033922802471,
-0.514780082129033, -0.50717184901095, -0.44223827942003, -0.403514759789576,
-0.296251921664, -0.204238424399985, -0.1463212643028, -0.0982036017275267,
-0.0705262020944892, 0.0275436976821241, 0.0601977432996216,
0.114959963559268, 0.182222546319913, 0.236503724954577, 0.272244043950984,
0.325188234828891, 0.347862804414816, 0.438932719815686, 0.630570414177834,
0.805087251137292, 0.904903847087405, 0.940702374334727, 0.958351604371838,
1.03920208406121, 1.25808734990267, 1.32634708210007, 1.34458194173569,
1.42693337001189, 1.55016591141652, 1.5710754638668, 1.61795101580197,
1.62472416407376, 1.70223430572367, 1.86164374636379, 1.94317125269006,
2.03941620499986, 2.12071850455654, 2.17753890907921, 2.22227616630581,
2.45586794615095, 2.66160802425205, 2.83084956697756, 2.94669126521054,
3.04536994227142, 3.09217816201639, 3.42405058020625, 3.45140184734503,
3.67343579954061, 4.64233570345934, 4.87075743677502, 5.27924539262207,
5.56822483595709), .Dim = c(99L, 2L), .Dimnames = list(NULL,
c("x", "y")))
So i would have a function like this:
plot(matrix[,1],matrix[,2])
And then, my idea is to calculate the integral of this function using this code in R:
integrating= function(x) return(myfunction(x));
integrate(integrating, lower=0.08, upper=0.15)
Is it possible?
I tried but it didnt work.
When I looked at the matrix you provided (better to name the variable mat rather than matrix), I found that your x samples are evenly spaced and the y values are monotone and smooth in x. So a simple linear interpolation should be sufficient to model those data.
## read `?approx`
f <- approxfun(mat[, 1], mat[, 2])
Then you can do
integrate (f, lower = 0.08, upper = 0.15)
# -0.2343698 with absolute error < 1.3e-05

Using apply using multiple sources of data?

I'm still in the beginning stages of R but I've gotten a few functions down and now I'm looking for my final "project."
I've created a function that takes each of my four sources of data (different populations) and creates histograms, performs kolmogorov-smirnov tests, and then graphs any significant results for a given row. What I want to do is turn it into an apply function. However, the issue is that my function takes four variables, and I don't know a way to make apply take four sources of data.
# Plot and compare four samples, one numeric vector per population.
#
# Draws, in this order:
#   1. a density-scaled histogram of each sample with its kernel density
#      estimate and a vertical line at the sample mean (both in red);
#   2. a histogram of the pooled values with the four densities overlaid;
#   3. the four empirical CDFs in a single plot.
# It then runs two-sample Kolmogorov-Smirnov tests of `w` against `x`, `y`
# and `z`, prints each result, and re-plots the ECDF pair for every test whose
# p-value is below 0.05.
#
# Args:
#   w, x, y, z: numeric vectors (w = population 1, x = population 2,
#     y = population 3, z = population 4). Missing values are dropped.
#
# Returns:
#   The value of the last evaluated expression; the function is called for its
#   plots and printed test results, not for its return value.
hist_fx <- function(w, x, y, z) {
  # density() stops on missing values, so drop them up front. hist(), ecdf()
  # and ks.test() already discard NAs, so their results are unaffected.
  w <- w[!is.na(w)]
  x <- x[!is.na(x)]
  y <- y[!is.na(y)]
  z <- z[!is.na(z)]

  # Per-population histograms. The arguments are passed under their own names
  # so the default titles ("Histogram of w", ...) identify the population.
  hist(w, probability = TRUE, col = "green", xlim = c(-1, 1), ylim = c(0, 3))
  lines(density(w), col = "red")
  abline(v = mean(w), col = "red")

  hist(x, probability = TRUE, col = "blue", xlim = c(-1, 1), ylim = c(0, 3))
  lines(density(x), col = "red")
  abline(v = mean(x), col = "red")

  hist(y, probability = TRUE, col = "yellow", xlim = c(-1, 1), ylim = c(0, 3))
  lines(density(y), col = "red")
  abline(v = mean(y), col = "red")

  hist(z, probability = TRUE, col = "purple", xlim = c(-1, 1), ylim = c(0, 3))
  lines(density(z), col = "red")
  abline(v = mean(z), col = "red")

  # Pooled histogram with the individual densities on top. The local name
  # `all` is kept so the default title stays "Histogram of all".
  all <- c(w, x, y, z)
  hist(all, probability = TRUE, xlim = c(-1, 0.5), ylim = c(0, 3))
  lines(density(w), col = "purple")
  lines(density(x), col = "red")
  lines(density(y), col = "blue")
  lines(density(z), col = "green")

  # All four empirical CDFs in one plot.
  plot(ecdf(w), col = "green")
  plot(ecdf(x), col = "blue", add = TRUE)
  plot(ecdf(y), col = "red", add = TRUE)
  plot(ecdf(z), col = "purple", add = TRUE)

  # Two-sample KS tests of population 1 against each other population.
  t1 <- ks.test(w, x)
  print(t1)
  t2 <- ks.test(w, y)
  print(t2)
  t3 <- ks.test(w, z)
  print(t3)

  # Re-plot the ECDF pair for every significant difference (5% level).
  if (t1$p.value < 0.05) {
    plot(ecdf(w), col = "green")
    plot(ecdf(x), col = "blue", add = TRUE)
  }
  # Was `t2p.value`, an undefined object; the p-value is the `p.value`
  # component of the test result.
  if (t2$p.value < 0.05) {
    plot(ecdf(w), col = "green")
    plot(ecdf(y), col = "red", add = TRUE)
  }
  if (t3$p.value < 0.05) {
    plot(ecdf(w), col = "green")
    plot(ecdf(z), col = "purple", add = TRUE)
  }
}
I'm able to use this function with apply for one population at a time (i.e. turn hist_fx into a function of one variable). However, I can't find a way to make this work for all four populations at the same time. I've messed around with some for loops, though they haven't been successful as of yet.
One last thing that might be of use: my data is arranged such that independent variables are the rows and the dependent variables are columns. Consequently, I need to run these per row (hence my idea of a for loop).
EDIT:
Here's the dput for one of the populations:
dput(k2)
structure(c(-0.15, 0.13, 0.23, -0.23, 0.06, -0.11, 0.107, 0.06,
-0.17, 0.12, 0.06, -0.25, -0.32, 0.13, 0.06, -0.2, -0.08, 0.06,
0.12, 0.02, 0.11, -0.11, -0.15, 0.097, 0.347, -0.307, 0.097,
-0.047, 0.09, 0.01, -0.217, 0.117, 0.03, -0.3, -0.33, 0.13, 0.19,
-0.24, -0.08, -0.01, 0.15, 0.61, 0.18, -0.15, -0.103, 0.135,
0.31, -0.25, 0.157, -0.105, -0.08, 0.01, -0.165, 0.17, 0.1, -0.23,
-0.28, 0.15, 0.13, -0.14, -0.06, 0.01, 0.07, -0.02, 0.11, -0.06,
-0.123, 0.13, 0.35, -0.27, 0.165, -0.065, 0.135, 0.13, -0.17,
0.135, 0.08, -0.21, -0.25, 0.2, 0.16, -0.18, NA, -0.04, 0.05,
-0.02, 0.13, -0.14, -0.13, 0.098, 0.27, -0.193, 0.062, -0.08,
0.057, 0.028, -0.199, 0.1, 0.04, -0.24, -0.32, 0.13, 0.13, -0.15,
-0.05, 0.01, 0.08, -0.04, 0.1, -0.1, -0.14, 0.154, 0.261, -0.194,
0.1, -0.129, 0.063, 0.142, -0.136, 0.136, 0.08, -0.23, -0.24,
0.12, 0.1, -0.16, -0.06, 0.04, 0.09, -0.01, 0.04, -0.08, -0.127,
0.133, 0.337, -0.06, 0.11, -0.107, 0.16, 0.167, -0.183, 0.103,
0.05, -0.2, -0.3, 0.22, -0.01, -0.17, -0.14, 0.02, 0.07, 0.01,
0.11, -0.11, -0.155, 0.221, 0.22, -0.172, 0.09, -0.15, 0.12,
0.03, -0.153, 0.146, 0.11, -0.2, -0.24, 0.16, 0.07, -0.19, -0.1,
0.03, 0.17, 0.02, 0.09, -0.16, -0.062, 0.19, 0.269, -0.265, 0.118,
-0.11, 0.126, 0.094, -0.186, 0.151, 0.08, -0.26, -0.31, 0.13,
0.09, -0.23, -0.12, 0.05, 0.13, 0.01, 0.11, -0.14, -0.095, 0.14,
0.24, -0.46, 0.09, -0.17, 0.08, 0.01, -0.24, 0.16, 0.04, -0.38,
-0.39, 0.11, 0.06, -0.31, -0.25, 0.03, 0.21, -0.14, 0, -0.22,
-0.07, 0.148, 0.311, -0.27, 0.11, -0.055, 0.16, 0.04, -0.197,
0.064, 0.09, -0.24, -0.34, 0.17, 0.07, -0.15, -0.18, 0.03, 0.13,
0.07, 0.13, -0.08, -0.136, 0.142, 0.27, -0.257, 0.1, -0.13, 0.103,
0.064, -0.197, 0.118, 0.06, -0.29, -0.35, 0.13, 0.1, -0.19, -0.13,
0.01, 0.1, -0.01, 0.13, -0.15), .Dim = c(22L, 12L))
To further clarify, here's the format of the actual data frame:
c1 c2 c3 c4
r2 x x x
r3 x x x
r4 x x x
Each column represents a star's values for the variable on the row. As such, I want to create a histogram for each row, for each dataset.
For the values of the function, I just used those variables for simplicity's sake. w = population 1, x = population 2, y = population 3, z = population 4.
As for an example:
> hist_fx(k2[1,],n2[1,],j2[1,],g2[1,])
Two-sample Kolmogorov-Smirnov test
data: w and x
D = 1, p-value = 1.229e-05
alternative hypothesis: two-sided
Two-sample Kolmogorov-Smirnov test
data: w and y
D = 1, p-value = 1.229e-05
alternative hypothesis: two-sided
Two-sample Kolmogorov-Smirnov test
data: w and z
D = 1, p-value = 1.229e-05
alternative hypothesis: two-sided
My problem is that currently, I can only run the function one row at a time. I'd like to be able to do it for all rows. I was thinking of using apply because I've used it in a very similar context except only for one source of data.
Not quite sure of your needs but consider transposing, t() to run plots column-wise for row data. And consider using mapply(), the multivariate type of the apply family which runs an operation element-wise at the same time for equal-length objects. Even break apart the operations as running them together may only print/plot the last iteration to screen.
Transpose (data used were slight variations of posted dput matrix)
pop1 <- data.frame(t(data))
pop2 <- data.frame(t(data))
pop3 <- data.frame(t(data))
pop4 <- data.frame(t(data))
Histograms
# Draw one density-scaled histogram per population, each overlaid with its
# kernel density estimate and a vertical line at the sample mean (both red).
#
# Args:
#   w, x, y, z: numeric vectors, one per population; their histograms are
#     filled green, blue, yellow and purple respectively.
#
# Called for its plotting side effects (e.g. via mapply() over matching
# columns of the population data frames).
hist_fx <- function(w, x, y, z) {
  # Each vector is passed to hist() under its own name so the default titles
  # ("Histogram of w", ...) are preserved.

  # Population 1
  hist(w, freq = FALSE, col = "green", xlim = c(-1, 1), ylim = c(0, 3))
  lines(density(w), col = "red")
  abline(v = mean(w), col = "red")

  # Population 2
  hist(x, freq = FALSE, col = "blue", xlim = c(-1, 1), ylim = c(0, 3))
  lines(density(x), col = "red")
  abline(v = mean(x), col = "red")

  # Population 3
  hist(y, freq = FALSE, col = "yellow", xlim = c(-1, 1), ylim = c(0, 3))
  lines(density(y), col = "red")
  abline(v = mean(y), col = "red")

  # Population 4
  hist(z, freq = FALSE, col = "purple", xlim = c(-1, 1), ylim = c(0, 3))
  lines(density(z), col = "red")
  abline(v = mean(z), col = "red")
}
# HISTOGRAM PLOTS FOR EACH DF COLUMN
output <- mapply(hist_fx, w=pop1, x=pop2, y=pop3, z=pop4)
Kolmogorov-Smirnov tests (using slight variations of dput data)
# Compare population 1 with each of the other three populations using
# two-sample Kolmogorov-Smirnov tests, plotting the pair of empirical CDFs for
# every comparison that is significant at the 5% level.
#
# Args:
#   w, x, y, z: numeric vectors, one per population; `w` is the reference that
#     `x`, `y` and `z` are tested against.
#
# Returns:
#   The three test results concatenated with c() into one flat list (w vs x,
#   then w vs y, then w vs z), so component names such as `p.value` repeat.
hist_fx <- function(w, x, y, z) {
  alpha <- 0.05

  ks_wx <- ks.test(w, x)
  ks_wy <- ks.test(w, y)
  ks_wz <- ks.test(w, z)

  # ECDF overlays, drawn only for significant differences.
  if (ks_wx$p.value < alpha) {
    plot(ecdf(w), col = "green")
    plot(ecdf(x), col = "blue", add = TRUE)
  }

  if (ks_wy$p.value < alpha) {
    plot(ecdf(w), col = "green")
    plot(ecdf(y), col = "red", add = TRUE)
  }

  if (ks_wz$p.value < alpha) {
    plot(ecdf(w), col = "green")
    plot(ecdf(z), col = "purple", add = TRUE)
  }

  c(ks_wx, ks_wy, ks_wz)
}
output <- mapply(hist_fx, w=pop1, x=pop2, y=pop3, z=pop4)
output
# X1
# statistic 0.1666667
# p.value 0.9962552
# alternative "two-sided"
# method "Two-sample Kolmogorov-Smirnov test"
# data.name "w and x"
# statistic 0.25
# p.value 0.8474885
# alternative "two-sided"
# method "Two-sample Kolmogorov-Smirnov test"
# data.name "w and y"
# statistic 0.08333333
# p.value 1
# alternative "two-sided"
# method "Two-sample Kolmogorov-Smirnov test"
# data.name "w and z"
# X2
# statistic 0.25
# p.value 0.8474885
# alternative "two-sided"
# method "Two-sample Kolmogorov-Smirnov test"
# data.name "w and x"
# statistic 0.08333333
# p.value 1
# alternative "two-sided"
# method "Two-sample Kolmogorov-Smirnov test"
# data.name "w and y"
# statistic 0.1666667
# p.value 0.9962552
# alternative "two-sided"
# method "Two-sample Kolmogorov-Smirnov test"
# data.name "w and z"
# ...

Resources