I am hoping someone can help me.
I am performing Cramer's V tests on categorical data in R. Here's an example of the code:
# Cramer's V
df1 <- subset(ACCIDENT_MASTER_single, select = c("SEVERITY", "ATMOSPH_COND"))
# Cross-tabulate the two variables first. cramerV() interprets a matrix
# argument as a contingency table of counts, so passing data.matrix(df1)
# (one row per accident, two columns of codes) would not measure the
# association between SEVERITY and ATMOSPH_COND.
severity_by_atmosph <- table(df1$SEVERITY, df1$ATMOSPH_COND)
# Calculate Cramer's V from the contingency table
cramerV(severity_by_atmosph)
I am using Shiny so that a user can select the categorical variables via dropdown menus and then the result of the Cramer's V is displayed. My code works, but interestingly, the results I am getting are completely different, even though I am using the same dataframe. Can anyone tell me why?
Here is an example of the R code using the Shiny package:
library(shinydashboard)
library(shiny)
library(dplyr)
library(DT)
library(rcompanion)
# Plain data-frame copy of the accident data used by the app.
df <- data.frame(ACCIDENT_MASTER_single)

# Categorical columns offered in each of the two dropdown menus.
Cat1.Variables <- c("SEVERITY", "ATMOSPH_COND", "DAY_OF_WEEK")
Cat2.Variables <- c("SEVERITY", "ATMOSPH_COND", "DAY_OF_WEEK")

ui <- fluidPage(
  titlePanel("Calculate the strength of the relationship between categorical variables"),
  sidebarLayout(
    sidebarPanel(
      selectInput("cat1", choices = Cat1.Variables, label = "Select a Categorical Variable:"),
      selectInput("cat2", choices = Cat2.Variables, label = "Select a Categorical Variable:")
    ),
    mainPanel(
      # renderPrint() emits captured console text, so it is paired with
      # verbatimTextOutput() rather than tableOutput().
      verbatimTextOutput("results")
    )
  )
)

server <- function(input, output) {
  # Contingency table of the two selected columns; recomputed whenever
  # either dropdown changes.
  cramerdata <- reactive({
    req(input$cat1, input$cat2)
    table(df[[input$cat1]], df[[input$cat2]])
  })

  # Print Cramer's V for the current contingency table.
  output$results <- renderPrint({
    cat(sprintf("\nThe results equal: \n"))
    print(cramerV(cramerdata()))
  })
}

shinyApp(ui, server)
Also, I have tested this on a number of different variables and all of my results are different, not just for the two variables in this example. Would love some help please!
EDIT: someone suggested I use dput(head(ACCIDENT_MASTER_single)) so a snippet of my results of that are found below (the dataset is very large). I hope this helps!
> dput(head(ACCIDENT_MASTER_single))
structure(list(ACCIDENT_NO = c("T20150000004", "T20150000017",
"T20150000020", "T20150000028", "T20150000034", "T20150000052"
), ACCIDENTDATE = c("2015-01-01", "2015-01-01", "2015-01-01",
"2015-01-01", "2015-01-01", "2015-01-01"), ACCIDENTTIME = c("02:10:00",
"07:20:00", "06:51:00", "07:55:00", "17:10:00", "01:20:00"),
ACCIDENT_TYPE = c(2L, 1L, 4L, 1L, 4L, 1L), DAY_OF_WEEK = c(5L,
5L, 5L, 4L, 5L, 5L), DCA_CODE = c(108L, 130L, 173L, 135L,
171L, 121L), DIRECTORY = c("MEL", "MEL", "MEL", "MEL", "MEL",
"MEL"), LIGHT_CONDITION = c(3L, 1L, 2L, 1L, 1L, 3L), ROAD_GEOMETRY = c(5L,
4L, 1L, 5L, 5L, 1L), SEVERITY = c(3L, 2L, 1L, 3L, 3L, 2L),
SPEED_ZONE = c(60L, 70L, 70L, 100L, 60L, 60L), ROAD_TYPE = c("ROAD",
"ROAD", "ROAD", "ROAD", "ROAD", "DRIVE"), ATMOSPH_COND = c("1",
"1", "1", "1", "1", "1"), ATMOSPH_COND_SEQ = c("1", "1",
"1", "0", "1", "1"), LGA_NAME = c("MOONEE VALLEY", "MONASH",
"BAYSIDE", "BRIMBANK", "MELTON", "BRIMBANK"), DEG_URBAN_NAME = c("MELB_URBAN",
"MELB_URBAN", "MELB_URBAN", "MELB_URBAN", "MELB_URBAN", "MELB_URBAN"
), Lat = c(-37.77922923, -37.88240078, -37.92909811, -37.76758102,
-37.72427767, -37.76316596), Long = c(144.9309415, 145.0903658,
145.0028103, 144.8002374, 144.7529804, 144.7897546), POSTCODE_NO = c(3032L,
3148L, 3186L, 3022L, 3023L, 3023L), Surface.Cond.Desc = c("Dry",
"Dry", "Dry", "Dry", "Dry", "Dry"), SURFACE_COND = c("1",
"1", "1", "1", "1", "1"), SURFACE_COND_SEQ = c("1", "1",
"1", "0", "1", "1"), ROAD_SURFACE_TYPE = c("1", "1,1", "1",
"1,1", "1", "1,1"), VEHICLE_TYPE = c("99", "5,2", "1", "1,62",
"1", "1,1"), TRAFFIC_CONTROL = c("0", "1,1", "0", "0,0",
"0", "1,1"), EVENT_TYPE = c("C", "C", "3,C", "C,3,C,3,C",
"3,C", "C"), SEX = c("M,U", "M,M", "M", "F,U", "M", "M,M,M,F"
), AGE = c("32,NA", "56,43", "28", "54,NA", "23", "17,16,19,41"
), Age.Group = c("30-39,unknown", "50-59,40-49", "26-29",
"50-59,unknown", "22-25", "16-17,16-17,17-21,40-49"), INJ_LEVEL = c("3,4",
"2,3", "1", "3,4", "3", "2,4,4,3"), ROAD_USER_TYPE = c("1,9",
"2,2", "2", "2,2", "2", "3,3,2,2")), row.names = c(NA, 6L
), class = "data.frame")
Thanks
The result is working for me... Try setting the seed also: set.seed(1)
# Contingency table of the two selected columns. cramerV() interprets a
# matrix argument as a table of counts, so the raw codes produced by
# data.matrix() must not be passed to it directly.
cramerdata <- reactive({
  req(input$cat1, input$cat2)
  table(
    ACCIDENT_MASTER_single[[input$cat1]],
    ACCIDENT_MASTER_single[[input$cat2]]
  )
})
# Print Cramer's V for the current selection.
output$results <- renderPrint({
  cat(sprintf("\nThe results equal: \n"))
  print(cramerV(cramerdata()))
})
Related
I am trying to split my data frame into 4 smaller data frames according to the vaccine used and the diagnosis.
Here is the loop I've been trying to use:
# Define loop: create one data frame per vaccine/diagnosis combination,
# restricted to the gene of interest.
gene_of_interest <- "1"
vaccines <- c("A", "B")
diagnosis <- c("Sick", "Healthy")
for (v in vaccines) {
  for (d in diagnosis) {
    # Filter into a fresh variable. Overwriting
    # CDR3_post_challenge_plot_prep here would make every later iteration
    # filter the already-filtered rows, leaving only the first combination
    # with any observations.
    keep <- CDR3_post_challenge_plot_prep$Vaccine == v &
      CDR3_post_challenge_plot_prep$Diagnosis == d &
      CDR3_post_challenge_plot_prep$gene == gene_of_interest
    subset_vd <- CDR3_post_challenge_plot_prep[keep, ]
    # Store the subset under a name such as IgH_CDR3_COI_A_Sick.
    assign(paste0("IgH_CDR3_COI_", v, "_", d), subset_vd)
  }
}
The only data frame with any observations outputted from this loop is the one that satisfies the first conditions, that is, "A_Sick". But I know there should be observations in the other 2 data frames.
Here is some of what the data frame looks like:
structure(list(gene = c("1", "1", "2", "3",
"1", "1"), abundance = c(27L, 15L, 33L, 20L, 20L,
69L), Timepoint2 = c("D0.12h", "D0.12h", "D0.12h", "D0.12h",
"D0.12h", "D0.12h"), Vaccine = structure(c(2L, 3L, 3L, 2L, 3L,
2L), .Label = c("Control", "B", "A"), class = "factor"),
Diagnosis = structure(c(2L, 1L, 2L, 2L, 1L, 1L), .Label = c("Healthy",
"Sick", "UNKNOWN - Not Challenged", "UNKNOWN - Treated prior to meeting diagnostic criteria"
), class = "factor")), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
I have a list and I need to add together elements with different indexes. I'm struggling because I want to create a loop at different indexes.
# Load the aSAH example data and build a ROC curve of outcome against s100b.
data(aSAH)
rocobj <- roc(aSAH$outcome, aSAH$s100b)
# Request threshold, sensitivity and specificity for all points of the curve.
# NOTE(review): the text below indexes dat like a matrix (dat[1, ]), but
# as.list=TRUE asks for a list — confirm the actual return shape against the
# installed pROC version.
dat<-coords(rocobj, "all", ret=c("threshold","sensitivity", "specificity"), as.list=TRUE)
I want to create a function where I can look at all the sensitivity/1-specificity combos at all thresholds in a new data frame. I know threshold is found in dat[1,], sensitivity is found in dat[2,] and specificity is found in dat[3,]. So I tried:
# Print, for every threshold, the threshold and sensitivity / (1 - specificity).
# Assumes dat is a matrix with rows threshold, sensitivity, specificity and
# one column per threshold, as described in the text — TODO confirm.
# seq_len(ncol(dat)) walks every column; length(dat) would yield a single
# (out-of-range) index instead of a sequence.
for (i in seq_len(ncol(dat))) {
  print(dat[1, i])
  print(dat[2, i] / (1 - dat[3, i]))
}
Where I should end up with a dataframe that has threshold and sensitivity/1-specificity.
DATA
dput(head(aSAH))
structure(list(gos6 = structure(c(5L, 5L, 5L, 5L, 1L, 1L), .Label = c("1",
"2", "3", "4", "5"), class = c("ordered", "factor")), outcome = structure(c(1L,
1L, 1L, 1L, 2L, 2L), .Label = c("Good", "Poor"), class = "factor"),
gender = structure(c(2L, 2L, 2L, 2L, 2L, 1L), .Label = c("Male",
"Female"), class = "factor"), age = c(42L, 37L, 42L, 27L,
42L, 48L), wfns = structure(c(1L, 1L, 1L, 1L, 3L, 2L), .Label = c("1",
"2", "3", "4", "5"), class = c("ordered", "factor")), s100b = c(0.13,
0.14, 0.1, 0.04, 0.13, 0.1), ndka = c(3.01, 8.54, 8.09, 10.42,
17.4, 12.75)), .Names = c("gos6", "outcome", "gender", "age",
"wfns", "s100b", "ndka"), row.names = 29:34, class = "data.frame")
EDIT
One answer:
# Transpose dat so each threshold becomes a row, convert it to a data frame,
# then add the sensitivity / (1 - specificity) column.
dat_transform <- dat %>%
  t() %>%
  as.data.frame() %>%
  mutate(new = sensitivity / (1 - specificity))
You can use :
# Add the ratio column, then keep columns 1 and 4.
# presumably column 1 is threshold and column 4 is the new res — verify
with_ratio <- transform(t, res = sensitivity / (1 - specificity))
with_ratio[c(1, 4)]
Or with dplyr :
library(dplyr)
# Add the ratio column and keep only threshold and res.
select(
  mutate(t, res = sensitivity / (1 - specificity)),
  threshold, res
)
Also note that t() is a base R function that transposes a matrix or data frame, so it is better to use a different variable name for the data frame.
I have a dataset containing 120 observations of 6 variables. Five variables are factors, 1 variable is my target variable.
I need to write a function that creates a matrix (for each factor) which contains each level of the factor as columns, the maximum value of the target variable as the first row, and the minimum value of the target variable as the second row.
I know how to create a matrix, however I am lost when I need to make it through a function.
Is there someone who can help?
Here is a simple example of what I want to reach with a fictive easy dataset.
Example
As you can see, for each level of the factor (on the picture factor 1), I want to indicate the highest value of the target, and the lowest value of the target.
Here is a subset of my own data:
> dput(data_plu[1:4, ])
structure(list(NaNO3 = structure(c(2L, 8L, 8L, 3L), .Label = c("10",
"14", "18", "2", "22", "26", "30", "6"), class = "factor"),
CaCl2 = structure(c(4L,
8L, 8L, 8L), .Label = c("0.1", "0.28", "0.46", "0.64", "0.82",
"1", "1.19", "1.37"), class = "factor"), PO4 = structure(c(1L,
5L, 5L, 6L), .Label = c("0.1", "0.8", "1.5", "2.2", "2.9", "3.6",
"4.3", "5"), class = "factor"), NH4Cl = structure(c(5L, 3L, 3L,
6L), .Label = c("0.5", "10.86", "12.93", "15", "2.58", "4.65",
"6.72", "8.79"), class = "factor"), MgSO4 = structure(c(4L, 7L,
1L, 7L), .Label = c("0.21", "0.35", "0.5", "0.64", "0.79", "0.93",
"1.08", "1.22"), class = "factor"), DC = c(15000L, 707500L, 720000L,
872500L)), row.names = c(NA, 4L), class = "data.frame")
You may be able to modify this to meet your needs. I wrote a function to handle one factor and then use lapply to handle them all. I've called your sample data dta:
# Minimum and maximum of y within each group of x.
#
# x: grouping vector (e.g. a factor column).
# y: values to summarise, the same length as x.
# Returns a 2-row matrix with rows "Min" and "Max" and one column per group
# of x that actually occurs; the column dimension is named "Levels".
stats <- function(x, y) {
  # range() returns c(min, max), so aggregate() yields one row per group
  # with a two-column matrix of results.
  by_level <- aggregate(y, list(x), range)
  result <- t(as.matrix(by_level[, -1]))
  dimnames(result) <- list(
    c("Min", "Max"),
    Levels = as.character(by_level[, 1])
  )
  result
}
# Apply stats() to every column except the sixth, always summarising DC.
out <- lapply(dta[-6], stats, y = dta$DC)
# Show the result for the first factor only.
head(out, 1)
# $NaNO3
# Levels
# 14 18 6
# Min 15000 872500 707500
# Max 15000 872500 720000
I have produced two different plots based on two different models: model and model1. Please find enclosed My Data below. I have attached the two plots:
Model
Model1
I wish to merge the two plots and keep the confidence bands at the same time. I have tried several solutions, e.g. rbind, but that does not seem to work - please see below.
I have used the following scripts to produce the two plots
# Fit the same model formula (survival outcome against a spline of test) on
# the two data subsets n and n1.
# NOTE(review): cph/rcs/Predict are presumably from the rms package — confirm
# which packages are loaded.
model <- cph(Surv(os.neck,mors)~rcs(test),data=n)
model1 <- cph(Surv(os.neck,mors)~rcs(test),data=n1)
# Plot each model's predictions with exp() applied, using a different band
# fill colour per model.
j <- ggplot(Predict(model, fun=exp), colfill = "blue")
k <- ggplot(Predict(model1, fun=exp), colfill = "yellow")
I have tried rbind:
# Predictions for each model with exp() applied.
# NOTE(review): if this is rms::Predict, conf.int is documented as a
# confidence level (default 0.95); verify that TRUE gives the intended limits.
e <- Predict(model, fun=exp, conf.int = TRUE)
f <- Predict(model1, fun=exp, conf.int = TRUE)
# Stack both prediction sets and plot them together (reuses the name j).
j <- ggplot(rbind(e,f))
Which gave this:
rbind()
My data:
# Split w by the stadie column: rows with stadie 1-2 go to n, rows with
# stadie 3-5 go to n1.
n <- subset(w, w$stadie %in% 1:2)
n1 <- subset(w, w$stadie %in% 3:5)
The requested dput(out) from the comments
w <- structure(list(model = c("1", "1", "1", "1", "1", "1", "1", "1",
"1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "2",
"2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2",
"2", "2", "2", "2", "2", "2"), test = c(0.0438735177865613, 0.0465676207122569,
0.0492617236379526, 0.0519558265636483, 0.0546499294893439, 0.0573440324150396,
0.0600381353407353, 0.062732238266431, 0.0654263411921266, 0.0681204441178223,
0.070814547043518, 0.0735086499692136, 0.0762027528949093, 0.078896855820605,
0.0815909587463007, 0.0842850616719963, 0.086979164597692, 0.0896732675233877,
0.0923673704490833, 0.095061473374779, 0.05, 0.0530569514237856,
0.0561139028475712, 0.0591708542713568, 0.0622278056951424, 0.065284757118928,
0.0683417085427136, 0.0713986599664992, 0.0744556113902848, 0.0775125628140703,
0.0805695142378559, 0.0836264656616415, 0.0866834170854271, 0.0897403685092127,
0.0927973199329983, 0.0958542713567839, 0.0989112227805695, 0.101968174204355,
0.105025125628141, 0.108082077051926), yhat = c(0.715524721809984,
0.72420520893997, 0.732895287854242, 0.741495950465592, 0.749903690905934,
0.758010700841758, 0.765705214141122, 0.772872009692537, 0.779393079520142,
0.785148467039571, 0.79001727733411, 0.793878857700365, 0.796614142441177,
0.798107151024956, 0.798246668871875, 0.796979824770716, 0.794412433838086,
0.790683064226291, 0.785933397797749, 0.780306386213083, 1.24887346414771,
1.12142387236568, 1.00744333341272, 0.906978784944319, 0.819807522848923,
0.745379660125369, 0.682977886151413, 0.631846830283734, 0.591296955987878,
0.560790614744859, 0.53975355731851, 0.52685030147002, 0.520878199524915,
0.520957917193064, 0.526437601275528, 0.53682068603444, 0.551708849922178,
0.570754454105439, 0.593618741429514, 0.619933518450193), lower = c(0.445870969928758,
0.472487603995491, 0.498645159577579, 0.523317755828918, 0.545270747924011,
0.563214260495099, 0.576107648755599, 0.583517928079882, 0.585795811114823,
0.583918701876133, 0.579131268180072, 0.572630973080174, 0.565412209767786,
0.558237952034289, 0.551671245622871, 0.546072898734981, 0.541548416151744,
0.538098574671309, 0.535672640626991, 0.534183860233478, 0.613882362074539,
0.611611984419279, 0.601234738035742, 0.579326232945668, 0.543582975437934,
0.496000647093785, 0.443637816386947, 0.39437687025085, 0.353159479619957,
0.321944706132161, 0.30083406381699, 0.288326373517578, 0.282948308375769,
0.283624310505754, 0.289563062775844, 0.300128054614955, 0.314709399887597,
0.332603569457389, 0.352917102130059, 0.374528152852913), upper = c(1.14825961332055,
1.11002527943736, 1.07718984661152, 1.05063556210888, 1.03133268706487,
1.02018052967182, 1.01769951541058, 1.02367230657634, 1.03697151956046,
1.05572593121937, 1.07769573631852, 1.10061046351294, 1.12235654089946,
1.14104571750444, 1.1550316414364, 1.16317224781343, 1.16534569433533,
1.16183119131315, 1.15311341092747, 1.13982862772903, 2.54069024589915,
2.05619172538896, 1.68809618910841, 1.4199434956646, 1.23639702655924,
1.1201413566373, 1.05144055745915, 1.01230687460364, 0.990011907755607,
0.976832690818709, 0.968420593537629, 0.962698059052612, 0.958882208194717,
0.956889594556209, 0.957085290437296, 0.96017831230139, 0.967186411308867,
0.979426190201882, 0.998487202942342, 1.02613799355416), .predictor. = c("test",
"test", "test", "test", "test", "test", "test", "test", "test",
"test", "test", "test", "test", "test", "test", "test", "test",
"test", "test", "test", "test", "test", "test", "test", "test",
"test", "test", "test", "test", "test", "test", "test", "test",
"test", "test", "test", "test", "test", "test", "test"), .set. = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("1", "2"), class = "factor")), .Names = c("model",
"test", "yhat", "lower", "upper", ".predictor.", ".set."), row.names = c("1.1",
"1.2", "1.3", "1.4", "1.5", "1.6", "1.7", "1.8", "1.9", "1.10",
"1.11", "1.12", "1.13", "1.14", "1.15", "1.16", "1.17", "1.18",
"1.19", "1.20", "2.201", "2.202", "2.203", "2.204", "2.205",
"2.206", "2.207", "2.208", "2.209", "2.210", "2.211", "2.212",
"2.213", "2.214", "2.215", "2.216", "2.217", "2.218", "2.219",
"2.220"), class = c("Predict", "data.frame"), info = structure(list(
Design = structure(list(label = structure("Set", .Names = ".set."),
units = structure("", .Names = ".set.")), .Names = c("label",
"units")), varying = ".set.", adjust = structure(list(`1` = NULL,
`2` = NULL), .Names = c("1", "2"))), .Names = c("Design",
"varying", "adjust")))
Thank you in advance,
C.
Here is a basic plot
# One ribbon (lower to upper) and one line (yhat) per model, against test.
plot_data <- as.data.frame(out)
ggplot(plot_data, aes(x = test)) +
  geom_ribbon(aes(ymin = lower, ymax = upper, fill = model), alpha = 0.3) +
  geom_line(aes(y = yhat, colour = model))
We need as.data.frame(out) because out is of class Predict.
You could add another theme, change the fill and color, or add a meaningful title, subtitle, etc. SO is full of examples.
We can use the JCO palette from the ggsci package
library(ggsci)
# Same ribbon-and-line plot, with the JCO palette applied to fill and colour.
plot_data <- as.data.frame(out)
ggplot(plot_data, aes(x = test)) +
  geom_ribbon(aes(ymin = lower, ymax = upper, fill = model), alpha = 0.3) +
  geom_line(aes(y = yhat, colour = model)) +
  scale_fill_jco() +
  scale_color_jco()
To change legend labels do
... +
scale_color_jco(labels = c("A", "B")) +
scale_fill_jco(labels = c("A", "B"))
I am trying to make a ggplot. When I had shape in aesthetics, the code was working just fine. However, I need to put shape in geom_point() because I'm trying to reproduce a figure. And when I added shape to geom_point() it gave me the following error:
Aesthetics must be either length 1 or the same as the data (6): shape
I've looked for other answers here but apparently, nothing seems to be working for me. Above I've provided with an image of what my data looks like. There are 17000 entries.
Below is my code:
# For each chr / Species / chrMark group, compute the median RPKM and the
# median dNdS.
# NOTE(review): the grouping variables are given as "mammals$..." strings, so
# the result's columns carry those literal names (see the backtick-quoted
# `mammals$Species` used below).
summarised_data <-ddply(mammals,c('mammals$chr','mammals$Species','mammals$chrMark'),
function (x) c(median_rpkm = median(x$RPKM), median = median(x$dNdS)))
# Scatter plot of median dNdS against median RPKM with a linear fit per colour.
# NOTE(review): shape is passed to geom_point() outside aes(), i.e. as a fixed
# setting rather than a mapping to the data; this is the call that raises the
# "Aesthetics must be either length 1 or the same as the data" error.
ggplot(summarised_data,aes(x = summarised_data$median_rpkm, y = summarised_data$median,
color = summarised_data$`mammals$Species`)) + geom_smooth(se = FALSE, method = "lm") +
geom_point(shape = summarised_data$`mammals$chrMark`) + xlab("median RPKM") + ylab("dNdS")
"ENSG00000213221", "ENSG00000213341", "ENSG00000213380", "ENSG00000213424",
"ENSG00000213533", "ENSG00000213551", "ENSG00000213619", "ENSG00000213626",
"ENSG00000213699", "ENSG00000213782", "ENSG00000213949", "ENSG00000214013",
"ENSG00000214338", "ENSG00000214357", "ENSG00000214367", "ENSG00000214517",
"ENSG00000214814", "ENSG00000215203", "ENSG00000215305", "ENSG00000215367",
"ENSG00000215440", "ENSG00000215897", "ENSG00000221947", "ENSG00000222011",
"ENSG00000224051", "ENSG00000225830", "ENSG00000225921", "ENSG00000239305",
"ENSG00000239474", "ENSG00000239900", "ENSG00000241058", "ENSG00000242247",
"ENSG00000242612", "ENSG00000243646", "ENSG00000244038", "ENSG00000244045"),
class = "factor"), Species = structure(c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = c("Chimp", "Gori", "Human", "Maca",
"Mouse", "Oran"), class = "factor"), labs = structure(c(2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Chimp-A", "Chimp-X",
"Gori-A", "Gori-X", "Human-A", "Human-X", "Maca-A", "Maca-X",
"Mouse-A", "Mouse-X", "Oran-A", "Oran-X"), class = "factor"),
chrMark = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L), .Label = c("A", "X"), class = "factor"), chr = structure(c(27L,
27L, 27L, 27L, 27L, 27L, 27L, 27L, 27L, 27L), .Label = c("1",
"10", "11", "12", "13", "14", "15", "16", "17", "18", "19",
"2", "20", "21", "22", "2a", "2A", "2b", "2B", "3", "4",
"5", "6", "7", "8", "9", "X"), class = "factor"), dN = c(3.00669,
3.27182, 7.02044, 1.01784, 3.0363, 2.32786, 4.92959, 3.03753,
3.0776, 1.02147), dS = c(3.15631, 5.87147, 3.13716, 2.05438,
4.10205, 5.24764, 4.2014, 3.18086, 5.4942, 3.02169), dNdS = c(0.9525965447,
0.5572403504, 2.2378329444, 0.4954487485, 0.7401908802, 0.4436013141,
1.1733207978, 0.954939859, 0.5601543446, 0.3380459279), RPKM = c(31.6,
13.9, 26.3, 9.02, 11.3, 137, 242, 1.05, 59.4, 10.1), Tau = c(0.7113820598,
0.8391023102, 0.3185943152, 0.6887167806, 0.9120531859, 0.6254200542,
0.7165302682, 0.7257435312, 0.2586613298, 0.6493567251),
GC3 = c(0.615502, 0.622543, 0.393064, 0.490141, 0.461592,
0.626407, 0.490305, 0.482853, 0.346424, 0.466484)), .Names = c("gene",
"Species", "labs", "chrMark", "chr", "dN", "dS", "dNdS", "RPKM",
"Tau", "GC3"), row.names = c(NA, 10L), class = "data.frame")
There are a few things wrong with your code and with how it uses ggplot's non-standard evaluation; I'd recommend reading a ggplot tutorial or the docs. Having columns within summarised_data called 'mammals$Species' and 'mammals$chrMark' is going to cause lots of problems.
If we change these to something more sensible...
# Give the ddply-generated columns syntactic names. The source column is
# `mammals$Species` (capital S); matching on a lower-case "species" would
# silently rename nothing and leave mammals_species undefined.
names(summarised_data)[names(summarised_data) == "mammals$Species"] <- "mammals_species"
names(summarised_data)[names(summarised_data) == "mammals$chrMark"] <- "mammals_chrMark"
We can make the ggplot code more friendly. Note that shape has to be within aes, as you're mapping it to your data.
# Median dNdS against median RPKM: one overall linear fit, with points
# coloured by species and shaped by chromosome mark.
ggplot(summarised_data, aes(x = median_rpkm, y = median)) +
  geom_smooth(method = "lm", se = FALSE) +
  geom_point(aes(colour = mammals_species, shape = mammals_chrMark)) +
  labs(x = "median RPKM", y = "dNdS")
Hopefully this should work, or at least get you somewhere closer to an answer.