I am having a complete brain fart right now. Why is the following code not plotting two lines, one for each category of the variable Ford? To my mind, I have the variable mapped as a grouping variable and then as an aesthetic (col) in both geom_point() and geom_line(). I feel like I'm just overlooking something very basic.
#libraries
library(tidyverse)
#data
structure(list(stressx = c(0, 0.33, 0.67, 1, 0, 0.33, 0.67, 1,
0, 0.33, 0.67, 1, 0, 0.33, 0.67, 1), visiblex = c(0, 0, 0, 0,
1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1), ford = c(0, 0, 0, 0, 0,
0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1), preds = c(0.166275511711196,
0.25404479263251, 0.344473748733258, 0.432243029654572, 0.417891216538386,
0.449861131692899, 0.482799832155125, 0.514769747309638, 0.166275511711196,
0.25404479263251, 0.344473748733258, 0.432243029654572, 0.417891216538386,
0.449861131692899, 0.482799832155125, 0.514769747309638), se.fit =
c(0.0216850668407667,
0.0140669010411715, 0.014932848560481, 0.0233259879905658,
0.0546688696666978,
0.034867400606124, 0.0282122239553816, 0.0418803253364085,
0.0216850668407667,
0.0140669010411715, 0.014932848560481, 0.0233259879905658,
0.0546688696666978,
0.034867400606124, 0.0282122239553816, 0.0418803253364085)), .Names =
c("stressx",
"visiblex", "ford", "preds", "se.fit"), out.attrs = structure(list(
dim = structure(c(4L, 2L, 2L), .Names = c("stressx", "visiblex",
"ford")), dimnames = structure(list(stressx = c("stressx=0.0000000",
"stressx=0.3333333", "stressx=0.6666667", "stressx=1.0000000"
), visiblex = c("visiblex=0", "visiblex=1"), ford = c("ford=0",
"ford=1")), .Names = c("stressx", "visiblex", "ford"))), .Names = c("dim",
"dimnames")), row.names = c(NA, -16L), class = "data.frame")`
My plot
# Recode the numeric codes as labelled factors, give the columns display
# names, then plot Prob against Stress with one colour and one line per Ford
# level, in a separate panel for each Visible group.
newdat %>%
  mutate(
    visiblex = recode_factor(
      visiblex,
      `0` = "Not Visible Minority",
      `1` = "Visible Minority"
    ),
    ford = recode_factor(
      ford,
      `0` = "Disapprove",
      `1` = "Approve"
    ),
    stressx = recode_factor(
      stressx,
      `0` = "Strongly disagree",
      `0.33` = "Somewhat disagree",
      `0.67` = "Somewhat agree",
      `1` = "Strongly agree"
    )
  ) %>% # the pipe has to end the line: a line that starts with %>% does not parse
  rename(Stress = stressx, Visible = visiblex, Ford = ford, Prob = preds) %>%
  # filter(Ford == "Approve") %>%
  ggplot(., aes(x = Stress, y = Prob, group = Ford)) +
  geom_point(aes(col = Ford)) +
  geom_line(aes(col = Ford)) +
  facet_wrap(~Visible) +
  ylim(c(0, 1)) +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5))
It's because you have identical data points for both levels of the factor variable Ford. I have modified your code slightly to show the data and then plotted the data with geom_jitter instead of geom_point and now you can see both data points. Since the underlying datapoints are identical, the lines drawn through those data points are also overlapping and only one of them is visible.
#libraries
library(tidyverse)
#data
newdat <- structure(
list(
stressx = c(0, 0.33, 0.67, 1, 0, 0.33, 0.67, 1,
0, 0.33, 0.67, 1, 0, 0.33, 0.67, 1),
visiblex = c(0, 0, 0, 0,
1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1),
ford = c(0, 0, 0, 0, 0,
0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1),
preds = c(
0.166275511711196,
0.25404479263251,
0.344473748733258,
0.432243029654572,
0.417891216538386,
0.449861131692899,
0.482799832155125,
0.514769747309638,
0.166275511711196,
0.25404479263251,
0.344473748733258,
0.432243029654572,
0.417891216538386,
0.449861131692899,
0.482799832155125,
0.514769747309638
),
se.fit =
c(
0.0216850668407667,
0.0140669010411715,
0.014932848560481,
0.0233259879905658,
0.0546688696666978,
0.034867400606124,
0.0282122239553816,
0.0418803253364085,
0.0216850668407667,
0.0140669010411715,
0.014932848560481,
0.0233259879905658,
0.0546688696666978,
0.034867400606124,
0.0282122239553816,
0.0418803253364085
)
),
.Names =
c("stressx",
"visiblex", "ford", "preds", "se.fit"),
out.attrs = structure(
list(
dim = structure(c(4L, 2L, 2L), .Names = c("stressx", "visiblex",
"ford")),
dimnames = structure(
list(
stressx = c(
"stressx=0.0000000",
"stressx=0.3333333",
"stressx=0.6666667",
"stressx=1.0000000"
),
visiblex = c("visiblex=0", "visiblex=1"),
ford = c("ford=0",
"ford=1")
),
.Names = c("stressx", "visiblex", "ford")
)
),
.Names = c("dim",
"dimnames")
),
row.names = c(NA, -16L),
class = "data.frame"
)
#my plot
# Build the plotting data from newdat: give the columns their display names
# first, then replace the numeric codes with labelled factors.
data <- newdat %>%
  dplyr::rename(
    Stress = stressx,
    Visible = visiblex,
    Ford = ford,
    Prob = preds
  ) %>%
  mutate(
    Stress = recode_factor(
      Stress,
      `0` = "Strongly disagree",
      `0.33` = "Somewhat disagree",
      `0.67` = "Somewhat agree",
      `1` = "Strongly agree"
    ),
    Visible = recode_factor(
      Visible,
      `0` = "Not Visible Minority",
      `1` = "Visible Minority"
    ),
    Ford = recode_factor(
      Ford,
      `0` = "Disapprove",
      `1` = "Approve"
    )
  )
# display data
data
#> Stress Visible Ford Prob se.fit
#> 1 Strongly disagree Not Visible Minority Disapprove 0.1662755 0.02168507
#> 2 Somewhat disagree Not Visible Minority Disapprove 0.2540448 0.01406690
#> 3 Somewhat agree Not Visible Minority Disapprove 0.3444737 0.01493285
#> 4 Strongly agree Not Visible Minority Disapprove 0.4322430 0.02332599
#> 5 Strongly disagree Visible Minority Disapprove 0.4178912 0.05466887
#> 6 Somewhat disagree Visible Minority Disapprove 0.4498611 0.03486740
#> 7 Somewhat agree Visible Minority Disapprove 0.4827998 0.02821222
#> 8 Strongly agree Visible Minority Disapprove 0.5147697 0.04188033
#> 9 Strongly disagree Not Visible Minority Approve 0.1662755 0.02168507
#> 10 Somewhat disagree Not Visible Minority Approve 0.2540448 0.01406690
#> 11 Somewhat agree Not Visible Minority Approve 0.3444737 0.01493285
#> 12 Strongly agree Not Visible Minority Approve 0.4322430 0.02332599
#> 13 Strongly disagree Visible Minority Approve 0.4178912 0.05466887
#> 14 Somewhat disagree Visible Minority Approve 0.4498611 0.03486740
#> 15 Somewhat agree Visible Minority Approve 0.4827998 0.02821222
#> 16 Strongly agree Visible Minority Approve 0.5147697 0.04188033
# plot the data
# Prob against Stress, coloured and connected per Ford level, one panel per
# Visible group. The points are jittered so that coincident observations from
# the two Ford levels can be told apart.
ggplot2::ggplot(
  data = data, # to show a single level, pass dplyr::filter(data, Ford == "Approve")
  mapping = aes(x = Stress, y = Prob, group = Ford, colour = Ford)
) +
  ggplot2::geom_jitter() + # change this back geom_point()
  ggplot2::geom_line() +
  ggplot2::facet_wrap(~Visible) +
  ggplot2::scale_y_continuous(limits = c(0, 1)) +
  ggplot2::theme(axis.text.x = element_text(angle = 45, vjust = 0.5))
Created on 2018-03-13 by the reprex package (v0.2.0).
Related
I'm trying to replicate the procedure proposed here on my data but I get the following error:
Error in interval.numeric(x, breaks = c(xmin - tol, ux, xmax)) :
invalid number of intervals
target is the categorical variable that I want to predict, while I would like to force the first split of the classification tree to be made on split.variable (also categorical). Indeed, due to the characteristics of the objects, if split.variable is 1, target can only be 1, while if it is 0, target can be either 0 or 1.
Initially I treated them as factors but I changed them to numeric and then rounded (as suggested in other posts in SO). Unfortunately, none of these solutions were helpful.
I played a bit with the data, subsampling cols and rows but still it doesn't work.
What am I missing?
Here is an MRE to replicate the error:
library(partykit)
tdf = structure(list(target = c(0, 0, 0, 1, 0, 0, 1, 1, 1, 1), split.variable = c(0,
0, 0, 0, 1, 0, 0, 0, 0, 0), var1 = c(2.021, 1.882, 1.633, 3.917,
2.134, 1.496, 1.048, 1.552, 1.65, 3.112), var2 = c(97.979, 98.118,
98.367, 96.083, 97.866, 98.504, 98.952, 98.448, 98.35, 96.888
), var3 = c(1, 1, 1, 0.98, 1, 1, 1, 1, 1, 1), var4 = c(1, 1,
1, 0.98, 1, 1, 1, 1, 1, 1), var5 = c(18.028, 25.207, 20.788,
28.548, 18.854, 19.984, 27.352, 24.622, 25.037, 24.067), var6 = c(0.213,
0.244, 0.289, 0.26, 0.887, 0.575, 0.097, 0.054, 0.104, 0.096),
var7 = c(63.22, 59.845, 62.45, 63.48, 52.143, 51.256, 56.296,
57.494, 59.543, 68.434), var8 = c(0.748, 0.795, 0.807, 0.793,
0.901, 0.909, 0.611, 0.61, 0.618, 0.589)), row.names = c(6L,
7L, 8L, 9L, 11L, 12L, 15L, 16L, 17L, 18L), class = "data.frame")
# Step 1: fit a tree on split.variable alone, limited to a depth of 1.
tr1 <- ctree(target ~ split.variable, data = tdf, maxdepth = 1)
# Step 2: fit a tree on all columns of tdf, using only the rows that tr1
# assigns to node 2.
tr2 <- ctree(target ~ split.variable + ., data = tdf, subset = predict(tr1, type = "node") == 2)
Your data set is too small to do what you want:
With just 10 observations tr1 does not lead to any splits but produces a tree with a single root node.
Consequently, predict(tr1, type = "node") produces a vector of 10 times 1.
Thus, the subset with predict(tr1, type = "node") == 2 is empty (all FALSE).
This leads to an (admittedly cryptic) error message, reflecting that you cannot learn a tree from an empty data set.
Additionally: I'm not sure where you found the recommendation to use numeric codings of categorical variables. But for partykit you are almost always better off coding categorical variables appropriately as factor variables.
I have a dataset comprised of leaves which I've weighed individually in order of emergence (first emerged through final emergence), and I'd like to combine these masses so that I have the entire mass of all the leaves for each individual plant.
How would I add these up using R programming language, or what would I need to google to get started on figuring this out?
structure(list(Tray = c(1, 1, 1, 1, 1, 1), Plant = c(2, 2, 2,
2, 3, 3), Treatment = structure(c(4L, 4L, 4L, 4L, 4L, 4L), .Label = c("2TLH",
"E2TL", "EH", "WL"), class = "factor"), PreSwitch = c("Soil",
"Soil", "Soil", "Soil", "Soil", "Soil"), PostSwitch = c("Soil",
"Soil", "Soil", "Soil", "Soil", "Soil"), Pellet = c(1, 1, 1,
1, 1, 1), Rep = c(1, 1, 1, 1, 1, 1), Date = structure(c(1618963200,
1618963200, 1618963200, 1618963200, 1618963200, 1618963200), tzone = "UTC", class = c("POSIXct",
"POSIXt")), DAP = c(60, 60, 60, 60, 60, 60), Position = c(2,
1, 3, 4, 4, 3), Whorl = structure(c(1L, 1L, 2L, 2L, 2L, 2L), .Label = c("1",
"2", "3", "4", "5"), class = "factor"), PetioleLength = c(1.229,
1.365, 1.713, 1.02, 0, 1.408), BladeLength = c(1.604, 1.755,
2.466, 2.672, 0.267, 2.662), BladeWidth = c(1.023, 1.185, 1.803,
1.805, 0.077, 1.771), BladeArea = c(1.289, 1.634, 3.492, 3.789,
0.016, 3.704), BladePerimeter = c(6.721, 7.812, 11.61, 12.958,
1.019, 14.863), BladeCircularity = c(0.359, 0.336, 0.326, 0.284,
0.196, 0.211), BPR = c(1.30512611879577, 1.28571428571429, 1.43957968476357,
2.61960784313725, NA, 1.890625), Leaf.Mass = c(9, 11, 31, 33,
32, 33), BladeAR = c(1.56793743890518, 1.48101265822785, 1.36772046589018,
1.4803324099723, 3.46753246753247, 1.50310559006211), Subirrigation = c(0,
0, 0, 0, 0, 0), Genotype = c(1, 1, 1, 1, 1, 1), Location = c(0,
0, 0, 0, 0, 0)), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
I may be missing something but isn't this a sum by Plant?
One solution below sums it for each plant into a separate table with just the totals and the second summarizes and adds it back to the main data set in a single step.
library(tidyverse)
# Option 1: a separate summary table with one row per plant holding the total
# leaf mass (missing masses are ignored).
# NOTE(review): if plant numbers repeat across trays, group by Tray and Plant
# instead — confirm against the full data.
plant_total <- df %>%
  group_by(Plant) %>%
  summarize(plant_weight = sum(Leaf.Mass, na.rm = TRUE))

# Option 2: add the per-plant total as a plant_weight column on df itself.
# The result is stored back in df (not in plant_total) so that it does not
# overwrite the summary table from option 1; ungroup() so that later steps are
# not silently computed per plant.
df <- df %>%
  group_by(Plant) %>%
  mutate(plant_weight = sum(Leaf.Mass, na.rm = TRUE)) %>%
  ungroup()
I am training a linear regression model predicting salary from company size (company_size_number) and country (country) using the StackOverflow data.
What I perform is:
Read the data. Split the data into a training set (75%) and a test set (25%).
Create a recipe that converts company_size_number into a factor variable and then transforms the two predictors into dummy variables.
Create the model specification.
Create a workflow object and add the recipe and model specification to it, then fit the model on the training set.
Calculate R² on the test set.
This is my code
library(tidyverse)
library(tidymodels)
# Read the data from the local .rds file.
so <- read_rds("stackoverflow.rds")
# Fix the seed before the random train/test split.
set.seed(123)
init_split <- initial_split(so)
so_training <- training(init_split)
so_testing <- testing(init_split)
# Recipe: salary is the outcome; the recipe data is restricted to the outcome
# and the two predictors.
# NOTE(review): step_num2factor() is given `company_size_number = factor(...)`
# and no `levels`; the error quoted below asks for a character vector for `levels`.
rec <- recipe(salary ~ ., data = so_training %>% select(salary, company_size_number, country)) %>%
step_num2factor(company_size_number = factor(company_size_number)) %>%
step_dummy(country, company_size_number)
# Model specification: linear regression with the "lm" engine.
model_spec <- linear_reg() %>%
set_engine("lm") %>%
set_mode("regression")
# Bundle model and recipe in a workflow and fit it on the training set.
fit <- workflow() %>%
add_model(model_spec) %>%
add_recipe(rec) %>%
fit(data = so_training)
# Predict on the test set and score the predictions against the observed salary.
# NOTE(review): the steps listed above mention R², but this call computes RMSE.
predict(fit, new_data = so_testing) %>%
mutate(truth = so_testing$salary) %>%
rmse(estimate = .pred, truth = truth)
But I am not able to proceed due to an error:
Error: Please provide a character vector of appropriate length for `levels`.
I presume I am messing something up here in the step_*() call. I also tried:
# Second attempt: the same recipe with step_novel() in place of step_num2factor().
rec <- recipe(salary ~ ., data = so_training %>% select(salary, company_size_number, country)) %>%
step_novel(company_size_number = factor(company_size_number)) %>%
step_dummy(country, company_size_number)
But I am not sure if this is correct. Any input would be helpful.
> dput(head(so))
structure(list(country = structure(c(5L, 5L, 4L, 4L, 5L, 5L), .Label = c("Canada",
"Germany", "India", "United Kingdom", "United States"), class = "factor"),
salary = c(63750, 93000, 40625, 45000, 1e+05, 170000), years_coded_job = c(4L,
9L, 8L, 3L, 8L, 12L), open_source = c(0, 1, 1, 1, 0, 1),
hobby = c(1, 1, 1, 0, 1, 1), company_size_number = c(20,
1000, 10000, 1, 10, 100), remote = structure(c(1L, 1L, 1L,
1L, 1L, 1L), .Label = c("Remote", "Not remote"), class = "factor"),
career_satisfaction = c(8L, 8L, 5L, 10L, 8L, 10L), data_scientist = c(0,
0, 1, 0, 0, 0), database_administrator = c(1, 0, 1, 0, 0,
0), desktop_applications_developer = c(1, 0, 1, 0, 0, 0),
developer_with_stats_math_background = c(0, 0, 0, 0, 0, 0
), dev_ops = c(0, 0, 0, 0, 0, 1), embedded_developer = c(0,
0, 0, 0, 0, 0), graphic_designer = c(0, 0, 0, 0, 0, 0), graphics_programming = c(0,
0, 0, 0, 0, 0), machine_learning_specialist = c(0, 0, 0,
0, 0, 0), mobile_developer = c(0, 1, 0, 0, 1, 0), quality_assurance_engineer = c(0,
0, 0, 0, 0, 0), systems_administrator = c(1, 0, 1, 0, 0,
1), web_developer = c(0, 0, 0, 1, 1, 1)), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))
I have a couple of recommendations for adjustments in what you are doing.
The first is to do the selecting of variables before splitting, so that when you use a formula like salary ~ ., you and/or the functions don't get confused about what is there.
The second is to not use step_num2factor() in the way you have; it would take a lot to get it to work correctly and I think you're better served converting it to a factor before you split. Take a look at this step's documentation to see a more appropriate use for this recipe step, and notice that you have to give it levels. This is the reason you saw the error you did, but honestly I wouldn't try to find the right levels and input them there; I'd do it before splitting.
library(tidyverse)
library(tidymodels)
# Load the Stack Overflow data from modeldata and clean up the column names.
data("stackoverflow", package = "modeldata")
so <- janitor::clean_names(stackoverflow)

# Prepare the modelling data before splitting: company size becomes a factor
# and only the outcome plus the two predictors are kept, so that `salary ~ .`
# refers to exactly those columns.
set.seed(123)
init_split <- so %>%
  mutate(company_size_number = factor(company_size_number)) %>%
  select(salary, company_size_number, country) %>%
  initial_split()
so_training <- training(init_split)
so_testing <- testing(init_split)

# Preprocessing: dummy-encode both predictors.
rec <- recipe(salary ~ ., data = so_training) %>%
  step_dummy(country, company_size_number)

# Model: linear regression fitted with lm.
model_spec <- linear_reg() %>%
  set_mode("regression") %>%
  set_engine("lm")

# Combine recipe and model, then fit on the training set.
fit <- workflow() %>%
  add_recipe(rec) %>%
  add_model(model_spec) %>%
  fit(data = so_training)

# Score the test-set predictions against the observed salaries.
predict(fit, new_data = so_testing) %>%
  mutate(truth = so_testing$salary) %>%
  rmse(truth = truth, estimate = .pred)
#> # A tibble: 1 x 3
#> .metric .estimator .estimate
#> <chr> <chr> <dbl>
#> 1 rmse standard 27822.
Created on 2021-05-25 by the reprex package (v2.0.0)
I would like to plot an environmental variable on a ggplot2 version of a DCA plot.
I have some code where I extract species and data scores from vegan and then plot them up in ggplot2. I am having trouble trying to work out how I can get my environmental variable SWLI to plot as an arrow - something like this RDA's plots with ggvegan: How can I change text position for arrows text? (or see PCA example here https://www.rpubs.com/an-bui/vegan-cheat-sheet)
Can anybody help?
#DCA Plot
library(plyr)
library(vegan)
library(ggplot2)
library(cluster)
library(ggfortify)
library(factoextra)
library(readr)
#read in csv and remove variables you don't want to go through analysis
regforamcountsall<-read_csv("regionalforamcountsallnocalcs.csv")
swli<-read_csv("DCAenv.csv")
rownames(regforamcountsall)<-regforamcountsall$Sample
regforamcountsall$Sample = NULL
regforamcountsall$Site=NULL
regforamcountsall$SWLI=NULL
#check csv
regforamcountsall
#run ordination
ord<-decorana(regforamcountsall)
#get species scores
summary(ord)
#get DCA values of environmental variable
ord.fit <- envfit(ord ~ SWLI, data=swli, perm=999)
ord.fit
plot(ord, dis="site")
plot(ord.fit)
#use this summary code to get species scores for DCA1 and DCA2
#put species scores values in from ord plot summary stats
species.scores<-read.csv("speciescores.csv")
species.scores$species <- row.names(species.scores)
#Using the scores function from vegan to extract the sample scores and convert to a data.frame
data.scores <- as.data.frame(scores(ord))
# create a column of groupings/clusters, from the rownames of data.scores
data.scores$endgroup <- as.factor(pam(regforamcountsall, 3)$clustering)
#getting the convex hull of each unique point set
# Return the rows of `df` that lie on the convex hull of its (DCA1, DCA2)
# points.
#   df: data frame with numeric columns DCA1 and DCA2.
#   returns: the hull-vertex rows of df, in the order given by chull().
find_hull <- function(df) {
  # Take the coordinates from the data frame that was passed in. The previous
  # version read them from the global data.scores, so the indices returned by
  # chull() did not refer to the rows of df.
  df[chull(df$DCA1, df$DCA2), , drop = FALSE]
}
# Convex hull of each cluster: split the site scores by endgroup, keep the
# hull vertices of every group that is present, and stack them into one data
# frame. Iterating over the groups themselves (instead of 1:k, which assumes
# the labels are exactly 1..k) and binding once avoids growing hulls with
# rbind() inside a loop.
group_scores <- split(data.scores, data.scores$endgroup, drop = TRUE)
hull_list <- lapply(group_scores, function(grp) {
  grp[chull(grp$DCA1, grp$DCA2), , drop = FALSE]
})
# unname() so that rbind() keeps the original row names instead of prefixing
# them with the group label; an empty list yields NULL, as before.
hulls <- do.call(rbind, unname(hull_list))
# Running index of the samples (used by the optional sample-number labels).
data.scores$numbers <- seq_len(nrow(data.scores))
regforamcountsall<-read_csv("regionalforamcountsallnocalcs.csv")
rownames(regforamcountsall)<-regforamcountsall$Sample
data.scores$Site<-regforamcountsall$Site
data.scores$SWLI<-regforamcountsall$SWLI
data.scores
#DCA with species
data.scores$Site <- as.character(data.scores$Site)
library(scico)
# An envfit object is not a data frame, so ggplot2 cannot use ord.fit directly
# as layer data. Extract the fitted vector scores into a data frame first.
# For a decorana ordination the axis columns are expected to be named
# DCA1/DCA2 — TODO confirm with names(env.arrows).
env.arrows <- as.data.frame(scores(ord.fit, display = "vectors"))
env.arrows$variable <- rownames(env.arrows)
# NOTE(review): if the arrow is short relative to the site scores, multiply
# DCA1/DCA2 in env.arrows by a constant to lengthen it.

dca <- ggplot() +
  # add the point markers: samples coloured by SWLI, species as crosses
  geom_point(
    data = data.scores,
    aes(x = DCA1, y = DCA2, colour = SWLI, pch = Site),
    size = 4
  ) +
  geom_point(
    data = species.scores,
    aes(x = DCA1, y = DCA2),
    size = 3, pch = 3, alpha = 0.8, colour = "grey22"
  ) +
  # add the hulls and labels - numbers position labels
  geom_polygon(
    data = hulls,
    aes(x = DCA1, y = DCA2, fill = endgroup),
    alpha = 0.25
  ) +
  # geom_text(data=data.scores,aes(x=DCA1-0.03,y=DCA2,colour=endgroup, label = numbers))+
  geom_text(
    data = species.scores,
    aes(x = DCA1 + 0.1, y = DCA2 + 0.1, label = species)
  ) +
  # environmental variable(s) as arrows from the origin, labelled at the tip
  geom_segment(
    data = env.arrows,
    aes(x = 0, y = 0, xend = DCA1, yend = DCA2),
    arrow = arrow(length = unit(0.3, "cm"))
  ) +
  geom_text(
    data = env.arrows,
    aes(x = DCA1, y = DCA2, label = variable),
    vjust = -0.5
  ) +
  theme_classic() +
  scale_color_scico(palette = "lapaz") +
  coord_fixed()
dca
#regforamcountsall data
structure(list(Sample = c("T3LB7.008", "T3LB7.18", "T3LB7.303",
"WAP 0 ST-2", "T3LB7.5", "LG120"), T.salsa = c(86.63793102, 68.5897436,
70.39274924, 5.199999999, 79.15057916, 44.40000001), H.wilberti = c(0,
0, 0, 0, 0.386100386, 9.399999998), Textularia = c(0, 0, 0, 0,
0, 0.4), T.irregularis = c(2.155172414, 10.25641026, 7.854984897,
0, 2.702702703, 0), P.ipohalina = c(0, 0, 0, 0, 0, 0), J.macrescens = c(4.741379311,
5.769230769, 4.833836859, 5.800000001, 8.108108107, 5.400000001
), T.inflata = c(6.465517244, 15.38461538, 16.918429, 83.2, 5.791505794,
40.4), S.lobata = c(0, 0, 0, 2.300000001, 0, 0), M.fusca = c(0,
0, 0, 3.499999999, 3.861003862, 0), A.agglutinans = c(0, 0, 0,
0, 0, 0), A.exiguus = c(0, 0, 0, 0, 0, 0), A.subcatenulatus = c(0,
0, 0, 0, 0, 0), P.hyperhalina = c(0, 0, 0, 0, 0, 0), SWLI = c(200,
197.799175, 194.497937, 192.034776, 191.746905, 190.397351),
Site = c("LSP", "LSP", "LSP", "WAP", "LSP", "LG")), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))
#data.scores
structure(list(DCA1 = c(-1.88587476921648, -1.58550534382589,
-1.59816311314591, -0.0851161831632892, -1.69080448670088, -1.14488987340879
), DCA2 = c(0.320139736602921, 0.226662031865046, 0.230912045301637,
-0.0531232712001122, 0.272143119753744, 0.0696939776869396),
DCA3 = c(-0.755595015095353, -0.721144380683279, -0.675071834919103,
0.402339366526422, -0.731006052784081, 0.00474996849420783
), DCA4 = c(-1.10780013276303, -0.924265835490466, -0.957711953532202,
-0.434438970032073, -0.957873836258657, -0.508347000558056
), endgroup = structure(c(1L, 1L, 1L, 2L, 1L, 1L), .Label = c("1",
"2", "3"), class = "factor"), numbers = 1:6, Site = c("LSP",
"LSP", "LSP", "WAP", "LSP", "LG"), SWLI = c(200, 197.799175,
194.497937, 192.034776, 191.746905, 190.397351)), row.names = c(NA,
6L), class = "data.frame")
#species.scores
structure(list(species = c("1", "2", "3", "4", "5", "6"), DCA1 = c(-2.13,
-1.6996, -2.0172, -0.9689, 1.0372, -0.3224), DCA2 = c(0.342,
-0.8114, 0.3467, -0.3454, 2.0007, 0.9147)), row.names = c(NA,
6L), class = "data.frame")
I'm doing Propensity Score Matching and want to subset the data into treatment and control groups using weights. There are 5 variables: ID, Treatment (Yes/No), Outcome (Yes/No), Age and Weight. I was trying to write a program in R, but I am having trouble doing this according to the weights. I am using the survey package.
dput(dat2):
structure(list(ID = c(1, 2, 3, 4, 6, 7),
Weight = c(2.4740626, 2.4740626, 2.4740626, 2.4740626, 1.9548149, 1.9548149),
Age = c("35-44", "<15-24", "25-34", "35-44", ">45", "25-34"),
Treatment = c(1, 0, 0, 1, 0, 0),
Outcome = c(1, 1, 1, 0, 1, 1)),
row.names = c(NA, -6L),
class = c("tbl_df", "tbl", "data.frame")))
head(dat2):
# Declare the survey design with per-row weights.
# NOTE(review): the dput above names the columns `ID` and `Treatment`, but the
# code below refers to `Id` and `treatment`, and it subsets `dat`, which is not
# defined in the code shown.
data<-svydesign(ids = ~dat2$Id,
weights = ~dat2$Weight,
data = dat2)
# Intended split into the treated and the control group.
treat<-subset(dat, dat2$treatment==1)
cont<-subset(dat, dat2$treatment==0)
I am sharing a sample of the data; the full data set has 1587 rows. When I find the dimensions without weights, the dimensions of treat and cont are 877*5 and 710*5 respectively. But with weights they will be 803*5 and 784*5.
Please help me.
Thanks in advance.
One way to do this is as below:
Sample Data
dat2 <- structure(list(ID = c(1, 2, 3, 4, 6, 7),
Weight = c(2.4740626, 2.4740626, 2.4740626, 2.4740626, 1.9548149, 1.9548149),
Age = c("35-44", "<15-24", "25-34", "35-44", ">45", "25-34"),
Treatment = c(1, 0, 0, 1, 0, 0),
Outcome = c(1, 1, 1, 0, 1, 1)),
row.names = c(NA, -6L),
class = c("tbl_df", "tbl", "data.frame"))
Script
# Declare the weighted design. The formulas name columns of the `data`
# argument directly (~ID, ~Weight) instead of reaching back into dat2 with `$`.
data <- svydesign(ids = ~ID, weights = ~Weight, data = dat2)
# Subset the design object (not the raw data frame) so that the weights stay
# attached to the treated and control groups.
treat <- subset(data, Treatment == 1)
cont <- subset(data, Treatment == 0)