I am working on a tv retail dataset in R and wanted to put steps I will need to use repeatedly into a function.
This includes checking the VIF and return it, run the STEP algorithm to determine the best model and then use the result of the STEP and display it.
The major issue is the error message
Error in eval(predvars, data, env) : object 'Hour' not found
which appears to appear in the step() call.
Regression <- function(data, dep_var, features) {
lin.null = lm(paste(dep_var,'~ 1', sep = ''), data= data)
lin.full = lm(paste(dep_var,'~', paste(features, collapse='+'), sep = ''), data = data)
vif(lin.full)
opt = step(lin.null, scope = list(lower = lin.null, upper = lin.full), direction = "forward")
step_opt = opt$call
stargazer(step_opt, type = 'text')
}
dep_var = 'imp'
feat = c('Hour', 'grp')
paste(dep_var,'~', paste(feat, collapse='+'), sep = '')
Regression(comb_a, 'imp', feat)
The final result should show me the VIF values for each variable and the stargazer output of the STEP optimized regression.
EDIT 1:
comb_a is the input data the regression should take
The dput() output follows down below:
# comb_a
structure(list(Day = structure(c(1483833600, 1483833600, 1483833600,
1483833600, 1483833600, 1483833600), class = c("POSIXct", "POSIXt"
), tzone = "UTC"), Hour = c(0, 1, 6, 7, 8, 9), Model = c("Model A",
"Model A", "Model A", "Model A", "Model A", "Model A"), tv_count = c(5L,
8L, 4L, 9L, 11L, 8L), grp_abs = c(55500, 8308, 19026, 12184,
10141, 113225), grp = c(0.22, 0.03, 0.07, 0.05, 0.04, 0.45),
sum_duration = c(150, 240, 120, 270, 330, 240), grp_per_second = c(370,
34.6166666666667, 158.55, 45.1259259259259, 30.730303030303,
471.770833333333), hours_since = c(NA, 1, 5, 1, 1, 1), camp_count = c(2L,
2L, 2L, 2L, 3L, 4L), imp = c(528, 319, 97, 182, 327, 785),
clicks = c(28, 15, 6, 13, 29, 53), leads = c(0, 0, 0, 0,
0, 1)), .Names = c("Day", "Hour", "Model", "tv_count", "grp_abs",
"grp", "sum_duration", "grp_per_second", "hours_since", "camp_count",
"imp", "clicks", "leads"), row.names = c(NA, -6L), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), vars = c("Day", "Hour"), drop = TRUE, indices = list(
0L, 1L, 2L, 3L, 4L, 5L), group_sizes = c(1L, 1L, 1L, 1L,
1L, 1L), biggest_group_size = 1L, labels = structure(list(Day = structure(c(1483833600,
1483833600, 1483833600, 1483833600, 1483833600, 1483833600), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), Hour = c(0, 1, 6, 7, 8, 9)), row.names = c(NA,
-6L), class = "data.frame", vars = c("Day", "Hour"), drop = TRUE, .Names = c("Day",
"Hour")))
desired output would be: (Numbers are just for representation)
> vif(lin.full)
Hour grp sum_duration grp_per_second hours_since camp_count
2.979362 4.981504 2.290328 3.279818 1.013725 1.110823
imp clicks
7.471457 9.244811
> stargazer(step_opt, type = 'text')
===============================================
Dependent variable:
---------------------------
leads
-----------------------------------------------
clicks 0.005***
(0.0004)
camp_count 0.040*
(0.024)
Constant -0.107
(0.098)
-----------------------------------------------
Observations 898
R2 0.181
Adjusted R2 0.179
Residual Std. Error 0.772 (df = 895)
F Statistic 98.901*** (df = 2; 895)
===============================================
Note: *p<0.1; **p<0.05; ***p<0.01
Related
R output is portraying with all values under the nodes, help me clean this up
click here [1]: https://i.stack.imgur.com/LUxFf.png
I cannot add the data, however here is the code, I am new at this usually I used SAS and SAS Eminer
`enter code here
`install.packages("party")
install.packages("rpart")
install.packages("rattle")
install.packages("rpart.plot")
install.packages("RColorBrewer")
library(rpart)
library(rattle)
library(rpart.plot)
library(RColorBrewer)
fit <- rpart(FPD ~.
, data = smallfpd
, method = 'class'
,control = rpart.control(minsplit = 10
,minbucket = 10
,cp = 0.0001
,maxdepth = 5))
prp(fit, type=4, extra=101, under=FALSE, faclen=2)
text(fit,pretty)
fit
A whole bunch of 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L)
ordered = c(Store = FALSE, Mon = FALSE,
YYYY = FALSE, Cust_State = FALSE, Campaign = FALSE, CUST_TYPE = FALSE,
Req_Loan_Amt = FALSE, PaycheckAmt = FALSE, IncomeMonthly = FALSE,
Payroll = FALSE, PayPeriod = FALSE, AGE = FALSE, WithdrawReason = FALSE,
Fin_Charge = FALSE, APR = FALSE, LoyaltyLevel = FALSE))
output for dput(head(smallfpd))
structure(list(Store = c(1, 1, 1, 1, 1, 1), Mon = c(12, 12, 10,
1, 10, 10), YYYY = c(2021, 2021, 2021, 2022, 2021, 2021), FPD = c(0,
0, 1, 0, 0, 0), Cust_State = c("OH", "AR", "MN", "CA", "CA",
"FL"), Campaign = c("II10", "LYSC", "PMSE", "PYCA", "LMCA", "YSSC"
), CUST_TYPE = c("NEW", "NEW", "NEW", "NEW", "NEW", "NEW"), Req_Loan_Amt = c("$1,850.00",
"$3,000.00", "$4,000.00", "$2,400.00", "$2,400.00", "$4,000.00"
), PaycheckAmt = c("$9,999,999,999.00", "$190,000.00", "$41,787.00",
"$26,000.00", "$25,000.00", "$22,146.43"), IncomeMonthly = c("$9,999,999,999.00",
"$411,666.67", "$41,787.00", "$56,333.33", "$25,000.00", "$47,983.93"
), Payroll = c("D", "D", "D", "D", "D", "D"), PayPeriod = c("M",
"B", "M", "B", "M", "B"), AGE = c(63, 46, 38, 46, 72, 29), WithdrawReason = c("Void due to active loan in third party",
"Void due to active loan in third party", "Void due to active loan in third party",
"Void due to active loan in third party", "Void due to active loan in third party",
"Void due to active loan in third party"), Fin_Charge = c("$167.98",
"$273.70", "$610.96", "$225.10", "$478.61", "$375.16"), APR = c(221.9867,
221.6611, 222.2844, 218.4176, 221.116, 218.4194), LoyaltyLevel = c(0,
0, 0, 0, 0, 0)), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
I'm trying to add significance annotations to an errorbar plot with a factor x-axis and dodged groups within each level of the x-axis. It is a similar but NOT identical use case to this
My base errorbar plot is:
library(ggplot2)
library(dplyr)
pres_prob_pd = structure(list(x = structure(c(1, 1, 1, 2, 2, 2, 3, 3, 3), labels = c(`1` = 1,
`2` = 2, `3` = 3)), predicted = c(0.571584427222816, 0.712630712634987,
0.156061969566517, 0.0162388386564817, 0.0371877245103279, 0.0165022541901018,
0.131528946944238, 0.35927812866896, 0.0708662221985375), std.error = c(0.355802875027348,
0.471253661425626, 0.457109887762665, 0.352871728451576, 0.442646879181155,
0.425913568532558, 0.376552208691762, 0.48178172708116, 0.451758041335245
), conf.low = c(0.399141779923204, 0.496138837620712, 0.0701919316506831,
0.00819832576725402, 0.0159620304815404, 0.00722904089045731,
0.0675129352870401, 0.17905347369819, 0.030504893442457), conf.high = c(0.728233665534388,
0.861980236164486, 0.311759350126477, 0.031911364587827, 0.0842227723261319,
0.0372248587668487, 0.240584344249407, 0.590437963881823, 0.156035177669385
), group = structure(c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), .Label = c("certain",
"neutral", "uncertain"), class = "factor"), group_col = structure(c(1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), .Label = c("certain", "neutral",
"uncertain"), class = "factor"), language = structure(c(2L, 2L,
2L, 1L, 1L, 1L, 3L, 3L, 3L), .Label = c("english", "dutch", "german"
), class = "factor"), top = c(0.861980236164486, 0.861980236164486,
0.861980236164486, 0.0842227723261319, 0.0842227723261319, 0.0842227723261319,
0.590437963881823, 0.590437963881823, 0.590437963881823)), row.names = c(NA,
-9L), groups = structure(list(language = structure(1:3, .Label = c("english",
"dutch", "german"), class = "factor"), .rows = structure(list(
4:6, 1:3, 7:9), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, 3L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
#dodge
pd = position_dodge(.75)
#plot
p = ggplot(pres_prob_pd,aes(x=language,y=predicted,color=group,shape=group)) +
geom_point(position=pd,size=2) +
geom_errorbar(aes(ymax=conf.high,ymin=conf.low),width=.125,position=pd)
p
What I want to do is annotate the plot such that the contrasts between group within each level of language are annotated for significance. I've plotted points representing the relevant contrasts and (toy) sig. annotations as follows:
#bump function
f = function(x){
v = c()
bump=0.025
constant = 0
for(i in x){
v = c(v,i+constant+bump)
bump = bump + 0.075
}
v
}
#create contrasts
combs = data.frame(gtools::combinations(3, 2, v=c("certain", "neutral", "uncertain"), set=F, repeats.allowed=F)) %>%
mutate(contrast=c("cont_1","cont_2","cont_3"))
combs = rbind(combs %>% mutate(language = 'english'),
combs %>% mutate(language='dutch'),
combs %>% mutate(language = "german")) %>%
left_join(select(pres_prob_pd,language:top)%>%distinct(),by='language') %>%
group_by(language)
#long transform and calc y_pos
combs_long = mutate(combs,y_pos=f(top)) %>% gather(long, probability, X1:X2, factor_key=TRUE) %>% mutate(language=factor(language,levels=c("english","dutch","german"))) %>%
arrange(language,contrast)
#back to wide
combs_wide =combs_long %>% spread(long,probability)
combs_wide$p = rep(c('***',"*","ns"),3)
#plot
p +
geom_point(data=combs_long,
aes(x = language,
color=probability,
shape=probability,
y=y_pos),
inherit.aes = T,
position=pd,
size=2) +
geom_text(data=combs_wide,
aes(x=language,
label=p,
y=y_pos+.025,
group=X1),
color='black',
position=position_dodge(.75),
inherit.aes = F)
What I am failing to achieve is plotting a line connecting each of the contrasts of group within each level of language, as is standard when annotating significant group-wise differences. Any help much appreciated!
getting a "subscript out of bounds" error in describeBy summary stat. when grouping by two variables and mat=TRUE for dataset > 1000 obs.
It Works ok for large dataset if mat=TRUE is removed and then it is a list output.
Using psych version 1.8.12.
Error in [<-(tmp, var, group + 1, value = dim.names[[group]][[groupi]]) :
subscript out of bounds error
any help is appreciated
"data" has 120951 observations, 6 variables.
prod_metrics <- describeBy(list(data$TOTALCHARGES, data$NONCOVEREDCHARGES,
data$COVEREDCHARGES, data$HCE_TARGET_AMT),
list(data$PARENTCLIENTCODE, data$RECEIVEDDATE), mat = TRUE, digits = 2)
Here is the structure of data:
dput(head(data))
structure(list(TOTALCHARGES = c(216, 496.68, 150, 610, 6259.73, 1020.97), PARENTCLIENTCODE = structure(c(468L, 253L, 456L, 456L, 43L, 167L), .Label = c("5STAR-P", "AAE-P", "AALT-P", "ABA-P",
"ABN-P", "ACN-P","XIND-P","XKAI-P","XOXF-P","XPACI-P","XPIC-P","XRX-P","XSFH-P","XSISTERP"), class = "factor"), RECEIVEDDATE = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("2019-02-14","2019-02-15","2019-02-16","2019-02-17","2019-02-18","2019-02-19","2019-02-20","2019-02-21",
"2019-04-03","2019-04-04","2019-04-05"), class = "factor"), NONCOVEREDCHARGES = c(0, 0, 0, 0, 0, 0), COVEREDCHARGES = c(216,
496.68, 150, 610, 6259.73, 1020.97), HCE_TARGET_AMT = c(216,496.68, 150, 610, 6259.73, 1020.97)),.Names = c("TOTALCHARGES",
"PARENTCLIENTCODE", "RECEIVEDDATE", "NONCOVEREDCHARGES", "COVEREDCHARGES",
"HCE_TARGET_AMT"), row.names = c(NA, 6L), class = "data.frame")
The code is scraping a website for stock data and returns a 1x18 dataframe for each stock. I am trying to convert the dataframe into a vector without turning the numeric columns into factors which is what is happening. I have also attempted to try and turn the dataframe into a matrix, but the numeric columns are still being converted into factors. In conclusion, I would like to keep characters as characters and numeric as numeric all in a vector. Thank you.
#get.dates is a function I created to scrape
data = get.dates("AAPL")
class(data)
[1] "data.frame"
class(data$surprise)
[1] "numeric"
dput(data)
structure(list(date = "2019-05-07T00:00:00", company = "Apple",
ticker = "AAPL", periodEnding = "Mar 2019", eps = "2.37",
reportedEPS = NA_character_, lastEps = "2.73", consensus = 4L,
bpConsensus = 4L, ratingsAndPT = structure(list(priceTarget = 177.34,
numBuys = 17L, numHolds = 18L, numSells = 0L), class = "data.frame", row.names = c(NA,
-1L)), bpRatingsAndPT = structure(list(priceTarget = 176.88,
numBuys = 14L, numHolds = 14L, numSells = 0L), class = "data.frame", row.names = c(NA,
-1L)), marketCap = 827573630900, sector = 18731L, stockId = 7624L,
stockTypeId = 1L, surprise = NA_real_, timeOfDay = 4L, isConfirmed = FALSE), class = "data.frame", row.names = c(NA,
-1L))
data = unlist(data)
class(data)
[1] "character"
So the final output is to rbind each of the outputs into a single data.frame.
I think I have to convert each 1x18 dataframe into a vector to rbind because I am getting an error when trying to rbind the columns using the foreach package.
tickers = c("AAPL", "PEP", "KO")
system.time({
data = foreach(r = tickers, .packages = c("jsonlite", "dplyr"), .combine = rbind) %dopar% {get.dates(r)}
})
error calling combine function:
<simpleError in `.rowNamesDF<-`(x, value = value): duplicate 'row.names' are not allowed>
user system elapsed
0.02 0.00 0.56
Warning message:
non-unique value when setting 'row.names': ‘1’
print(data)
NULL
#I will do the same thing outside of the foreach loop to give some more context
data = lapply(tickers, get.dates)
do.call(rbind, data)
Error in `.rowNamesDF<-`(x, value = value) :
duplicate 'row.names' are not allowed
In addition: Warning message:
non-unique value when setting 'row.names': ‘1’
dput(data)
list(structure(list(date = "2019-05-07T00:00:00", company = "Apple",
ticker = "AAPL", periodEnding = "Mar 2019", eps = "2.37",
reportedEPS = NA_character_, lastEps = "2.73", consensus = 4L,
bpConsensus = 4L, ratingsAndPT = structure(list(priceTarget = 177.34,
numBuys = 17L, numHolds = 18L, numSells = 0L), class = "data.frame", row.names = c(NA,
-1L)), bpRatingsAndPT = structure(list(priceTarget = 176.88,
numBuys = 14L, numHolds = 14L, numSells = 0L), class = "data.frame", row.names = c(NA,
-1L)), marketCap = 827573630900, sector = 18731L, stockId = 7624L,
stockTypeId = 1L, surprise = NA_real_, timeOfDay = 4L, isConfirmed = FALSE), class = "data.frame", row.names = c(NA,
-1L)), structure(list(date = "2019-04-23T00:00:00", company = "Coca-Cola",
ticker = "KO", periodEnding = "Mar 2019", eps = "0.46", reportedEPS = NA_character_,
lastEps = "0.47", consensus = 4L, bpConsensus = 5L, ratingsAndPT = structure(list(
priceTarget = 50.89, numBuys = 4L, numHolds = 5L, numSells = 0L), class = "data.frame", row.names = c(NA,
-1L)), bpRatingsAndPT = structure(list(priceTarget = 51.25,
numBuys = 3L, numHolds = 1L, numSells = 0L), class = "data.frame", row.names = c(NA,
-1L)), marketCap = 193681840000, sector = 18731L, stockId = 8359L,
stockTypeId = 1L, surprise = NA_real_, timeOfDay = 4L, isConfirmed = FALSE), class = "data.frame", row.names = c(NA,
-1L)), structure(list(date = "2019-04-25T00:00:00", company = "PepsiCo",
ticker = "PEP", periodEnding = "Mar 2019", eps = "0.92",
reportedEPS = NA_character_, lastEps = "0.96", consensus = 4L,
bpConsensus = 4L, ratingsAndPT = structure(list(priceTarget = 123.67,
numBuys = 4L, numHolds = 3L, numSells = 0L), class = "data.frame", row.names = c(NA,
-1L)), bpRatingsAndPT = structure(list(priceTarget = 126,
numBuys = 1L, numHolds = 1L, numSells = 0L), class = "data.frame", row.names = c(NA,
-1L)), marketCap = 163697620000, sector = 18731L, stockId = 10962L,
stockTypeId = 1L, surprise = NA_real_, timeOfDay = 4L, isConfirmed = FALSE), class = "data.frame", row.names = c(NA,
-1L)))
Here is what I would like the output to look like. Thank you!!
You basically have to do your own list flattening here, which is not desirable. It's easier to do this when you get the json data originally. https://rdrr.io/cran/jsonlite/man/flatten.html
The below solution users purrr but you can do it with a for-loop or apply functions if you prefer. There are two main ideas here:
1. Bind together the dataframe-type columns with the part of the dataframe that doesn't have any nested columns. In your example, we bind together 3 separate pieces: 1 original dataframe with df_cols removed, and the other two dataframe columns. You can do this with bind_cols. It helps to prepend the original column names to avoid duplicates.
2. Collapse all the rows together with rbind or the like.
flatten_df_cols <- function(df) {
df_cols <- map_lgl(df, is.data.frame)
imap_dfc(df[, df_cols], ~setNames(.x, paste0(.y, ".", names(.x)))) %>%
bind_cols(list(df[, !df_cols]), .)
}
map_dfr(data, flatten_df_cols)
Observations: 3
Variables: 24
$ date <chr> "2019-05-07T00:00:00", "2019-04...
$ company <chr> "Apple", "Coca-Cola", "PepsiCo"
$ ticker <chr> "AAPL", "KO", "PEP"
$ periodEnding <chr> "Mar 2019", "Mar 2019", "Mar 2019"
$ eps <chr> "2.37", "0.46", "0.92"
$ reportedEPS <chr> NA, NA, NA
$ lastEps <chr> "2.73", "0.47", "0.96"
$ consensus <int> 4, 4, 4
$ bpConsensus <int> 4, 5, 4
$ marketCap <dbl> 827573630900, 193681840000, 163...
$ sector <int> 18731, 18731, 18731
$ stockId <int> 7624, 8359, 10962
$ stockTypeId <int> 1, 1, 1
$ surprise <dbl> NA, NA, NA
$ timeOfDay <int> 4, 4, 4
$ isConfirmed <lgl> FALSE, FALSE, FALSE
$ ratingsAndPT.priceTarget <dbl> 177.34, 50.89, 123.67
$ ratingsAndPT.numBuys <int> 17, 4, 4
$ ratingsAndPT.numHolds <int> 18, 5, 3
$ ratingsAndPT.numSells <int> 0, 0, 0
$ bpRatingsAndPT.priceTarget <dbl> 176.88, 51.25, 126.00
$ bpRatingsAndPT.numBuys <int> 14, 3, 1
$ bpRatingsAndPT.numHolds <int> 14, 1, 1
$ bpRatingsAndPT.numSells <int> 0, 0, 0
I have the following data.
> dput(testdat)
structure(list(Type = structure(c(2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Saline",
"Compound1"), class = "factor"), Treatment = structure(c(1L,
2L, 3L, 4L, 6L, 5L), .Label = c(".0032uM", ".016uM", ".08uM",
".4uM", "2uM", "10uM"), class = "factor"), Peak = c(1071.28430020209,
1458.23366806524, 2714.49856342393, 3438.83453920159, 3938.86391759534,
2980.10159109856), Area1 = c(3312.99749863082, 4798.35142770291,
9044.21362002965, 11241.1497514069, 11575.3444645068, 9521.69011119236
), SS1 = c(781.759834505516, 1191.6273298958, 2180.02082601411,
2601.33855989239, 2492.11886600804, 2185.39715502702), Conc = c(0.0032,
0.016, 0.08, 0.4, 10, 2), logconc = c(-2.49485002168009, -1.79588001734408,
-1.09691001300806, -0.397940008672038, 1, 0.301029995663981),
Conc_nm = c(3.2, 16, 80, 400, 10000, 2000), logconc_nm = c(0.505149978319906,
1.20411998265592, 1.90308998699194, 2.60205999132796, 4,
3.30102999566398)), .Names = c("Type", "Treatment", "Peak",
"Area1", "SS1", "Conc", "logconc", "Conc_nm", "logconc_nm"), row.names = 2:7, class = "data.frame")
I've fitted the data (Peak) with a nls regression using the following code:
fit = nls(Peak ~ SSlogis(logconc_nm,Asym,xmid,scal),data=testdat)
This gives me a nice fit and I'm happy with it so I plot the dose response as follows:
m <- coef(fit)
vallog <- as.numeric(format((m[3]),dig=4))
val =round(10^val,2)
ggplot(data = testdat,aes(logconc_nm,Peak))+
geom_point()+
scale_x_log10(breaks=round(testdat$logconc_nm,2))+
geom_smooth(method = 'nls',
formula = y ~ SSfpl(x,A,B,xmid,scal),se=FALSE)+
geom_vline(color='red',xintercept = vallog,alpha=.5)+
geom_text(aes(x=vallog,y=max(Peak),label = paste0('EC50',val,'nM')),color='red')#,angle=90)
My Question is:
How can I add a big ol' red point on the blue line where the blue and red line meet. I'd like to replace the need for the red line with the red dot. I know i have to use geom_point but because it's a fitted line, i can't just say x=vallog can i?