I have the following dataset
set.seed(42)
cancer <- sample(c("yes", "no"), 200, replace=TRUE)
agegroup <- sample(c("35-39", "40-44", "45-49"), 200, replace=TRUE)
agefirstchild <- sample(c("Age < 30", "Age 30 or greater", "nullipareous"), 200, replace=TRUE)
dat <- data.frame(cancer, agegroup, agefirstchild)
And I am running this code to create a barchart. 2 questions.
1.I would now like to have the chart for the whole dataset not only cancer = yes
2. After I did run the library(plyr) I received a warning it wasn't working with a specific package.
Below plot was working, but after running this library not anymore. This is the error message: "Error in print.default(m, ..., quote = quote, right = right, max = max) :
invalid 'na.print' specification"
riskwoinvasivetrain%>%
group_by(Agegroup) %>%
summarize(prop_cancer = mean(Cancer == 'yes')) %>%
print(n=1000)
And just would like to have a simple frequency table telling me the size (n) of each subgroup. E.g size age 35-39 is
'data.frame': 159093 obs. of 12 variables:
$ Menopause : chr "Postmenopausal" "Postmenopausal" "Postmenopausal" "Postmenopausal" ...
$ Agegroup : chr "45-49" "45-49" "45-49" "45-49" ...
$ Density : chr "Almost entirely fat" "Almost entirely fat" "Almost entirely fat" "Almost entirely fat" ...
$ Race : chr "white" "white" "white" "white" ...
$ BMI : chr "10-24.99" "10-24.99" "10-24.99" "10-24.99" ...
$ AgeFirstBirth : chr "< 30" "< 30" "< 30" "< 30" ...
$ NumberRelativesCancer : chr "zero" "zero" "zero" "zero" ...
$ PreviousBreastProcedure: int 0 0 0 0 0 0 0 0 0 0 ...
$ LastMammogram : int 0 0 0 0 0 0 0 0 0 0 ...
$ SurgicalMenopause : int 0 0 0 0 0 0 0 0 0 0 ...
$ HRT : chr "no" "no" "no" "no" ...
$ Cancer : chr "no" "no" "no" "no" ...````
We can take the count, divide by the sum of 'n' for percentage and then do the plotting with ggplot
library(dplyr)
library(ggplot2)
dat %>%
count(agegroup, cancer) %>%
mutate(prop_cancer = n/sum(n)) %>%
ggplot(aes(x = agegroup, y = n, fill = cancer)) +
geom_col()
Related
I am learning R with the Fraud Transaction data. When I try to use ROSE to handle the imbalanced dataset, the only handle continuous and categorical variables error pops up.
Here's what I tried:
str(dataset)
'data.frame': 6362620 obs. of 13 variables:
$ step : int 1 1 1 1 1 1 1 1 1 1 ...
$ type : chr "PAYMENT" "PAYMENT" "TRANSFER" "CASH_OUT" ...
$ amount : num 9840 1864 181 181 11668 ...
$ nameOrig : chr "C1231006815" "C1666544295" "C1305486145" "C840083671" ...
$ oldbalanceOrg : num 170136 21249 181 181 41554 ...
$ newbalanceOrig : num 160296 19385 0 0 29886 ...
$ nameDest : chr "M1979787155" "M2044282225" "C553264065" "C38997010" ...
$ oldbalanceDest : num 0 0 0 21182 0 ...
$ newbalanceDest : num 0 0 0 0 0 ...
$ isFraud : int 0 0 1 1 0 0 0 0 0 0 ...
$ isFlaggedFraud : int 0 0 0 0 0 0 0 0 0 0 ...
$ balancedOfOrigin: num -9840 -1864 -181 -181 -11668 ...
$ balancedOfDest : num 0 0 0 21182 0 ...
datadata_ROSE <- ROSE(isFraud~., data = dataset, N = 500, seed = 111)$data
With Error:
Error in rose.sampl(n, N, p, ind.majo, majoY, ind.mino,
minoY, y, classy, : The current implementation of ROSE handles
only continuous and categorical variables.
Debugging:
# change the isFraud attribute into category 0/1
dataset$isFraud = as.factor(dataset$isFraud)
datadata_ROSE <- ROSE(isFraud~., data = dataset, N = 500, seed = 111)$data
At the end, the error still cannot be solved. How can I turn the dataset fit ROSE model?
As you can see in your str part, type, nameOrig, nameDest are still character not factor. It will work with change them to factors. But when I look at nameOrig and nameDest, it's not seems to be appropriate to included in ROSE.
dummy2 <- head(dataset, 100)
dummy2$isFraud = as.factor(dummy2$isFraud)
#additional part.
dummy2 <- dummy2 %>%
mutate(type = factor(type),
nameDest = factor(nameDest),
nameOrig = factor(nameOrig))
dummy3 <- ROSE(isFraud~., data = dummy2, N = 500, seed = 111)$data
A fully reproducible example. The problem is now the new columns created is in c() of all the values instead of each value being in it's individual row.
date = seq(as.Date("2019/01/01"), by = "month", length.out = 48)
productB = rep("B",48)
productB = rep("B",48)
productA = rep("A",48)
productA = rep("A",48)
subproducts1=rep("1",48)
subproducts2=rep("2",48)
subproductsx=rep("x",48)
subproductsy=rep("y",48)
b1 <- c(rnorm(30,5), rep(0,18))
b2 <- c(rnorm(30,5), rep(0,18))
b3 <-c(rnorm(30,5), rep(0,18))
b4 <- c(rnorm(30,5), rep(0,18))
Created the dataframe below
dfone <- data.frame("date"= rep(date,4),
"product"= c(rep(productB,2),rep(productA,2)),
"subproduct"=
c(subproducts1,subproducts2,subproductsx,subproductsy),
"actuals"= c(b1,b2,b3,b4))
export_df <- split(dfone[1:4], dfone[3])
# Creation of data frames based off UNIQUE SUBPRODUCTS
dummy_list <- split(dfone[1:4], dfone[3]) %>% lapply( function(x)
x[(names(x) %in% c("date", "actuals"))])
dummy_list <- lapply(dummy_list, function(x) { x["date"] <- NULL; x })
list_dfs <- list()
for (i in 1:length(unique(dfone$subproduct))) {
#assign(paste0("df", i), as.data.frame(dummy_list[[i]]))
list_dfs <-append(list_dfs,dummy_list[[i]])
}
combined_dfs <- Reduce(function(x, y) merge(x, y, all = TRUE,
by='date'), list(list_dfs))
Creating the time series
list_ts <- lapply(list_dfs, function(t)
ts(t,start=c(2019,1),end=c(2021,6), frequency = 12)) %>%
lapply( function(t) ts_split(t,sample.out=(0.2*length(t)))) #
creates my train test split
list_ts <- do.call("rbind", list_ts) #Creates a list of time series
Creating the models
model_tune_ses1 <- lapply(list_ts[1:(length(list_ts)/2)], function(x)
forecast::forecast(ses(x,h=24,alpha=0.1)))
model_tune_ses1 <- lapply(model_tune_ses1, "[", c("mean"))
model_tune_ses2 <- lapply(list_ts[1:(length(list_ts)/2)], function(x)
forecast::forecast(ses(x,h=24,alpha=0.2)))
model_tune_ses2 <- lapply(model_tune_ses2, "[", c("mean"))
model_trp_holt_mult <- lapply(list_ts[1:(length(list_ts)/2)],
function(x)
forecast::forecast( HoltWinters(x,seasonal="multiplicative"),h=24))
model_trp_holt_mult <- lapply(model_trp_holt_mult, "[", c("mean"))
lst1 <- do.call(Map, c(f = cbind, mget(ls(pattern = 'model_'))))
export_df1 <- Map(cbind, export_df, lst1)
export_df1 <- bind_rows(export_df1, .id = "date")
I've deleted the other edits to save a lot more space.
Edit2:
export_df1[[6]]
[[1]]
Jan Feb Mar Apr May Jun Jul Aug
Sep Oct Nov Dec
2021 6.508872 4.639274 4.678671 4.626766 5.327353 6.890269 6.640483
6.164311 5.317675 6.152747 5.963053 5.243159
2022 6.517052 4.645104 4.684550 4.632579 5.334046 6.898923 6.648823
6.172052 5.324353 6.160472 5.970540 5.249740
export_df1[[6]]
[[2]]
Jan Feb Mar Apr May Jun Jul Aug
Sep Oct Nov Dec
2021 6.508872 4.639274 4.678671 4.626766 5.327353 6.890269 6.640483
6.164311 5.317675 6.152747 5.963053 5.243159
2022 6.517052 4.645104 4.684550 4.632579 5.334046 6.898923 6.648823
6.172052 5.324353 6.160472 5.970540 5.249740
Instead of having each row in my dataframe as a list of all of those values, I want the column in the form like below.
export_df1$actuals
[1] 4.729682 4.573595 7.338069 4.742559 5.853501 3.167612 6.305137
5.879437 4.571004 5.367115 5.127305 4.552428 5.843000 4.060205 4.125869
4.190687
[17] 4.894595 6.454123 4.686262 4.196005 7.289879 6.206924 6.191610
6.100175 4.769656 4.829249 5.287280 4.425435 5.158180 4.402173 0.000000
0.000000 etc
We could get all the 'list_ts_ses' objects in a list
lst1 <- do.call(Map, c(f = cbind, mget(ls(pattern = 'list_ts_ses'))))
export_df1 <- Map(cbind, export_df, lst1)
Update
out <- Map(function(x, y) {x[colnames(y)] <- sapply(y,
function(u) c(rep(0, nrow(x) - length(u)), u)); x}, export_df, lst1)
-checking the structure
str(out)
List of 4
$ 1:'data.frame': 48 obs. of 7 variables:
..$ date : Date[1:48], format: "2019-01-01" "2019-02-01" "2019-03-01" "2019-04-01" ...
..$ product : chr [1:48] "B" "B" "B" "B" ...
..$ subproduct : chr [1:48] "1" "1" "1" "1" ...
..$ actuals : num [1:48] 5.57 5.16 5.33 4.15 4.29 ...
..$ model_trp_holt_mult: num [1:48] 0 0 0 0 0 0 0 0 0 0 ...
..$ model_tune_ses1 : num [1:48] 0 0 0 0 0 0 0 0 0 0 ...
..$ model_tune_ses2 : num [1:48] 0 0 0 0 0 0 0 0 0 0 ...
$ 2:'data.frame': 48 obs. of 7 variables:
..$ date : Date[1:48], format: "2019-01-01" "2019-02-01" "2019-03-01" "2019-04-01" ...
..$ product : chr [1:48] "B" "B" "B" "B" ...
..$ subproduct : chr [1:48] "2" "2" "2" "2" ...
..$ actuals : num [1:48] 6.5 5.07 4.06 6.9 3.72 ...
..$ model_trp_holt_mult: num [1:48] 0 0 0 0 0 0 0 0 0 0 ...
..$ model_tune_ses1 : num [1:48] 0 0 0 0 0 0 0 0 0 0 ...
..$ model_tune_ses2 : num [1:48] 0 0 0 0 0 0 0 0 0 0 ...
$ x:'data.frame': 48 obs. of 7 variables:
..$ date : Date[1:48], format: "2019-01-01" "2019-02-01" "2019-03-01" "2019-04-01" ...
..$ product : chr [1:48] "A" "A" "A" "A" ...
..$ subproduct : chr [1:48] "x" "x" "x" "x" ...
..$ actuals : num [1:48] 5.21 6.51 2.42 4.8 3.62 ...
..$ model_trp_holt_mult: num [1:48] 0 0 0 0 0 0 0 0 0 0 ...
..$ model_tune_ses1 : num [1:48] 0 0 0 0 0 0 0 0 0 0 ...
..$ model_tune_ses2 : num [1:48] 0 0 0 0 0 0 0 0 0 0 ...
$ y:'data.frame': 48 obs. of 7 variables:
..$ date : Date[1:48], format: "2019-01-01" "2019-02-01" "2019-03-01" "2019-04-01" ...
..$ product : chr [1:48] "A" "A" "A" "A" ...
..$ subproduct : chr [1:48] "y" "y" "y" "y" ...
..$ actuals : num [1:48] 5.01 5.39 5.23 5.43 3.99 ...
..$ model_trp_holt_mult: num [1:48] 0 0 0 0 0 0 0 0 0 0 ...
..$ model_tune_ses1 : num [1:48] 0 0 0 0 0 0 0 0 0 0 ...
..$ model_tune_ses2 : num [1:48] 0 0 0 0 0 0 0 0 0 0 ...
data
export_df <- list(data.frame(col1 = 1:24, col2 = 25:48), data.frame(col1 = 1:24, col2 = 25:48), data.frame(col1 = 1:24, col2 = 25:48))
list_ts_ses <- list(ts(rnorm(24), frequency = 12, start = c(2021, 1)),ts(rnorm(24), frequency = 12, start = c(2021, 1)), ts(rnorm(24), frequency = 12, start = c(2021, 1)) )
list_ts_ses1 <- list(ts(rnorm(24), frequency = 12, start = c(2021, 1)),ts(rnorm(24), frequency = 12, start = c(2021, 1)), ts(rnorm(24), frequency = 12, start = c(2021, 1)) )
list_ts_ses2 <- list(ts(rnorm(24), frequency = 12, start = c(2021, 1)),ts(rnorm(24), frequency = 12, start = c(2021, 1)), ts(rnorm(24), frequency = 12, start = c(2021, 1)) )
I'm trying to convert a piped operator sequence into an R function or similar (ideally using Tidyverse).
The input is a tidy dataframe with the responses to 15 questions as variables and each of the 10 observations are one of 4 standard responses (e.g. agree, disagree, etc.)
The output should be a summary of responses with both a count and percentage of the distribution of responses/observations for each question/variable.
To avoid copying and pasting and to improve the code, I would like to wrap a function or similar to calculate the count and percentages in a loop, Purr map or similar to iterate over the 15 questions.
Thank you for your suggestions.
The below code works as expected and responds with a table of Question , Count and Percentage with values for Agree, etc. This is ultimately what I am trying to achieve on scale and elegantly.
DF %>%
select(question) %>%
group_by(question) %>%
summarise(Count = n()) %>%
mutate (Percentage = round(100 * Count / sum(Count),0))
Background
I start with a tidy dataframe:
*'data.frame': 10 obs. of 15 variables:
$ Question1 : Factor w/ 4 levels "Agree","Neither agree nor disagree",..: 1 1 1 1 1 1 1 1 1 1 ...*
The following gets me close, without the percentages:
DF_as_list <- DF %>%
map(summary)
by creating
*List of 15
$ Question1 : Named int [1:4] 10 0 0 0
..- attr(*, "names")= chr [1:4] "Agree" "Neither agree nor disagree" "Disagree" "Don't know"*
And the less helpful
> DF_from_list<- data.frame(matrix(unlist(DF_as_list),
> nrow=length(DF_as_list), byrow=T))
creates:
*'data.frame': 15 obs. of 4 variables:
$ X1: int 10 10 ...
$ X2: int 0 0 ...
$ X3: int 0 0 ...
$ X4: int 0 0 ...*
Finally,
DF_as_tibble <- as_tibble(DF_as_list)
produces a helpful summary tibble
*Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 4 obs. of 15 variables:
$ Question1 : int 10 0 0 0
$ Question2 : int 10 0 0 0*
and
DF_as_tibble %>%
map(summary)
produces useful summary statistics (min, median, mean, max, 1st and 3rd Qu) but not the percentage distribution of responses.
I have the following data structure: Meetings in Persons in Groups. The groups met differently often and the number of group members varied for every meeting.
$ GroupID : chr "1" "1" "1" "1" ...
$ groupnames : chr "A&M" "A&M" "A&M" "A&M" ...
$ MeetiID : chr "1" "1" "2" "2" ...
$ Date_Meetings : chr "43293" "43293" "43298" "43298" ...
$ PersonID : num 171 185 171 185 185 113 135 113 135 113 ...
$ v_165 : chr "3" "3" "4" "3" ...
$ v_166 : chr "2" "2" "3" "3" ...
$ v_167 : chr "2" "4" "4" "3" ...
$ v_168 : chr "6" "7" "4" "5" ...
$ problemtypes_categories: chr "Knowledgeproblem" "Knowledgeproblem" "Motivationalproblem" "Coordinationproblem" ...
$ v_165_dicho : num 0 0 0 0 1 1 1 0 0 1 ...
$ v_166_dicho : num 0 0 0 0 0 0 0 0 0 0 ...
$ v_167_dicho : num 0 0 0 0 1 1 0 0 0 0 ...
Now I have to create a new variable that should be binary (0/1) with the name agreement_levels. So, every time, a person in one group has - regarding the same learning meeting - a same problem type category than the other learner(s) of the same group at the same meeting, both learners (or three or four, depending on the group size for a respective meeting) should get the value 1 at the agreement variable, else they should all get 0. Whenever a person (e.g., among four learners) already has a different category of problem than the others, there is a 0 on the agreement variable for all.
If only 1 person is in the data set for one and the same meeting, there must be a NA at agree. When one person has NA at the problemtype variable, however, and there are 2 people in the data set for the same meeting, both get 0 at agree; but if there are 4 people for the same meeting in the data set and one of them has NA at problemtype, then only this person but not the others get NA at agree.
I did already write a command, but it is not working yet and still does not consider the NAs:
GroupID1 <- df$GroupID[1:nrow,]
TreffID1 <- df$TreffID[1:nrow,]
for(i in 1:(GroupID1 -1){
for(j in 1:(TreffID1 -1){
if(df[i, 3] == df[i+1, 3]-1){
if(df[i, 15] == df[i+1, 15]-1){
df[c(i, i+1), 28] <- 1,
df[c(i, i+1), 28] <- 0
Many thanks in advance.
dput(head(df))
structure(list(GroupID = c("1", "1", "1", "1", "1", "2"), TreffID = c("1", "1",
"2", "2", "3", "1"), PersonID = c(171, 185, 171, 185,
185, 113), problemtypen_oberkategorien = c("Verständnisprobleme",
"Verständnisprobleme", "Motivationsprobleme", "Motivationsprobleme",
"Motivationsprobleme", "Motivationsprobleme"), passung.exkl = c("0",
"0", "0", "0", "1", "1")), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
Instead of loops, I used R's dplyr. I'm not sure if I got all your logic correct, since there was a lot there. For example, you didn't specify what would happen for NA problemtype and 3 people. But here is a starting point that uses group_by, so you are looking within each set of rows with the same GroupID and TreffID, and then mutate and case_when, which assign values to a new column, according to criteria, and then functions like n() that count how many rows and n_distinct that count distinct rows so you if it is ==1 then we know they are all the same.
library(tidyverse)
df <- df %>%
group_by(GroupID, TreffID) %>%
mutate(agreement_levels = case_when(n() == 1 ~ -1,
is.na(problemtypen_oberkategorien) & n() == 2 ~ 0,
is.na(problemtypen_oberkategorien) & n() > 2 ~ -1,
n_distinct(problemtypen_oberkategorien, na.rm = FALSE) == 1 ~ 1,
n_distinct(problemtypen_oberkategorien, na.rm = FALSE) > 1 ~ 0,
TRUE ~ -1),
agreement_levels = na_if(agreement_levels, -1)) %>%
select(GroupID, TreffID, problemtypen_oberkategorien, agreement_levels, everything())
I googled my error, but that didn't helped me.
Got a data frame, with a column x.
unique(df$x)
The result is:
[1] "fc_social_media" "fc_banners" "fc_nat_search"
[4] "fc_direct" "fc_paid_search"
When I try this:
df <- spread(data = df, key = x, value = x, fill = "0")
I got the error:
Error in `[.data.frame`(data, setdiff(names(data), c(key_var, value_var))) :
undefined columns selected
But that is very weird, because I used the spread function (in the same script) different times.
So I googled, saw some "solutions":
I removed all the "special" characters. As you can see, my unique
values do not contain special characters (cleaned it). But this didn't
help.
I checked if there are any columns with the same name. But all column names
are unique.
#Gregor, #Akrun:
> str(df)
'data.frame': 100 obs. of 22 variables:
$ visitor_id : chr "321012312666671237877-461170125342559040419" "321012366667112237877-461121705342559040419" "321012366661271237877-461170534255901240419" "321012366612671237877-461170534212559040419" ...
$ visit_num : chr "1" "1" "1" "1" ...
$ ref_domain : chr "l.facebook.com" "X.co.uk" "x.co.uk" "" ...
$ x : chr "fc_social_media" "fc_social_media" "fc_social_media" "fc_social_media" ...
$ va_closer_channel : chr "Social Media" "Social Media" "Social Media" "Social Media" ...
$ row : int 1 2 3 4 5 6 7 8 9 10 ...
$ : chr "0" "0" "0" "0" ...
$ Hard Drive : chr "0" "0" "0" "0" ...
The error could be due to a column without a name i.e "". Using a reproducible example
library(tidyr)
spread(df, x, x)
Error in [.data.frame(data, setdiff(names(data), c(key_var,
value_var))) : undefined columns selected
We could make it work by changing the column name
names(df) <- make.names(names(df))
spread(df, x, x, fill = "0")
# X fc_banners fc_direct fc_nat_search fc_paid_search fc_social_media
#1 1 0 0 0 0 fc_social_media
#2 2 fc_banners 0 0 0 0
#3 3 0 0 fc_nat_search 0 0
#4 4 0 fc_direct 0 0 0
#5 5 0 0 0 fc_paid_search 0
data
df <- data.frame(x = c("fc_social_media", "fc_banners",
"fc_nat_search", "fc_direct", "fc_paid_search"), x1 = 1:5, stringsAsFactors = FALSE)
names(df)[2] <- ""