I am running into an error message when trying to create a table using tidyverse. The error message reads
"Factor Com.Race contains implicit NA, consider using
forcats::fct_explicit_na".
I am a noob when it comes to the tidyverse, so I haven't been able to try much.
# Simulated enrollment records: 150 students, all in the same major.
Major_A <- rep("Major A", times = 150)

# Fix the RNG state so the draws below are reproducible. The order of the
# sample() calls determines the generated values, so it must stay as is.
set.seed(1984)
gender <- sample(c("Female", "Male"), prob = c(.95, .05), size = 150, replace = TRUE)

# One "Y"/"N" indicator per race/ethnicity category.
race.asian <- sample(c("Y", "N"), prob = c(.01, .99), size = 150, replace = TRUE)
race.black <- sample(c("Y", "N"), prob = c(.1, .9), size = 150, replace = TRUE)
race.AmInd <- sample(c("Y", "N"), prob = c(.01, .99), size = 150, replace = TRUE)
race.hawa <- sample(c("Y", "N"), prob = c(.01, .99), size = 150, replace = TRUE)
race.hisp <- sample(c("Y", "N"), prob = c(.02, .98), size = 150, replace = TRUE)
race.white <- sample(c("Y", "N"), prob = c(.8, .2), size = 150, replace = TRUE)
# These weights sum to 0.99 rather than 1; sample() treats `prob` as weights,
# so they do not have to sum to one.
race.NotR <- sample(c("Y", "N"), prob = c(.01, .98), size = 150, replace = TRUE)

degree <- sample(c("BA", "MAT"), prob = c(.9, .1), size = 150, replace = TRUE)

# Column positions matter later: the race.* indicators occupy columns 3 to 9.
enroll <- data.frame(
  Major_A, gender,
  race.asian, race.black, race.AmInd, race.hawa, race.hisp, race.white,
  race.NotR,
  degree
)
# Count, for every row, how many of the columns startr:endr hold "Y".
#
# dat    : data frame containing the "Y"/"N" indicator columns.
# startr : position of the first indicator column.
# endr   : position of the last indicator column.
#
# Returns `dat` with an added numeric column `multi` holding the row-wise
# number of "Y" values.
multi.race_fun <- function(dat, startr, endr) {
  # drop = FALSE keeps a single-column selection (startr == endr) as a data
  # frame; without it the selection collapses to a vector and rowSums() fails.
  dat$multi <- rowSums(dat[, startr:endr, drop = FALSE] == "Y")
  dat
}
# Columns 3 to 9 of `enroll` are the seven race.* indicator columns.
enroll.multiR <- multi.race_fun(enroll,3,9)
# load comrace function
# Collapse the race.* indicator columns into one combined label per row.
#
# dat : data frame with columns race.hisp, race.black, race.AmInd, race.asian,
#       race.hawa, race.white (each "Y"/"N") and `multi` (number of "Y"s).
#
# Returns `dat` with an added column `Com.Race`.
com_race.fun <- function(dat) {
  one_only <- dat$multi == 1

  # The label is built from the lowest-priority rule upwards: every later
  # ifelse() overrides what the earlier ones assigned. The resulting
  # precedence is Hispanic first, then the single-race categories, then
  # "Two or More Races", and finally "Not Reported".
  label <- ifelse(dat$multi >= 2, "Two or More Races", "Not Reported")
  label <- ifelse(dat$race.white == "Y" & one_only, "Caucasian", label)
  label <- ifelse(dat$race.hawa == "Y" & one_only, "Hawaiian", label)
  label <- ifelse(dat$race.asian == "Y" & one_only, "Asian", label)
  label <- ifelse(dat$race.AmInd == "Y" & one_only, "Native Am", label)
  label <- ifelse(dat$race.black == "Y" & one_only, "African Am", label)
  label <- ifelse(dat$race.hisp == "Y", "Hispanic", label)

  dat$Com.Race <- label
  return(dat)
}
# run comrace function
# Add the combined race label to the data that already carries `multi`.
enroll.comR <- com_race.fun(enroll.multiR)
enroll.comR$gender <- factor(enroll.comR$gender, levels = c("Female", "Male"))

# The levels must match the labels produced by com_race.fun() exactly: factor()
# turns every value that is not listed in `levels` into NA, which is what
# triggers the "contains implicit NA" message in the grouped summary.
# "Hispanic" has to be spelled as com_race.fun() writes it, and "Native Am"
# has to be listed because com_race.fun() can produce it.
enroll.comR$Com.Race <- factor(
  enroll.comR$Com.Race,
  levels = c(
    "African Am", "Asian", "Caucasian", "Hawaiian", "Hispanic",
    "Native Am", "Two or More Races", "Not Reported"
  )
)
library(tidyverse)
# Count every Com.Race x gender combination (.drop = FALSE keeps combinations
# with no rows), express each count as a percentage of all rows, and reshape
# to one row per gender with "<Com.Race>_count" / "<Com.Race>_perc" columns.
gen_race.tbl <- enroll.comR %>%
  group_by(Com.Race, gender, .drop = FALSE) %>%
  summarise(count = n()) %>%
  ungroup() %>%
  mutate(perc = count / sum(count) * 100) %>%
  # Stack the two measure columns; Com.Race and gender stay as identifiers.
  gather(key, value, count, perc) %>%
  # Merge the race label and the measure name into a single column key.
  unite(Com.Race, Com.Race, key) %>%
  spread(Com.Race, value)
I would like the code to produce a table with counts and percentages for all levels of the gender and Com.Race variables.
I would suggest using gather() from tidyr to restructure your wide-format data right at the start; then you can summarise the counts/percentages for each level of the gender and ethnicity variables. Using reshape2::dcast() at the end will give your desired output, but spread() can also be used.
# toy data set
# 100 random rows: a gender column plus four "Y"/"N" ethnicity indicators.
# No seed is set, so the values differ from run to run.
df <- data.frame(
  gender = sample(c("M", "F"), size = 100, replace = TRUE, prob = c(0.9, 0.1)),
  ethn.a = sample(c("Y", "N"), size = 100, replace = TRUE, prob = c(0.8, 0.2)),
  ethn.b = sample(c("Y", "N"), size = 100, replace = TRUE, prob = c(0.7, 0.3)),
  ethn.c = sample(c("Y", "N"), size = 100, replace = TRUE, prob = c(0.25, 0.75)),
  ethn.d = sample(c("Y", "N"), size = 100, replace = TRUE, prob = c(0.95, 0.05))
)
# gather wide data, group by gender/ethnicity, summarise, reshape to wide format
df %>%
  # Long format: one row per (gender, indicator name k, indicator value v).
  gather(k, v, -gender) %>%
  group_by(gender, k, v) %>%
  summarise(n = n()) %>%
  # summarise() dropped the last grouping level (v), so sum(n) here is the
  # total per gender and indicator.
  mutate(
    perc = round((n / sum(n)) * 100, 2),
    cell = paste0(n, " (", sprintf("%.1f", perc), "%)")
  ) %>%
  select(-n, -perc) %>%
  filter(v == "Y") %>%
  # One column per indicator/gender pair holding the "n (pct%)" text.
  reshape2::dcast(v ~ k + gender, value.var = "cell")
v ethn.a_F ethn.a_M ethn.b_F ethn.b_M ethn.c_F ethn.c_M ethn.d_F ethn.d_M
1 Y 11 (84.6%) 69 (79.3%) 10 (76.9%) 66 (75.9%) 3 (23.1%) 28 (32.2%) 12 (92.3%) 87 (100.0%)
# using spread()
df %>%
  # Long format: one row per (gender, indicator name k, indicator value v).
  gather(k, v, -gender) %>%
  group_by(gender, k, v) %>%
  summarise(n = n()) %>%
  # Percentages are taken within each gender/indicator group.
  mutate(
    perc = round((n / sum(n)) * 100, 2),
    cell = paste0(n, " (", sprintf("%.1f", perc), "%)")
  ) %>%
  select(-n, -perc) %>%
  filter(v == "Y") %>%
  # One column per indicator; combinations without data are filled with 0.
  spread(k, cell, fill = 0)
# A tibble: 2 x 6
# Groups: gender [2]
gender v ethn.a ethn.b ethn.c ethn.d
<fct> <chr> <chr> <chr> <chr> <chr>
1 F Y 11 (84.6%) 10 (76.9%) 3 (23.1%) 12 (92.3%)
2 M Y 69 (79.3%) 66 (75.9%) 28 (32.2%) 87 (100.0%)
Related
This is the edited version of the question.
I need help to convert my wide data to long format data using the pivot_longer() function in R. The main problem is wanting to create long data with a variable nested in another variable.
For example, if I have wide data like this, where
variable fu1 and fu2 are variables for the follow-up (in days). There are two follow-up events (fu1 and fu2)
variables cpass and is are the results of two tests at each follow up
# Subject identifiers.
IDno <- c(1, 2)
Sex <- c("M", "F")

# First follow-up: day of the visit and the two test results.
fu1 <- c(13, 15)
cpass1 <- c(27, 85)
is1 <- c(201, 400)

# Second follow-up: day of the visit and the two test results.
fu2 <- c(20, 18)
cpass2 <- c(33, 90)
is2 <- c(220, 430)

# Wide layout: one row per subject, one column per follow-up/measure pair.
mydata <- data.frame(
  IDno, Sex,
  fu1, cpass1, is1,
  fu2, cpass2, is2
)
mydata
which looks like this
And now, I want to convert it to long format data, and it should look like this:
I have tried the codes below, but they do not produce the data frame in the format that I want:
#renaming variables
# Give every measure column a "<prefix>_<suffix>" name so that it can be split
# on "_" when pivoting.
mydata_wide <- rename(
  mydata,
  fu1_day = fu1,
  cp_one = cpass1,
  is_one = is1,
  fu2_day = fu2,
  cp_two = cpass2,
  is_two = is2
)
#pivoting
# Attempt that does NOT give the wanted layout (kept as posted).
mydata_wide %>%
# First pivot: "fu1_day"/"fu2_day" are split on "_" into `fu` ("fu1", "fu2")
# and a value column named by the second part ("day").
# NOTE(review): per the tidyr documentation `values_to` is ignored when
# `names_to` contains ".value" - confirm against the installed version.
pivot_longer(
cols = c(fu1_day, fu2_day),
names_to = c("fu", ".value"),
values_to = "day",
names_sep = "_") %>%
# Second pivot: "cp_one", "is_one", ... are split into `test` ("cp", "is") and
# value columns named by the suffix ("one", "two"). Nothing ties the suffix to
# the `fu` column created by the first pivot.
pivot_longer(
cols = c("cp_one", "is_one", "cp_two", "is_two"),
names_to = c("test", ".value"),
values_to = "value",
names_sep = "_")
The data frame, unfortunately, looks like this:
I have looked at some tutorials but have not found the best solution for this problem. Any help is very much appreciated.
library(tidyverse)
# The "nested" pivoting has to be done in two pivot_longer() calls.
mydata %>%
  # 1) follow-up columns -> (fu, day)
  pivot_longer(
    cols = c(fu1, fu2),
    names_to = "fu",
    values_to = "day"
  ) %>%
  # 2) test columns -> (test, value); this pairs every test with every
  #    follow-up of the same subject
  pivot_longer(
    cols = c(starts_with("cpass"), starts_with("is")),
    names_to = "test",
    values_to = "value"
  ) %>%
  # keep only rows where the follow-up number equals the test number, so
  # tests and follow-ups are not mixed
  filter(str_extract(fu, "\\d") == str_extract(test, "\\d")) %>%
  # strip the digits from the test names ("cpass1" -> "cpass")
  mutate(test = gsub("\\d", "", test))
Output:
# A tibble: 8 × 6
IDno Sex fu day test value
<dbl> <chr> <chr> <dbl> <chr> <dbl>
1 1 M fu1 13 cpass 27
2 1 M fu1 13 is 201
3 1 M fu2 20 cpass 33
4 1 M fu2 20 is 220
5 2 F fu1 15 cpass 85
6 2 F fu1 15 is 400
7 2 F fu2 18 cpass 90
8 2 F fu2 18 is 430
I'm not sure if your example is your real expected output, the first dataset and the output example that you describe do not show the same information.
I took inspiration from a similar post, "How to reshape Panel / Longitudinal survey data from wide to long format using pivot_longer", and from the solution provided by RobertoT, and put together the following code:
STEP 1: Generate wide data for simulation
# Subject identifiers.
IDno <- c(1, 2)
Sex <- c("M", "F")

# Measures are named "<follow-up>_<measure>" so they can be split on "_".
# Follow-up 1:
fu1_day <- c(13, 15)
fu1_cpass <- c(27, 85)
fu1_is <- c(201, 400)

# Follow-up 2:
fu2_day <- c(20, 18)
fu2_cpass <- c(33, 90)
fu2_is <- c(220, 430)

mydata_wide <- data.frame(
  IDno, Sex,
  fu1_day, fu1_cpass, fu1_is,
  fu2_day, fu2_cpass, fu2_is
)
mydata_wide
STEP 1: CONVERT TO LONG DATA (out1)
# Follow-up days in long form: one row per subject and follow-up.
out1 <- mydata_wide %>%
  select(IDno, contains("day")) %>%
  # Every column except the id is "<fu>_day": the part before "_" goes to
  # `fu`, and the part after it ("day") names the value column.
  pivot_longer(
    cols = -IDno,
    names_to = c("fu", ".value"),
    names_sep = "_"
  )
out1
STEP 2: CREATE ANOTHER LONG DATA AND JOIN WITH out1
# Test results in long form (one row per subject, follow-up and test), with
# the follow-up day from `out1` attached.
mydata_wide %>%
  select(-contains("day")) %>%
  # "<fu>_<test>" column names are split on "_" into `fu` and `test`; the
  # measurements go to the default `value` column.
  pivot_longer(
    cols = -c(IDno, Sex),
    names_to = c("fu", "test"),
    names_sep = "_"
  ) %>%
  # Explicit keys: match on subject and follow-up rather than on whatever
  # column names the two tables happen to share.
  left_join(out1, by = c("IDno", "fu"))
The result looks like this
I need to calculate the overall ontime percentage of each airline with this sample dataset.
library(tidyverse)
library(dplyr)
# Flight counts by airline and on-time status, one table per destination.
df_chi <- tribble(
  ~airline,   ~ontime, ~qty, ~dest,
  "delta",    TRUE,    527,  "CHI",
  "delta",    FALSE,   92,   "CHI",
  "american", TRUE,    4229, "CHI",
  "american", FALSE,   825,  "CHI"
)

df_nyc <- tribble(
  ~airline,   ~ontime, ~qty, ~dest,
  "delta",    TRUE,    1817, "NYC",
  "delta",    FALSE,   567,  "NYC",
  "american", TRUE,    1651, "NYC",
  "american", FALSE,   625,  "NYC"
)
I have a solution although it is verbose and I want to avoid the numbered index ie [2,2]. Is there a more elegant way using more of the tidyverse?
# Stack both destinations into one table.
df_all <- bind_rows(df_chi,df_nyc)
# Delta: total flights per ontime value (one row for FALSE, one for TRUE).
delta_ot <- df_all %>%
filter(airline == "delta") %>%
group_by(ontime) %>%
summarize(total = sum(qty))
# [2,2] is row 2, column `total`. This relies on the summary rows being ordered
# FALSE then TRUE, so that row 2 is the on-time total. The result is a 1x1
# table, not a plain number.
delta_ot <- delta_ot[2,2] / sum(delta_ot$total)
# American: same computation, repeated for the second airline.
american_ot <- df_all %>%
filter(airline == "american") %>%
group_by(ontime) %>%
summarize(total = sum(qty))
american_ot <- american_ot[2,2] / sum(american_ot$total)
As the ontime column is a logical column, use it to subset instead of [2, 2]. Also, instead of filtering once per airline, do the computation in a single pass by adding 'airline' as a grouping column.
library(dplyr)
bind_rows(df_chi, df_nyc) %>%
  group_by(airline, ontime) %>%
  # Totals per airline and on-time status; "drop_last" leaves the result
  # grouped by airline only.
  summarise(total = sum(qty), .groups = "drop_last") %>%
  # Within each airline, `ontime` is a logical vector, so total[ontime] picks
  # the on-time total; dividing by the airline total gives the on-time share.
  summarise(total = total[ontime] / sum(total))
-output
# A tibble: 2 × 2
airline total
<chr> <dbl>
1 american 0.802
2 delta 0.781
Subsetting by logical returns the corresponding value where there are TRUE elements
> c(1, 3, 5)[c(FALSE, TRUE, FALSE)]
[1] 3
I have a dataframe like this:
# 15 claim rows: provider 599 has 7 rows, provider 699 has 8, all in CA.
prov_id <- rep(c(599, 699), times = c(7, 8))
mbr_id <- c(
  100, 101, 102, 103, 103, 104, 105,
  200, 201, 201, 202, 203, 203, 204, 205
)
prov_state <- rep("CA", times = 15)
amount <- c(
  3, 5, 2, 28, 12, 17, 10,
  6, 33, 31, 161, 24, 22, 12, 17
)
df.sample <- data.frame(
  prov_id, mbr_id, prov_state, amount,
  stringsAsFactors = FALSE
)
I am trying to compute the averages of amounts by provider and state this way:
library(tidyverse)
# get the member counts for each provider by state
df.sample.memcnt <- df.sample %>%
  # distinct() keeps only the listed columns, so each member is counted once
  # per provider and state.
  distinct(prov_id, prov_state, mbr_id) %>%
  group_by(prov_id, prov_state) %>%
  tally(sort = TRUE) %>%
  ungroup() %>%
  rename(mem_cnt_pvdr = n)
# get the provider counts for each state
df.sample.pvdrcnt <- df.sample %>%
  # One row per provider/state pair, then count the providers in each state.
  distinct(prov_id, prov_state) %>%
  group_by(prov_state) %>%
  tally(sort = TRUE) %>%
  ungroup() %>%
  rename(pvdr_cnt_state = n)
# get the mean total amount of providers
df.sample.pvdr <- df.sample %>%
  group_by(prov_id, prov_state) %>%
  summarise(total_amt = sum(as.numeric(amount))) %>%
  ungroup() %>%
  # Attach the distinct-member count, then divide the provider's total amount
  # by it to get the mean amount per member.
  inner_join(df.sample.memcnt, by = c("prov_id", "prov_state")) %>%
  mutate(mean_total_amt_pvdr = round(total_amt / mem_cnt_pvdr, 2)) %>%
  select(-total_amt)
# get the mean total amount of the state
df.sample.state <- df.sample.pvdr %>%
  group_by(prov_state) %>%
  summarise(
    # Sum of the per-provider means; divided by the provider count below.
    total_amt_state = sum(as.numeric(mean_total_amt_pvdr)),
    mem_cnt_state = sum(mem_cnt_pvdr)
  ) %>%
  ungroup() %>%
  inner_join(df.sample.pvdrcnt, by = c("prov_state")) %>%
  mutate(mean_total_amt_state = round(total_amt_state / pvdr_cnt_state, 2)) %>%
  select(-total_amt_state)
# merge provider df with state df
# Attach the state-level figures to every provider row. The key is stated
# explicitly, like the other joins in this script, so the match does not
# depend on which column names the two tables happen to share.
df.final <- df.sample.pvdr %>%
  inner_join(df.sample.state, by = c("prov_state"))
While I get the output I need, I feel this is very inefficient.
Desired output:
prov_id prov_state mem_cnt_pvdr mean_total_amt_pvdr mem_cnt_state pvdr_cnt_state mean_total_amt_state
599 CA 6 12.8 12 2 31.9
699 CA 6 51 12 2 31.9
Is there a way to get the desired output with a few lines of code?
A bit shorter with data.table and uniqueN:
library(data.table)
# Convert df.sample to a data.table by reference.
setDT(df.sample)

# Per state and provider: number of distinct members and the total amount
# divided by that number.
df.sample.state.prov <- df.sample[
  ,
  {
    mem_cnt_pvdr <- uniqueN(mbr_id)
    mean_total_amt_pvdr <- round(sum(as.numeric(amount)) / mem_cnt_pvdr, 2)
    .(mem_cnt_pvdr, mean_total_amt_pvdr)
  },
  by = .(prov_state, prov_id)
]

# Per state: number of providers, sum of the provider means, member count.
df.sample.state <- df.sample.state.prov[
  ,
  .(
    pvdr_cnt_state = uniqueN(prov_id),
    total_amt_state = sum(as.numeric(mean_total_amt_pvdr)),
    mem_cnt_state = sum(mem_cnt_pvdr)
  ),
  by = .(prov_state)
]

# Join the state figures onto the provider rows and derive the state mean.
df.sample.state[
  df.sample.state.prov,
  .(
    prov_id,
    prov_state,
    mem_cnt_pvdr,
    mean_total_amt_pvdr,
    mem_cnt_state,
    pvdr_cnt_state,
    mean_total_amt_state = total_amt_state / pvdr_cnt_state
  ),
  on = .(prov_state)
]
prov_id prov_state mem_cnt_pvdr mean_total_amt_pvdr mem_cnt_state pvdr_cnt_state mean_total_amt_state
1: 599 CA 6 12.83 12 2 31.915
2: 699 CA 6 51.00 12 2 31.915
I am trying to filter the data to the rows that have a particular value (originally coded as 1), but the data frame is already labelled (the columns are factors), so the objective is to create a summary of the filtered dataset.
# Raw codes (1 or 2) for two columns, 14 rows each.
df <- data.frame(
  NY = c(1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 2, 1),
  DE = c(2, 1, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 2, 1)
)

# Replace the codes with labelled factors: 1 -> first label, 2 -> second.
df$NY <- factor(df$NY, levels = c(1, 2), labels = c("unavailable", "available"))
df$DE <- factor(df$DE, levels = c(1, 2), labels = c("rejected", "recieved"))
output is the frequency of "available" in both column
available/ total frequency in NY and DE for "recieved"
the output should be look like
Would output in this format be useful?
library(janitor)
library(tidyverse)
df %>%
  # Long format: `name` holds the original column name, `value` its label.
  pivot_longer(everything()) %>%
  # Cross-tabulate: one row per column name, one column per label.
  tabyl(name, value) %>%
  adorn_percentages() %>%
  adorn_pct_formatting(digits = 2)
#> name available unavailable
#> DE 50.00% 50.00%
#> NY 71.43% 28.57%
In case of revised scenario
df %>%
  pivot_longer(everything()) %>%
  # One row per label, one column per original column name.
  tabyl(value, name) %>%
  adorn_percentages("col") %>%
  # Keep only the labels of interest before totalling.
  filter(value %in% c("available", "recieved")) %>%
  adorn_totals("row") %>%
  adorn_pct_formatting(digits = 2) %>%
  # The totals row is the last one.
  tail(1)
value DE NY
Total 50.00% 71.43%
Here's a tidyverse approach to your problem which outputs the percentages as decimal:
library(tidyverse)
# Share of "available" in every column, as a proportion between 0 and 1.
df %>%
  summarise(across(everything(), ~ sum(.x == "available") / n()))
Output:
NY DE
1 0.7142857 0.5
You can try map_df() through each column.
# For every column: percentage of "available", rounded to a whole number and
# pasted with "%" (paste() inserts a space, e.g. "71 %").
df %>%
  map_df(~ paste(round(mean(.x == "available") * 100), "%"))
# # A tibble: 1 x 2
# NY DE
# <chr> <chr>
# 1 71 % 50 %
For different desired values, one approach is to create a named vector as below and pass it into a customised function. Note that the output is a character vector but you can change it as necessary.
# Label to look for in each column (names are the column names).
# The DE label is spelled "recieved" on purpose: it has to match the factor
# label used when df$DE was created, otherwise no row matches and the
# percentage comes out as 0%.
values <- c(
  NY = "available",
  DE = "recieved"
)
# Percentage of rows in each column that equal a wanted value.
#
# .data   : data frame (or list) holding the columns to inspect.
# .values : named character vector; each name is a column of `.data` and each
#           element is the value to look for in that column.
#
# Returns a named character vector such as c(NY = "29%"), rounded to whole
# percentages, with one element per entry of `.values`.
get_percent <- function(.data, .values) {
  vars <- names(.values)
  # Without names there is no way to tell which column each value refers to.
  stopifnot(!is.null(vars), all(nzchar(vars)))

  # vapply() guarantees one numeric result per entry. The value is taken by
  # position so that each entry is compared with its own value.
  pct <- vapply(
    seq_along(.values),
    function(i) round(mean(.data[[vars[i]]] == .values[[i]]) * 100),
    numeric(1)
  )

  pct <- paste0(pct, "%")
  names(pct) <- vars
  pct
}
# Apply the helper to the example data and print the named character vector.
res <- get_percent(df, values)
res
# NY DE
# "29%" "43%"
I'd like to make a table that looks like this
I have tibbles with each of the data points, but they're not combined.
library('dplyr')
library('ISLR')
# Load the Hitters data and drop rows that contain missing values.
data(Hitters)
Hitters <- na.omit(Hitters)

# Player count and average salary, once per grouping variable.
Q <- Hitters %>%
  group_by(League) %>%
  dplyr::summarize(count = n(), avg_wage = sum(Salary) / n())

A <- Hitters %>%
  group_by(Division) %>%
  dplyr::summarize(count = n(), avg_wage = sum(Salary) / n())

Z <- Hitters %>%
  group_by(NewLeague) %>%
  dplyr::summarize(count = n(), avg_wage = sum(Salary) / n())
My goal is to stack the tibbles above each other in one output with shared "count" and "avg_wage" columns. I tried bind_rows() and ftable(), without success.
The problem is that you can't combine rows with different column names so it ends up giving you a confusing dataframe. We can instead use gather() to create two new columns and get the proper table.
library(tidyverse)
library(ISLR)
# Load the Hitters data and drop rows that contain missing values.
data(Hitters)
Hitters <- na.omit(Hitters)

# One summary per grouping variable; each has a differently named first
# column (League, Division, NewLeague) plus `count` and `avg_wage`.
Q <- Hitters %>%
  group_by(League) %>%
  dplyr::summarize(count = n(), avg_wage = sum(Salary) / n())

A <- Hitters %>%
  group_by(Division) %>%
  dplyr::summarize(count = n(), avg_wage = sum(Salary) / n())

Z <- Hitters %>%
  group_by(NewLeague) %>%
  dplyr::summarize(count = n(), avg_wage = sum(Salary) / n())
# Stack the three summaries. Columns missing from a table are filled with NA,
# giving the column order League, count, avg_wage, Division, NewLeague.
bind_rows(Q, A, Z) %>%
  # Columns 1, 4 and 5 are the three grouping columns: fold them into a
  # (league_type, league_id) pair.
  gather("league_type", "league_id", c(1, 4, 5)) %>%
  # Each row has a value for only one of the grouping columns; drop the rest.
  filter(!is.na(league_id))
#> Warning: attributes are not identical across measure variables;
#> they will be dropped
#> # A tibble: 6 x 4
#> count avg_wage league_type league_id
#> <int> <dbl> <chr> <chr>
#> 1 139 542. League A
#> 2 124 529. League N
#> 3 129 624. Division E
#> 4 134 451. Division W
#> 5 141 537. NewLeague A
#> 6 122 535. NewLeague N
Created on 2019-01-21 by the reprex package (v0.2.1)
You can use spread() to get it back to wide format, although I would advise against that. The long version will probably be easier to work with.