R replace_na values conditionally by column with multiple conditions - r

My question is similar to other replace_na posts, but I can't find the right combination of answers.
I have a dataframe with inflation rates for all countries over 8 years (wide format: countries as rows, years as columns).
Some countries have NAs for all 8 years (columns 3:10); in that case I want to replace all NAs with the column mean:
library(tidyverse)
sample %>%
mutate_if((rowSums(is.na[,3:10]))!=8, replace_na = colMeans(na.rm=T))
This is close but something is wrong.
Other countries only have NAs in some columns, in which case I want to replace NA with the previous year's value
library(zoo)
sample %>%
mutate_if((rowSums(is.na[,3:10]))!=8, replace_na = colMeans(na.rm=T)),
is.na[,4:10], na.locf(fromLast = TRUE)))
Tried using na.locf from the zoo package but can't get it right with the other conditions
The final condition is that, if the NA is in the first year (2007), I want to replace it with the 2007 column mean instead of the next year (2008 was the financial crisis so all the inflation rates went nuts).
mutate_if((rowSums(is.na[,3:10]))!=8, replace_na = colMeans(na.rm=T)),
is.na[,4:10], na.locf(fromLast = TRUE)),
is.na("2007"), replace = colMeans("2007", na.rm = TRUE))
But this is full of errors and I'm stuck trying to link all these conditions together; I'm pretty new to ifelse statements. I'm trying to find a dplyr solution as that's the syntax I'm most familiar with, but maybe it's easier in base R or data.table.
Running R 3.6.1.
sample <- structure(list(`Country Name` = c("Aruba", "Afghanistan", "Angola",
"Albania", "Andorra", "Arab World", "United Arab Emirates", "Argentina",
"Armenia", "American Samoa", "Antigua and Barbuda", "Australia"
), `Country Code` = c("ABW", "AFG", "AGO", "ALB", "AND", "ARB",
"ARE", "ARG", "ARM", "ASM", "ATG", "AUS"), `2007` = c(5.39162036843645,
8.68057078513406, 12.2514974459487, 2.93268248162318, NA, 4.74356585295154,
NA, NA, 4.40736089644519, NA, 1.41605259409743, 2.32761128891476
), `2008` = c(8.95722105296535, 26.4186641547444, 12.4758291326398,
3.36313757366391, NA, 11.2706652380848, 12.2504202448139, NA,
8.94995335353386, NA, 5.33380639820232, 4.35029854990047), `2009` = c(-2.13630037272305,
-6.81116108898995, 13.7302839288409, 2.23139683475865, NA, 2.92089711805365,
1.55980098148558, NA, 3.40676682683799, NA, -0.550159995508869,
1.77111716621252), `2010` = c(2.07773902027782, 2.1785375238942,
14.4696564932574, 3.61538461538463, NA, 3.91106195534027, 0.879216764156813,
NA, 8.17636138473956, NA, 3.3700254022015, 2.91834002677376),
`2011` = c(4.31633194082721, 11.8041858089129, 13.4824679218511,
3.44283593170005, NA, 4.75316388885632, 0.877346595685083,
NA, 7.6500080785929, NA, 3.45674967234599, 3.30385015608744
), `2012` = c(0.627927921638161, 6.44121280934118, 10.2779049218839,
2.03642235579081, NA, 4.61184432206646, 0.662268900269082,
NA, 2.55802007757907, NA, 3.37688044338879, 1.76278015613193
), `2013` = c(-2.37226328015073, 7.38577178397857, 8.77781429332619,
1.92544399507649, NA, 3.23423783752364, 1.10111836375706,
NA, 5.78966778544654, NA, 1.05949782356168, 2.44988864142539
), `2014` = c(0.421637771012246, 4.67399603536339, 7.28038730361125,
1.61304235314414, NA, 2.77261158414198, 2.34626865671643,
NA, 2.98130868933673, NA, 1.08944157435363, 2.48792270531403
)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-12L))

First compute a logical vector, all.na, with one component per row, which is TRUE if that row's numeric data is all NAs and FALSE otherwise. Then use na.aggregate to fill in the all-NA rows. Also use na.aggregate on 2007. Then convert to long form, apply na.locf0 by country, and convert back to wide form.
library(dplyr)
library(tidyr)
library(zoo)
all.na <- sample %>%
  select_if(is.numeric) %>%
  { rowSums(is.na(.)) == ncol(.) }

sample %>%
  mutate_at(-(1:3), ~ if_else(all.na, na.aggregate(.x), .x)) %>%
  mutate(`2007` = na.aggregate(`2007`)) %>%
  gather(key, value, -`Country Name`, -`Country Code`) %>%
  group_by(`Country Name`, `Country Code`) %>%
  mutate(value = na.locf0(value)) %>%
  ungroup %>%
  spread(key, value)
Or, using only zoo:
library(zoo)
all.na <- apply(is.na(sample[grep("^2", names(sample))]), 1, all)
ix <- -(1:3)
sample.out <- sample
Fill <- function(x) ifelse(all.na, na.aggregate(x), x)
sample.out[ix] <- lapply(sample[ix], Fill)
sample.out$"2007" <- na.aggregate(sample.out$"2007")
sample.out[ix] <- t(apply(sample.out[ix], 1, na.locf0))
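For reference, the two zoo helpers doing the heavy lifting above behave as follows: na.aggregate() fills NAs with an aggregate of the non-NA values (the mean by default), and na.locf0() carries the last observation forward while leaving leading NAs in place. A tiny sketch:
library(zoo)
na.aggregate(c(1, NA, 3))   # returns 1 2 3 (the NA is replaced by the mean, 2)
na.locf0(c(NA, 5, NA, 7))   # returns NA 5 5 7 (the leading NA is kept)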

I tried to do the same using only the tidyverse (dplyr and tidyr); this is what I got.
First I created the replacement values according to condition 1: the column means.
# Getting the Column Means to Replace according to Condition 1 and 3.
replacement <- sample %>%
  select_if(is.numeric) %>%
  summarize_all(~ mean(., na.rm = TRUE)) %>%
  # Transformed to a list since that is a requirement for tidyr::replace_na()
  as.list()
Then I combined everything into a single pipeline.
sample %>%
  pivot_longer(`2007`:`2014`, names_to = "year", values_to = "int_rate") %>%
  group_by(`Country Name`) %>%
  summarize(na_num = is.na(int_rate) %>% sum) %>%
  # Joining the number of NAs (na_num) back as a new column
  left_join(sample, by = "Country Name") %>%
  # Replacing a missing 2007 value (the first year). Condition 3.
  mutate(`2007` = if_else(between(na_num, 1, 7) &
                            is.na(`2007`), replacement[[1]], `2007`)) %>%
  # Reshaping back to long format
  pivot_longer(`2007`:`2014`, names_to = "year", values_to = "int_rate") %>%
  group_by(`Country Name`) %>%
  # Using fill() to impute NAs with the previous year's value. Condition 2.
  fill(int_rate) %>%
  pivot_wider(names_from = year, values_from = int_rate) %>%
  # Replacing values when all values are missing. Condition 1.
  replace_na(replace = replacement)
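Either way, a quick sanity check of the result is worth doing (a sketch, assuming the imputed data frame has been assigned to a hypothetical object called result, with dplyr loaded as above):
result %>%
  # every year column should report zero NAs after the imputation
  summarize_at(vars(`2007`:`2014`), ~ sum(is.na(.)))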

Related

Trying to calculate percentages after using dplyr::count

I am trying to calculate the percentages for cigarettes smoking status by sex (for example, the % of males/females who are Non-smokers, Occasional smokers, Prefer not to say, Regular smokers etc). The default seems to calculate the percentage from the Row Total and not the Column Total. Any help would be greatly appreciated.
Dataframe
structure(list(sex = c("Female", "Male", "Female", "Female"),
cigarettes_smoking_status = c("Non-smoker", "Non-smoker",
"Non-smoker", "Non-smoker")), row.names = c(NA, 4L), class = "data.frame")
Code
smoking_status_by_sex <- smoking_data %>%
  group_by(sex) %>%
  dplyr::count(cigarettes_smoking_status) %>%
  pivot_wider(names_from = sex, values_from = n) %>% # increase number of columns & reduce rows
  adorn_totals(c("row", "col"))

smoking_status_by_sex_per <- smoking_status_by_sex %>%
  mutate(female_pct = round((100 * .[[2]] / Total), digits = 2),
         male_pct = round((100 * .[[3]] / Total), digits = 2),
         prefer_not_to_say_pct = round((100 * .[[4]] / Total), digits = 2),
         unknown_pct = round((100 * .[[5]] / Total), digits = 2),
         total_pct = round((100 * .[[6]] / Total), digits = 2))
This is the table I am trying to replicate below
[What I am trying to replicate][1]
[1]: https://i.stack.imgur.com/hhDA4.png
I have tried using count, colSum, adorn_totals etc and then tried to use pivot_wider. Any help would be greatly appreciated.
It's easier to group_by sex and smoking status and then compute the relative frequencies. An example with the starwars data is given below; a sketch applied to the smoking data follows it.
library(tidyverse)

df <- starwars
df %>%
  group_by(eye_color, skin_color) %>% ## grouping by eye color and skin color
  summarise(count1 = n()) %>%
  mutate(grouppercentage = (count1 / sum(count1)) * 100)
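Applied to the question's data, the same pattern gives column percentages within each sex (a sketch, assuming the smoking_data frame and column names from the question):
library(dplyr)
library(tidyr)

smoking_data %>%
  group_by(sex, cigarettes_smoking_status) %>%
  summarise(n = n()) %>%
  # after summarise() the data is still grouped by sex, so sum(n) is the per-sex (column) total
  mutate(pct = round(100 * n / sum(n), 2)) %>%
  ungroup() %>%
  pivot_wider(names_from = sex, values_from = c(n, pct))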

R: Plotting a Graph with NA's

I am working with the R programming language.
I have the following data that contains 10 measurements for a set of people (and includes NA's):
my_data <- structure(list(id = 1:20, weight_time_1 = c(NA, NA, NA, NA, 99.4800556826432,
NA, NA, NA, NA, 92.7723003148797, NA, 102.130637355002, NA, NA,
96.4306038435274, 117.519167258681, NA, NA, NA, NA), weight_time_2 = c(NA,
NA, NA, 100.096037354425, 98.5573457978251, NA, 99.2565971422039,
NA, NA, 78.2178327860056, NA, 93.1290042175411, NA, 105.999332486733,
102.324404273109, 106.249390147503, NA, NA, NA, NA), weight_time_3 = c(NA,
NA, NA, 109.653641754063, 108.67612106402, 89.245436013972, 76.0388764710753,
NA, 121.434141230992, 93.5040344542738, NA, 106.261290772666,
NA, 107.27650959864, 99.9614325607138, 106.822602397336, NA,
NA, NA, NA), weight_time_4 = c(NA, NA, NA, 83.4057073444694,
100.0475658129, 101.181524203485, 109.854456857605, NA, 109.39925298469,
100.127289780991, NA, 92.3537705948637, NA, 97.484431731186,
93.1880798156964, 98.2949614096827, NA, NA, NA, NA), weight_time_5 = c(85.9705471396862,
NA, 101.810197281424, 125.878759238011, 90.5377892614597, 100.977860860978,
105.206211167738, 105.925495763829, 95.0038093722839, 91.7697262180746,
112.751436397665, 89.3570085447357, NA, 105.334871042565, 107.101908594036,
121.466895783898, NA, NA, NA, NA), weight_time_6 = c(91.3939219450539,
NA, 102.295063295212, 112.648885364836, 92.858993235862, 84.9768973349691,
106.268407819189, 91.2142736262532, 94.5206092516322, 106.102317632812,
106.800383289515, 96.8243417950671, 112.526148273022, 96.0060934996047,
108.127666530717, 100.80395850135, NA, NA, NA, 97.1665601525516
), weight_time_7 = c(78.1538622765699, NA, 98.3267913598314,
97.694334342899, 88.2573884491152, 94.0391463446378, 79.107127345042,
98.6717305266368, 87.4584802875, 91.0212929680695, 115.449312672637,
108.505222479846, 87.7272780928247, 98.2950591116351, 108.64305435295,
100.971252881422, NA, NA, NA, 89.7627845887151), weight_time_8 = c(88.9847618154833,
NA, 75.9578295182105, 123.066624773516, 103.899907028919, 86.3922722708996,
101.056470605625, 93.9274704914096, 116.225266396545, 119.261812971557,
120.470004522712, 95.1540411812936, 103.625912955529, 119.112226243372,
97.2548085647629, 93.4809837458108, NA, 107.551887082473, 103.626395948971,
92.497583506856), weight_time_9 = c(106.965867937613, NA, 111.885847224286,
95.4347167550049, 89.629232996398, 99.279432759281, 111.111236025807,
106.187409603617, 95.0731389891664, 102.40946902701, 98.7215766413794,
108.440350789909, 111.841323303161, 98.6631240530225, 108.178201457868,
102.289607726024, 108.679229829576, 93.9424920702776, 102.660681952024,
90.7932196785015), weight_time_10 = c(98.5452360068031, 100.417384196154,
94.4492002344181, 100.711643341273, 119.565187908911, 103.54455492062,
74.0330331656656, 103.431332886172, 112.355083085616, 100.345180859457,
97.3988962137931, 96.9401740645521, 116.008033135044, 106.302406861972,
96.7028852299552, 111.699115637383, 95.3519501717543, 89.9061904342833,
107.36861168758, 102.797106848808)), row.names = c(NA, 20L), class = "data.frame")
I would like to make a "longitudinal" graph for this data. I tried to do this two different ways:
Option 1: https://cran.r-project.org/web/packages/lcsm/vignettes/v0-longitudinal-plots.html
library(lcsm)
library(ggplot2)
library(tidyr)
library(dplyr)
library(stringr)
x_var_list <- c("weight_time_1", "weight_time_2", "weight_time_3", "weight_time_4", "weight_time_5", "weight_time_6", "weight_time_7", "weight_time_8", "weight_time_9", "weight_time_10")
plot_trajectories(data = my_data,
                  id_var = "id",
                  var_list = x_var_list,
                  xlab = "Time", ylab = "Value",
                  connect_missing = FALSE,
                  random_sample_frac = 1,
                  title_n = TRUE)
This seemed to work, but it produces warning messages stating that NAs were not plotted:
Warning messages:
1: Removed 64 row(s) containing missing values (geom_path).
2: Removed 64 rows containing missing values (geom_point).
Option 2: https://www.r-bloggers.com/2015/08/managing-longitudinal-data-conversion-between-the-wide-and-the-long/#google_vignette
dat <- reshape(my_data, varying= c("weight_time_1", "weight_time_2", "weight_time_3", "weight_time_4", "weight_time_5", "weight_time_6", "weight_time_7", "weight_time_8", "weight_time_9", "weight_time_10"), idvar="id", direction="long")
library(ggplot2)
ggplot(dat, aes(x=time, y=measure, colour=tx, group=id)) + geom_line(alpha=.5)
But this returns the following error: Error in guess(varying) :
failed to guess time-varying variables from their names
Can someone please show me how to fix this and plot this data? I would like the NA's to appear on the graph.
Thanks!
NAs cannot be represented as data points. However, they can be made indirectly visible by plotting each participant separately, so that the presence of NAs is obvious.
library(tidyverse)
my_data <- as_tibble(my_data)
my_data <- my_data %>%
  pivot_longer(-id, names_to = "tp", values_to = "measure") %>%
  mutate(
    tp = parse_number(tp),
    tp = factor(tp),
    id = factor(id)
  )

my_data %>%
  ggplot(aes(tp, measure, col = id, group = id)) +
  geom_point() +
  geom_smooth(method = "loess", se = F) +
  theme(legend.position = "none") +
  facet_wrap(~id)
NAs are, by definition, not data and therefore cannot be represented graphically. Strictly speaking, even the lines between the points are questionable, because we do not know the data between the measured points. Technically correct would be to show only the existing data as points. Within the known data range one can connect the points with a smoothing line; more complex modelling is needed for the areas outside the known data range.
In short, it would be misleading for NAs to appear in the graph.
Your option 1 is probably fine; it's just warning you that it is impossible to plot an NA. Here is the ggplot2 version; you need to make the wide data long first.
my_data_long <- my_data %>%
  tidyr::pivot_longer(-id, names_to = "time", values_to = "Value") %>%
  drop_na() %>%
  mutate(id = factor(id))

ggplot(my_data_long, aes(x = time, y = Value, color = factor(id))) +
  geom_point() +
  geom_line(aes(group = id)) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = -90))
You could also use dygraphs, which is quite straightforward for this use case and handles NAs nicely:
library(dygraphs)
dygraph(my_data) %>% dyLegend(show = "always")
Further formatting options can be found here
Update: apparently the OP wanted to explicitly plot missing values; see further below for one approach.
There are plenty of options to deal with NAs when plotting with ggplot2:
Just leave them and accept the warning (there is really nothing wrong with that).
Drop NAs before plotting; see JeffV's answer using tidyr::drop_na, but there are many ways, see this ultra-popular thread.
In your case, you can drop NAs while pivoting - use tidyr::pivot_longer(..., values_drop_na = TRUE) (a short sketch of this follows the next code block).
Add na.rm = TRUE to the geom of interest:
library(ggplot2)
library(dplyr)
library(tidyr)

my_data %>%
  pivot_longer(cols = starts_with("weight")) %>%
  # your x is essentially continuous. Thus make it REALLY continuous!
  # your id is categorical, so make it that
  # (note the anchored regex so that "weight_time_10" becomes 10, not 0)
  mutate(time = as.integer(gsub(".*_([0-9]+)$", "\\1", name)),
         id = as.character(id)) %>%
  ggplot(aes(x = time, y = value, colour = id, group = id)) +
  geom_line(alpha = .5, na.rm = TRUE)
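And a minimal sketch of the values_drop_na option mentioned above, dropping the NAs during the reshape itself (same data, same time extraction):
my_data %>%
  pivot_longer(cols = starts_with("weight"), values_drop_na = TRUE) %>%
  mutate(time = as.integer(gsub(".*_([0-9]+)$", "\\1", name)),
         id = as.character(id)) %>%
  ggplot(aes(x = time, y = value, colour = id, group = id)) +
  geom_line(alpha = .5)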
Visualising NAs in a line plot
It is a whole new problem, "how to visualise NAs". There is the {naniar} package, which helps with visualising NAs, but to my knowledge not "within" a line plot. One way to do that would be to first interpolate or impute NAs based on the present data. This is not the place to discuss the best way to do this, but here is a quick way using the zoo package.
my_data_long <- my_data %>%
  pivot_longer(cols = starts_with("weight")) %>%
  mutate(time = as.integer(gsub(".*_([0-9]+)$", "\\1", name)),
         id = factor(id, levels = 1:max(id))) %>%
  group_by(id) %>%
  ## interpolate NAs with the zoo package
  mutate(na_ip = zoo::na.approx(value, time, na.rm = FALSE))

## store your NAs in a different frame
my_nas <- my_data_long %>% filter(is.na(value))

ggplot(my_data_long, aes(x = time, y = value, colour = id, group = id)) +
  ## e.g., use the interpolated values for dashed lines
  geom_line(data = my_nas, aes(y = na_ip), lty = 2) +
  geom_line(alpha = .5, na.rm = TRUE) +
  ## because this is otherwise a complete visual disaster, I'm untangling with facet
  facet_wrap(~id) +
  theme(legend.position = "none")
#> Warning: Removed 9 row(s) containing missing values (geom_path).
#> geom_path: Each group consists of only one observation. Do you need to adjust
#> the group aesthetic?
#> geom_path: Each group consists of only one observation. Do you need to adjust
#> the group aesthetic?
#> geom_path: Each group consists of only one observation. Do you need to adjust
#> the group aesthetic?
#> geom_path: Each group consists of only one observation. Do you need to adjust
#> the group aesthetic?

Error in env_bind_lazy(private$bindings, !!!set_names(promises, names_bindings)) : attempt to use zero-length variable name

I am trying to filter every column of my dataframe with a certain threshold (in this case >= 1.2) with dplyr's filter function. It worked nicely so far, but suddenly I get this error message when I try to run the code:
Error in env_bind_lazy(private$bindings, !!!set_names(promises, names_bindings)) :
attempt to use zero-length variable name
This is part of my dataframe (it has 108 columns, some rows contain NA):
Mean 1   Mean 2   Mean 3
1.1874   1.0944   1.2376
1.2258   1.0665   1.2365
1.0953   1.1420   1.2479
1.2234   1.0949   1.0608
NA       NA       1.146
This is my code:
Heights_filtered = list()
for (i in 1:length(allHeights)) {
  filtered = filter(allHeights, allHeights[, i] >= 1.2, .preserve = TRUE)
  filterlist = cbind.data.frame(filtered[, i])
  colnames(filterlist) = colnames(allHeights[i])
  Heights_filtered[[i]] = cbind.data.frame(filterlist)
  names(Heights_filtered) = colnames(allHeights[i])
}
Do you have an idea why this happens now?
Thanks for your help!
These are the first rows of my dataframe
> dput(head(allHeights[1:10], 10))
structure(list(Mean1 = c(1.18743006611931, 1.22582285838843,
1.09595291724188, 1.22341059362058, 1.32431882583739, 1.31219937513623,
1.28004068880331, 1.29884472862021, 1.36733270362566, 1.38170457022452
), Mean2 = c(1.09447069039104, 1.09233667417252, 1.08767127319823,
1.06656658866469, 1.14203717603426, 1.09491221098798, 1.03171589621323,
1.15308990831089, 1.17585765375955, 1.11962264706315), Mean3 = c(1.23761700966768,
1.07486913672867, 1.2605330014152, 1.21512728264762, 1.23659397432181,
1.17488789237668, 1.28191444014391, 1.23137649405787, 1.22165765827209,
1.17481969002029), Mean4 = c(1.0608309164187, 1.06201740178538,
1.07512524012204, 1.07230027496328, 1.07823270179668, 1.08137782967343,
1.08704659309202, 1.09783795999849, 1.05538815021281, 1.04118799201477
), Mean5 = c(1.3872325431161, 1.34236438736957, 1.11657498580741,
1.19758040835503, 1.19718888867138, 1.12759626490222, 1.13074799835562,
1.19262768435683, 1.16498639469099, 1.2131433157802), Mean6 = c(1.18440664423239,
1.20342967777624, 1.21238802071329, 1.12420289186988, 1.22123880207133,
1.19712964243458, 1.20605725349191, 1.23989305305859, 1.21075923108837,
1.24834431998033), Mean7 = c(1.13543425248546, 1.12286625398612,
1.09469483808257, 1.10461963472656, 1.11445916679456, 1.08465067103221,
1.12117801538173, 1.08284306202145, 1.11304377483331, 1.13541719957027
), Mean8 = c(1.24793883159642, 1.19395390601616, 1.18592691355337,
1.19717830807325, 1.191232891622, 1.19336888792142, 1.17576392479116,
1.13564256754918, 1.11424178933907, 1.18585888819352), Mean9 = c(1.20505670697375,
1.18604713515832, 1.19024318309784, 1.21607636002896, 1.30812129661903,
1.24325012735609, 1.19658417567097, 1.27798482451672, 1.04137061962088,
1.30975681690216), Mean10 = c(1.06327665140615, 1.13939757285081,
1.12462757067074, 1.06967153549887, 1.08647627352663, 1.16336022091418,
1.15385873119686, 1.1672116851973, 1.22303975001817, 1.13392922026016
)), row.names = c(NA, 10L), class = "data.frame")
And here is the last part of the dataframe, which gives me the error:
dput(head(allHeights[100:108], 10))
structure(list(c(1.3975238170743, 1.42479618398277, 1.36302374440084,
1.33075672890157, 1.30214981303101, 1.29526565452359, 1.31860044132609,
1.23876534400972, 1.15907559361002, 1.26664552529697), c(2.22279564798051,
2.15443577725511, 2.36887256975583, 2.04737812822552, 2.21183099544832,
2.08881706966277, NA, NA, NA, NA), c(1.03731717809005, 1.07517206767995,
1.10263120160597, 1.17071264697448, 1.12660596501291, 1.07340120447376,
1.05339833667909, 1.02742328649269, 1.04743332377402, 1.09359764840837
), c(1.75325898322414, 1.80777043843246, 1.26273660420002, 1.59312822030592,
1.11652967053664, 1.62459472912435, 1.28563356786353, 1.95060067533935,
NA, NA), c(1.34261413268355, 1.30548480529631, 1.32490460208726,
1.05392855500896, 1.36887499425314, 1.12776424072456, 1.24322559882304,
1.24394280722725, 1.51098340306193, 1.35122063353409), c(1.30861179458687,
1.30802444638463, 1.32818477656957, 1.2115882212874, 1.27803793951901,
1.34488451464402, 1.2494642431939, 1.14564647987936, 1.13223271688229,
1.21111199301532), c(1.19828142850047, 1.2299458600308, 1.18492028013709,
1.24207768340535, 1.14210500173844, 1.14374410172354, 1.17129836586698,
1.20543386479909, 1.17938210897531, 1.1315377738042), c(1.06870742201506,
1.19744233297478, 1.14709573323772, 1.21291980399187, 1.19923509023545,
1.1095972272021, 1.1777817616828, 1.13757918011235, 1.18910601171268,
1.18139715549181), c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), .Names = c("Mean100",
"Mean101", "Mean102", "Mean103", "Mean104", "Mean105", "Mean106",
"Mean107", NA), row.names = c(NA, 10L), class = "data.frame")
This solution might work and is more in line with a dplyr approach. The code below uses mtcars as an example. It will keep rows where all values are greater than or equal to one.
library(dplyr)
mtcars %>%
  filter(
    across(
      .cols = everything(),
      .fns = ~ .x >= 1
    )
  )
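In more recent dplyr releases (1.0.4 and later), filtering with across() is superseded and is usually written with if_all() instead; an equivalent sketch:
# same filter with if_all(), assuming dplyr >= 1.0.4
mtcars %>%
  filter(if_all(everything(), ~ .x >= 1))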
Edit: depending on whether you want to handle missing data, this should help. You can specify the function you want to use to filter and then apply it to across() inside filter(). Here is another example with mtcars.
myrowfun <- function(x) { is.na(x) | x >= 2 }

mtcars <- mutate(mtcars, mpg = NA)
mtcars$mpg[1:7] <- rep(1, 7)

mtcars %>%
  filter(myrowfun(
    across(
      .cols = everything(),
      .fns = ~ .x
    )
  ))
Edit: adding an example of how to remove columns that are entirely missing.
mtcars <- mutate(mtcars, newcol = NA)
# shows which columns are not all missing
sapply(mtcars, function(x) all(!is.na(x)))
# subset on that
mtcars2 <- mtcars[, sapply(mtcars, function(x) all(!is.na(x)))]

How to convert data with different levels of information into wide format? [duplicate]

This question already has an answer here:
Reshaping data.frame with a by-group where id variable repeats [duplicate]
(1 answer)
Closed 2 years ago.
I have data on patients' operations/procedures (example data shown below), where one row describes a patient's procedure. There are 2 levels of information,
the first being the operation details, i.e. op_start_dt, priority_operation and asa_status
the second being the procedure details, i.e. proc_desc and proc_table
An operation can have more than 1 procedures. In the example below, patient A has 2 operations (defined by distinct op_start_dt). In his first operation, he had 1 procedure (defined by distinct proc_desc) and in his second, he had 2 procedures.
I would like to convert the data into a wide format, where a patient has only one row and his information is arranged operation by operation and, within each operation, procedure by procedure, as shown below. So proc_descxy refers to the proc_desc for the xth operation and yth procedure.
Data:
df <- structure(list(patient = c("A", "A", "A"), department = c("GYNAECOLOGY /OBSTETRICS DEPT",
"GYNAECOLOGY /OBSTETRICS DEPT", "GYNAECOLOGY /OBSTETRICS DEPT"
), op_start_dt = structure(c(1424853000, 1424870700, 1424870700
), class = c("POSIXct", "POSIXt"), tzone = "UTC"), priority_operation = c("Elective",
"Elective", "Elective"), asa_status = c(2, 3, 3), proc_desc = c("UTERUS, MALIGNANT CONDITION, EXTENDED HYSTERECTOMY WITH/WITHOUT LYMPHADENECTOMY",
"KIDNEY AND URETER, VARIOUS LESIONS, NEPHROURETERECTOMY, LAPAROSCOPIC",
"HEART, VARIOUS LESIONS, HEART TRANSPLANTATION"), proc_table = c("99",
"6A", "7C")), row.names = c(NA, 3L), class = "data.frame")
Desired output:
df <- structure(list(patient = "A", department = "GYNAECOLOGY /OBSTETRICS DEPT",
no_op = 2, op_start_dt1 = structure(1424853000, class = c("POSIXct",
"POSIXt"), tzone = "UTC"), no_proc1 = 1, priority_operation1 = "Elective",
asa_status1 = 2, proc_desc11 = "UTERUS, MALIGNANT CONDITION, EXTENDED HYSTERECTOMY WITH/WITHOUT LYMPHADENECTOMY",
proc_table11 = "99", op_start_dt2 = structure(1424870700, class = c("POSIXct",
"POSIXt"), tzone = "UTC"), no_of_proc2 = 2, priority_operation2 = "Elective",
asa_status2 = 3, proc_desc21 = "KIDNEY AND URETER, VARIOUS LESIONS, NEPHROURETERECTOMY, LAPAROSCOPIC",
proc_table21 = "6A", proc_desc22 = "HEART, VARIOUS LESIONS, HEART TRANSPLANTATION",
proc_table22 = "7C"), row.names = 1L, class = "data.frame")
My attempt:
I tried to work this out, but it gets confusing along the way, with pivot_longer then pivot_wider again.
df %>%
  # Operation-level information
  group_by(patient) %>%
  mutate(op_nth = dense_rank(op_start_dt),
         no_op = n_distinct(op_start_dt)) %>%
  # Procedure-level information
  group_by(patient, op_start_dt) %>%
  mutate(proc_nth = row_number(),
         no_proc = n_distinct(proc_desc)) %>%
  ungroup() %>%
  # Make pivoting easier
  mutate_all(as.character) %>%
  # Pivot procedure-level information
  pivot_longer(-c(patient, department, no_op, op_nth, proc_nth)) %>%
  # Remove the indices for "procedure" from operation-level information
  mutate(proc_nth = case_when(!(name %in% c("op_start_dt", "no_proc", "priority_operation", "asa_status")) ~ proc_nth)) %>%
  # Create the column names
  unite(name, c(name, op_nth, proc_nth), sep = "", na.rm = TRUE) %>%
  distinct() %>%
  pivot_wider(names_from = name, values_from = value)
Create a unique ID column for each patient and then use pivot_wider.
library(dplyr)
df %>%
  group_by(patient) %>%
  mutate(row = row_number()) %>%
  tidyr::pivot_wider(names_from = row, values_from = op_start_dt:proc_table)
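If the column names should follow the operation/procedure numbering described in the question, one possibility (a sketch, not taken from the linked duplicate) is to build both indices first and use them in names_from. Note that operation-level fields still get a two-digit suffix here (e.g. op_start_dt11 rather than op_start_dt1), so a final rename would be needed to match the desired names exactly:
library(dplyr)
library(tidyr)

df %>%
  group_by(patient) %>%
  mutate(op_nth = dense_rank(op_start_dt)) %>%   # which operation (by start time)
  group_by(patient, op_start_dt) %>%
  mutate(proc_nth = row_number()) %>%            # which procedure within that operation
  ungroup() %>%
  pivot_wider(id_cols = c(patient, department),
              names_from = c(op_nth, proc_nth),
              values_from = c(op_start_dt, priority_operation, asa_status, proc_desc, proc_table),
              names_sep = "")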

Splitting a dataframe column where new column values depend upon original data

I often work with dataframes that have columns with character string values that need to be separated. This results from a "select multiple" option in the data entry programme (which I cannot change unfortunately). I have tried tidyr::separate but that does not order the results properly. An example:
require(tidyr)
df = data.frame(
x = 1:3,
sick = c(NA, "malaria", "diarrhoea malaria"))
df <- df %>%
separate(sick, c("diarrhoea", "cough", "malaria"),
sep = " ", fill = "right", remove = FALSE)
But I want the result to look like this:
df2 = data.frame(
x = 1:3,
sick = c(NA, "malaria", "diarrhoea malaria"),
diarrhoea = c(NA, NA, "diarrhoea"),
cough = c(NA, NA, NA),
malaria = c(NA, "malaria", "malaria"))
Any help in the right direction would be much appreciated.
We can try with separate_rows and dcast
library(tidyr)
library(reshape2)
library(dplyr)
separate_rows(df, sick) %>%
  mutate(sick = factor(sick, levels = c("diarrhoea", "cough", "malaria")), sick1 = sick) %>%
  dcast(., x ~ sick, value.var = "sick1", drop = FALSE) %>%
  bind_cols(., df[2]) %>%
  select(x, sick, diarrhoea, cough, malaria)
#  x              sick diarrhoea cough malaria
#1 1              <NA>      <NA>  <NA>    <NA>
#2 2           malaria      <NA>  <NA> malaria
#3 3 diarrhoea malaria diarrhoea  <NA> malaria
Or another option is using cSplit from splitstackshape with dcast from data.table
library(splitstackshape)
dcast(cSplit(df, "sick", " ", "long")[, sick:= factor(sick, levels =
c("diarrhoea", "cough", "malaria"))], x~sick, value.var = "sick", drop = FALSE)[,
sick := df$sick][]
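For completeness, a different route (not one of the approaches above) is to test for each disease name directly; a minimal base-R sketch, assuming the df defined in the question:
diseases <- c("diarrhoea", "cough", "malaria")

df2 <- df
for (d in diseases) {
  # one column per disease: the disease name where it occurs in `sick`, otherwise NA
  df2[[d]] <- ifelse(grepl(d, df$sick), d, NA_character_)
}
df2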
