Dplyr summarize with which.max and data with NA's

I am working with a data set of changes over time and need to calculate the time at which the peak change occurs. I am running into a problem because some subjects have missing data (NA's).
Example:
library(dplyr)
Data <- structure(list(Subject = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L), .Label = c("1", "10", "11", "12", "13", "14", "16",
"17", "18", "19", "2", "20", "21", "22", "23", "24", "25", "26",
"27", "28", "29", "3", "31", "32", "4", "5", "7", "8", "9"), class = "factor"),
Close = structure(c(1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L,
2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L,
1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L
), .Label = c("High Predictability", "Low Predictability"
), class = "factor"), SOA = structure(c(2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L), .Label = c("Long SOA", "Short SOA"), class = "factor"),
Time = c(-66.68, -66.68, -66.68, -66.68, -33.34, -33.34,
-33.34, -33.34, 0, 0, 0, 0, 33.34, 33.34, 33.34, 33.34, 66.68,
66.68, 66.68, 66.68, -66.68, -66.68, -66.68, -66.68, -33.34,
-33.34, -33.34, -33.34, 0, 0, 0, 0, 33.34, 33.34, 33.34,
33.34, 66.68, 66.68, 66.68, 66.68), Pcent_Chng = c(0.12314,
0.048254, -0.098007, 0.023216, 0.20327, 0.08338, -0.15157,
0.030008, 0.26442, 0.12019, -0.22878, 0.035547, 0.31849,
0.15488, -0.26887, 0.038992, 0.39489, 0.15112, -0.31185,
0.02144, NA, 0.046474, NA, 0.17541, NA, 0.14975, NA, 0.3555,
NA, -0.1736, NA, 0.72211, NA, -0.32201, NA, 1.0926, NA, -0.39551,
0.72211, 1.4406)), class = "data.frame", row.names = c(NA, -40L
), .Names = c("Subject", "Close", "SOA", "Time", "Pcent_Chng"
))
I get an error with the following attempt:
Data %>%
  group_by(Subject, Close, SOA) %>%
  summarize(Peak_Pcent = max(Pcent_Chng),
            Peak_Latency = Time[which.max(Pcent_Chng)])
The error is:
Error in summarise_impl(.data, dots) :
Column `Peak_Latency` must be length 1 (a summary value), not 0
This seems to be due to the NA's, which are only in some SOA conditions. Using complete.cases() with my actual data is too aggressive and removes too much data.
Is there a workaround to ignore the NA's?

You have one group where Pcent_Chng is entirely NA, and another group with only one non-NA Pcent_Chng value. I think it is better to filter out the groups where Pcent_Chng is all NA, and to set na.rm = TRUE when calling max().
Data %>%
  group_by(Subject, Close, SOA) %>%
  filter(!all(is.na(Pcent_Chng))) %>%                     # drop groups where Pcent_Chng is entirely NA
  summarize(Peak_Pcent = max(Pcent_Chng, na.rm = TRUE),   # ignore NAs within each group
            Peak_Latency = Time[which.max(Pcent_Chng)])
# # A tibble: 7 x 5
# # Groups: Subject, Close [?]
# Subject Close SOA Peak_Pcent Peak_Latency
# <fctr> <fctr> <fctr> <dbl> <dbl>
# 1 1 High Predictability Long SOA 0.154880 33.34
# 2 1 High Predictability Short SOA 0.394890 66.68
# 3 1 Low Predictability Long SOA 0.038992 33.34
# 4 1 Low Predictability Short SOA -0.098007 -66.68
# 5 14 High Predictability Long SOA 0.149750 -33.34
# 6 14 Low Predictability Long SOA 1.440600 66.68
# 7 14 Low Predictability Short SOA 0.722110 66.68
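Note that which.max() itself ignores NAs, so the Peak_Latency line works as soon as the all-NA groups are removed. For newer versions of dplyr, an alternative is slice_max(); a minimal sketch, assuming dplyr >= 1.0.0 (where slice_max() was introduced):
Data %>%
  group_by(Subject, Close, SOA) %>%
  filter(!all(is.na(Pcent_Chng))) %>%                   # drop groups that are entirely NA
  slice_max(Pcent_Chng, n = 1, with_ties = FALSE) %>%   # keep the row at the peak; NAs sort last
  select(Peak_Pcent = Pcent_Chng, Peak_Latency = Time)  # grouping columns are kept automatically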

This should do the trick:
Data %>%
  group_by(Subject, Close, SOA) %>%
  mutate(Peak_Pcent = max(Pcent_Chng)) %>%
  arrange(Subject, Close, SOA) %>%
  filter(Peak_Pcent == Pcent_Chng)
The output:
# A tibble: 6 x 6
# Groups: Subject, Close, SOA [6]
Subject Close SOA Time Pcent_Chng Peak_Pcent
<fctr> <fctr> <fctr> <dbl> <dbl> <dbl>
1 1 High Predictability Long SOA 33.34 0.154880 0.154880
2 1 High Predictability Short SOA 66.68 0.394890 0.394890
3 1 Low Predictability Long SOA 33.34 0.038992 0.038992
4 1 Low Predictability Short SOA -66.68 -0.098007 -0.098007
5 14 High Predictability Long SOA -33.34 0.149750 0.149750
6 14 Low Predictability Long SOA 66.68 1.440600 1.440600
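One caveat: with the example data this returns 6 rows rather than 7, because max() without na.rm = TRUE yields NA for any group containing an NA, and the final filter() then drops that whole group (here Subject 14, Low Predictability, Short SOA). A sketch of the same idea with NAs ignored:
Data %>%
  group_by(Subject, Close, SOA) %>%
  filter(!all(is.na(Pcent_Chng))) %>%                      # drop all-NA groups to avoid max() warnings
  mutate(Peak_Pcent = max(Pcent_Chng, na.rm = TRUE)) %>%   # ignore NAs within each group
  arrange(Subject, Close, SOA) %>%
  filter(Pcent_Chng == Peak_Pcent)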

Related

How to eliminate duplicate observations based on a condition and create a new column?

I have a dataframe with ~127000000 observations and 5 columns.
It looks like this:
structure(list(species = structure(c(1L, 2L, 3L, 1L, 3L, 5L,
4L, 5L, 6L, 6L, 7L, 8L), .Label = c("Acacia islana", "Acacia isoneura",
"Acacia iteaphylla", "Dianthus thunbergii", "Dianthus tlaratensis",
"Dianthus toletanus", "Dianthus transvaalensis", "Dianthus trifasciculatus"
), class = "factor"), stress = c(0.1355, -0.4237, -0.4237, NA,
NA, 0.1355, 0.3892, NA, -0.2328, NA, 0.26, -0.4043), location = structure(c(2L,
2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("ATB",
"MCS"), class = "factor"), establishment = structure(c(2L, 2L,
2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L), .Label = c("anthropogenic",
"natural"), class = "factor"), maturity = c(0.553891718, 0.335892308,
0.335892308, NA, NA, 0.553891718, 0.651435898, NA, 0.407958356,
NA, 0.602568113, 0.342996063)), class = "data.frame", row.names = c(NA,
-12L))
I would like to group by location, and within each one (there are 170) I have some species that appear as both anthropogenic and natural. When that occurs, I would like to eliminate the duplicate observation, but with a condition: keep the establishment as anthropogenic, while the other columns (stress and maturity) keep the values from the natural record (because those observations have no NA's). In the end, I need to create a column called "richness", which is the count of all anthropogenic and natural records in each location, with that number added to every observation.
The expected output would be something like that:
structure(list(species = structure(c(2L, 1L, 3L, 4L, 5L, 6L,
7L, 8L), .Label = c("Acacia islana", "Acacia isoneura", "Acacia iteaphylla",
"Dianthus thunbergii", "Dianthus tlaratensis", "Dianthus toletanus",
"Dianthus transvaalensis", "Dianthus trifasciculatus"), class = "factor"),
stress = c(-0.4237, 0.1355, -0.4237, 0.3892, 0.1355, -0.2328,
0.26, -0.4043), location = structure(c(2L, 2L, 2L, 1L, 1L,
1L, 1L, 1L), .Label = c("ATB", "MCS"), class = "factor"),
establishment = structure(c(2L, 1L, 1L, 2L, 1L, 1L, 2L, 2L
), .Label = c("anthropogenic", "natural"), class = "factor"),
maturity = c(0.335892308, 0.553891718, 0.335892308, 0.651435898,
0.553891718, 0.407958356, 0.602568113, 0.342996063), richness = c(3L,
3L, 3L, 5L, 5L, 5L, 5L, 5L)), class = "data.frame", row.names = c(NA,
-8L))
It looks like dplyr can do the job, but as a beginner I do not know how. I appreciate all your help.

You can change the value of establishment when duplicates are present in the data, keep only the rows with non-NA values, and finally add a new column with the number of rows for each location.
library(dplyr)
df %>%
  group_by(location, species) %>%
  mutate(establishment = if (n() > 1) 'anthropogenic' else establishment) %>%
  ungroup %>%
  filter(!is.na(stress)) %>%
  add_count(location, name = 'richness')
# species stress location establishment maturity richness
# <fct> <dbl> <fct> <chr> <dbl> <int>
#1 Acacia islana 0.136 MCS anthropogenic 0.554 3
#2 Acacia isoneura -0.424 MCS natural 0.336 3
#3 Acacia iteaphylla -0.424 MCS anthropogenic 0.336 3
#4 Dianthus tlaratensis 0.136 ATB anthropogenic 0.554 5
#5 Dianthus thunbergii 0.389 ATB natural 0.651 5
#6 Dianthus toletanus -0.233 ATB anthropogenic 0.408 5
#7 Dianthus transvaalensis 0.26 ATB natural 0.603 5
#8 Dianthus trifasciculatus -0.404 ATB natural 0.343 5
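If your installed version of dplyr does not support the name argument of add_count(), the same last step can be written as a grouped mutate(); a sketch of that variant:
df %>%
  group_by(location, species) %>%
  mutate(establishment = if (n() > 1) 'anthropogenic' else establishment) %>%
  ungroup() %>%
  filter(!is.na(stress)) %>%
  group_by(location) %>%
  mutate(richness = n()) %>%   # row count per location, equivalent to add_count(location, name = 'richness')
  ungroup()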

Calculate median by groups and create a categorical variable in R

Here is an example of my data:
dput(mydat)
structure(list(ID.group = c(NA, 10150591L, NA, 10150591L, NA,
10150591L, NA, 68837296L, NA, 68837296L, NA, 68837296L, NA, 124771228L,
NA, 124771228L), UserID = c(NA, 181078814L, NA, 88578209L, NA,
30240768L, NA, 334686951L, NA, 297170412L, NA, 265332359L, NA,
216632504L, NA, 5272133L), countlike = c(NA, 44L, NA, 50L, NA,
99L, NA, 1L, NA, 1L, NA, 15L, NA, 41L, NA, 20L), statistics.snt = structure(c(1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("",
"fb"), class = "factor"), statistics.created_at = structure(c(1L,
8L, 1L, 4L, 1L, 7L, 1L, 2L, 1L, 2L, 1L, 5L, 1L, 3L, 1L, 6L), .Label = c("",
"10.04.2020 9:14", "11.04.2020 0:01", "11.04.2020 19:22", "12.04.2020 19:45",
"12.04.2020 6:54", "13.04.2020 20:47", "17.04.2020 23:02"), class = "factor"),
statistics.updated_at = structure(c(1L, 8L, 1L, 7L, 1L, 6L,
1L, 3L, 1L, 3L, 1L, 4L, 1L, 5L, 1L, 2L), .Label = c("", "22.04.2020 12:27",
"22.04.2020 12:51", "22.04.2020 14:19", "22.04.2020 5:41",
"22.04.2020 6:18", "22.04.2020 7:37", "30.04.2020 16:55"), class = "factor"),
statistics.is_recount = structure(c(1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("", "False"
), class = "factor")), class = "data.frame", row.names = c(NA,
-16L))
I want to calculate the median of countlike by ID.group:
library(psych)
describeBy(mydat,mydat$ID.group)
but I didn't get the result I need; it returns all of the descriptive statistics.
How can I get results like this?
ID.group   median countlike
10150591   50
68837296   1
Then, how do I create a categorical variable for each UserID?
For example, the median for ID.group 10150591 is 50. If a UserID's countlike value is more than 25% above the group median, it should be "red": 25% of 50 is 50/100*25 = 12.5, so the threshold is 50 + 12.5 = 62.5. UserID 30240768 has a countlike value of 99, which is above 62.5, so it is "red".
If a UserID's value is more than 25% below the group median, it should be "green" (50 - 12.5 = 37.5; there is no such value in this group). Finally, if the value is within ±24% of the group median, it should be "orange": 24% of 50 is 12, so countlike values of 50 ± 12 (38-62) are "orange".
So the desired output would be:
ID.group   UserID     countlike  color
10150591   181078814  44         orange
10150591   88578209   50         orange
10150591   30240768   99         red
68837296   334686951  1          green
68837296   297170412  1          green
68837296   265332359  15         red
How do I implement these conditions?
Here is an answer using dplyr. We aggregate the data to medians, merge the medians with the original data, and then calculate color.
First, we read the dput() data from the OP and remove rows that are missing.
data <- structure(list(ID.group = c(NA, 10150591L, NA, 10150591L, NA,
10150591L, NA, 68837296L, NA, 68837296L, NA, 68837296L, NA, 124771228L,
NA, 124771228L), UserID = c(NA, 181078814L, NA, 88578209L, NA,
30240768L, NA, 334686951L, NA, 297170412L, NA, 265332359L, NA,
216632504L, NA, 5272133L), countlike = c(NA, 44L, NA, 50L, NA,
99L, NA, 1L, NA, 1L, NA, 15L, NA, 41L, NA, 20L), statistics.snt = structure(c(1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("",
"fb"), class = "factor"), statistics.created_at = structure(c(1L,
8L, 1L, 4L, 1L, 7L, 1L, 2L, 1L, 2L, 1L, 5L, 1L, 3L, 1L, 6L), .Label = c("",
"10.04.2020 9:14", "11.04.2020 0:01", "11.04.2020 19:22", "12.04.2020 19:45",
"12.04.2020 6:54", "13.04.2020 20:47", "17.04.2020 23:02"), class = "factor"),
statistics.updated_at = structure(c(1L, 8L, 1L, 7L, 1L, 6L,
1L, 3L, 1L, 3L, 1L, 4L, 1L, 5L, 1L, 2L), .Label = c("", "22.04.2020 12:27",
"22.04.2020 12:51", "22.04.2020 14:19", "22.04.2020 5:41",
"22.04.2020 6:18", "22.04.2020 7:37", "30.04.2020 16:55"), class = "factor"),
statistics.is_recount = structure(c(1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("", "False"
), class = "factor")), class = "data.frame", row.names = c(NA,
-16L))
data <- data[!is.na(data$ID.group),]
Next, we load dplyr and calculate the desired output.
library(dplyr)
data %>% group_by(ID.group) %>%
  summarise(mdn_countlike = median(countlike)) %>%
  inner_join(., data) %>%
  mutate(color = case_when(countlike > 1.25 * mdn_countlike ~ "red",
                           countlike < 0.75 * mdn_countlike ~ "green",
                           countlike >= 0.75 * mdn_countlike &
                             countlike <= 1.25 * mdn_countlike ~ "orange")) -> mergedData
mergedData[, c("ID.group", "UserID", "countlike", "mdn_countlike", "color")]
...and the output:
> mergedData[,c("ID.group","UserID","countlike","mdn_countlike","color")]
# A tibble: 8 x 5
ID.group UserID countlike mdn_countlike color
<int> <int> <int> <dbl> <chr>
1 10150591 181078814 44 50 orange
2 10150591 88578209 50 50 orange
3 10150591 30240768 99 50 red
4 68837296 334686951 1 1 orange
5 68837296 297170412 1 1 orange
6 68837296 265332359 15 1 red
7 124771228 216632504 41 30.5 red
8 124771228 5272133 20 30.5 green
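The join can also be avoided entirely with a grouped mutate(); a minimal sketch that produces the same columns from the cleaned data:
data %>%
  group_by(ID.group) %>%
  mutate(mdn_countlike = median(countlike)) %>%   # group median repeated on every row
  ungroup() %>%
  mutate(color = case_when(countlike > 1.25 * mdn_countlike ~ "red",
                           countlike < 0.75 * mdn_countlike ~ "green",
                           TRUE ~ "orange")) %>%
  select(ID.group, UserID, countlike, mdn_countlike, color)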

R - Convert List of Lists into single dataframe

So, I have created a list (and a single-column matrix) that contains 256 nested lists. What I would like to do is convert each of the 256 lists into a single dataframe of 16 columns and then write.table it. Although each list contains the same number of columns (16), the number of rows in each list varies. I have tried to use unlist, unsuccessfully, because of the changing row counts. I can subset each list individually, so I suspect there is an easier way to handle the whole list.
I'm pretty new to R, so I apologize for asking what may be a naive novice question. I searched through a lot of topics over the last couple of days and didn't see anything that matched my problem. A for loop seems like it might be unnecessary, and I wasn't sure if lapply was the correct route either.
UPDATE: dput of first list:
list(structure(list(structure(c(2L, 11L, 15L, 8L, 7L, 3L, 6L, 10L,
1L, 1L, 18L, 13L, 14L, 19L, 16L, 17L, 4L, 5L, 9L, 12L), .Label = c("",
"Aaron Rodgers", "Andrew Quarless", "Derrick Coleman", "Doug Baldwin",
"DuJuan Harris", "Eddie Lacy", "James Starks", "Jermaine Kearse",
"John Kuhn", "Jordy Nelson", "Luke Willson", "Marshawn Lynch", "Percy
Harvin", "Randall Cobb", "Ricardo Lockette", "Robert Turbin",
"Russell Wilson", "Zach Miller"), class = "factor"), Tm =
structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 4L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L), .Label = c("GNB", "Passing", "SEA", "Tm"),
class = "factor"), Cmp = structure(c(3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 5L, 4L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", "19",
"23", "Cmp", "Rushing"), class = "factor"), Att = structure(c(3L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 5L, 4L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = c("", "28", "33", "Att", "Receiving"
), class = "factor"), Yds = structure(c(2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, NA, 4L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), .Label = c("", "189", "191", "Yds"), class = "factor"),
TD = structure(c(2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, NA, 4L,
3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", "1",
"2", "TD"), class = "factor"), Int = structure(c(3L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, NA, 4L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("", "0", "1", "Int"), class = "factor"),
Lng = structure(c(2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, NA, 4L,
3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", "23",
"33", "Lng"), class = "factor"), Att = structure(c(1L, 1L,
1L, 7L, 3L, 1L, 2L, 2L, NA, 8L, 7L, 4L, 5L, 1L, 1L, 6L, 1L,
1L, 1L, 1L), .Label = c("", "1", "12", "20", "4", "6", "7",
"Att"), class = "factor"), Yds = structure(c(1L, 1L, 1L,
7L, 6L, 1L, 9L, 3L, NA, 10L, 5L, 2L, 8L, 1L, 1L, 4L, 1L,
1L, 1L, 1L), .Label = c("", "110", "2", "27", "29", "34",
"37", "41", "7", "Yds"), class = "factor"), TD = structure(c(1L,
1L, 1L, 2L, 2L, 1L, 2L, 3L, NA, 5L, 2L, 4L, 2L, 1L, 1L, 2L,
1L, 1L, 1L, 1L), .Label = c("", "0", "1", "2", "TD"), class = "factor"),
Lng = structure(c(1L, 1L, 1L, 2L, 4L, 1L, 8L, 6L, NA, 9L,
3L, 7L, 5L, 1L, 1L, 8L, 1L, 1L, 1L, 1L), .Label = c("", "12",
"13", "15", "16", "2", "21", "7", "Lng"), class = "factor"),
Rec = structure(c(1L, 7L, 5L, 3L, 4L, 4L, 1L, 1L, NA, 8L,
1L, 2L, 6L, 4L, 3L, 1L, 2L, 4L, 2L, 2L), .Label = c("", "1",
"2", "3", "6", "7", "9", "Rec"), class = "factor"), Yds = structure(c(1L,
12L, 9L, 3L, 3L, 6L, 1L, 1L, NA, 13L, 1L, 4L, 10L, 8L, 7L,
1L, 5L, 4L, 11L, 2L), .Label = c("", "1", "11", "14", "15",
"26", "38", "42", "58", "59", "8", "83", "Yds"), class = "factor"),
TD = structure(c(1L, 2L, 3L, 2L, 2L, 2L, 1L, 1L, NA, 4L,
1L, 2L, 2L, 2L, 3L, 1L, 3L, 2L, 2L, 2L), .Label = c("", "0",
"1", "TD"), class = "factor"), Lng = structure(c(1L, 7L,
9L, 3L, 4L, 8L, 1L, 1L, NA, 14L, 1L, 5L, 11L, 10L, 11L, 1L,
6L, 12L, 13L, 2L), .Label = c("", "1", "11", "12", "14",
"15", "16", "18", "23", "24", "33", "6", "8", "Lng"), class = "factor")), .Names = c("", "Tm", "Cmp", "Att", "Yds", "TD", "Int",
"Lng", "Att", "Yds", "TD", "Lng", "Rec", "Yds", "TD", "Lng"),
row.names = c(NA, -20L ), class = "data.frame"))
So, each observation in my list looks like the one above, and I want to convert all of the lists into that 16-column layout (now that I think about it, it's 17 columns; one is just unnamed), stack all of the rows together in one data frame, and then write.table it.
Let's call your list l where l[[1]] is what you have dput above.
Two easy ways, one from base R and one from data.table:
do.call("rbind", l)
data.table::rbindlist(l)
This assumes that the columns match in each list element. Your example doesn't confirm this, although you state it.
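If the columns do not line up exactly, rbindlist() can pad the gaps with fill = TRUE. Note also that the dput above shows repeated column names (Att, Yds, TD, Lng) and an unnamed first column, which stacking functions tend to dislike. A hedged sketch (the unwrap step is a guess based on the structure shown, and the "Player" column name and output file name are only illustrative):
# If each element of l is itself a one-element list wrapping a data.frame, unwrap it first
l <- lapply(l, function(x) if (is.data.frame(x)) x else x[[1]])
l <- lapply(l, function(d) {
  names(d)[names(d) == ""] <- "Player"   # placeholder name for the unnamed first column
  names(d) <- make.unique(names(d))      # Att, Yds, TD, Lng appear in several sections
  d
})
out <- data.table::rbindlist(l, fill = TRUE)  # fill = TRUE pads missing columns with NA
write.table(out, "all_games.txt", sep = "\t", row.names = FALSE)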

Indicator feature creation in R based on multiple columns

I have a dataset with 10 columns, and of those 10, 3 are of interest for creating a new indicator feature. The features are "pT", "pN", & "M", and they all take different values. Of all the values that these 3 features take, there are a total of 9 unique combinations that need to be captured in the new variable.
PATHOT PATHON PATHOM
1 pT2 pN1 M0
4 pT1 pN1 M0
13 pT3 pN1 M0
161 pT1 *pN2 M0
391 pT1 pN1 *M1
810 *pTIS pN1 M0
948 pT3 *pN2 M0
1043 pT2 pN1 *M1
1067 *pT4 pN1 M0
For example, the new variable will have value "1" when PATHOT=pT2, PATHON=pN1 & PATHOM=M0, and so on up to value 9. I have completed the task, but only after almost 20 lines of code with a vectorised operation for each unique combination.
diag3_bs$sfd[diag3_bs$pathot == "pT2" & diag3_bs$pathon == "pN1" &
             diag3_bs$pathom == "M0"] <- 1
diag3_bs$sfd[diag3_bs$pathot == "pT1" & diag3_bs$pathon == "pN1" &
             diag3_bs$pathom == "M0"] <- 2
diag3_bs$sfd[diag3_bs$pathot == "pT3" & diag3_bs$pathon == "pN1" &
             diag3_bs$pathom == "M0"] <- 3
# ... and so on, up to 9
Is there a better, more automated way of getting the same result?
dput() of the data frame is given below:
structure(list(F_STATUS = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = "Y", class = "factor"), EVENT_ID = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "BASELINE", class =
"factor"),
PAG_NAME = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), .Label = "BR2", class = "factor"), PTSIZE = c(3, 4,
2.7, 2, 0.9, 3, 3, 0.9, 3, 4.5), PTSIZE_U = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "CM", class = "factor"),
PT_SYM = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), .Label = c("", "-", "<", ">"), class = "factor"), PATHOT = structure(c(4L,
4L, 4L, 3L, 3L, 4L, 4L, 3L, 4L, 4L), .Label = c("*pT4", "*pTIS",
"pT1", "pT2", "pT3"), class = "factor"), PATHON = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("*pN2", "pN1"
), class = "factor"), PATHOM = structure(c(2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L), .Label = c("*M1", "M0"), class = "factor"),
RSUBJID = 901000:901009, RUSUBJID = structure(1:10, .Label = c(
"000301-000-901-251", "000301-000-901-252", "000301-000-901-253",
"000301-000-901-254", "000301-000-901-255", "000301-000-901-256",
"000301-000-901-257", "000301-000-901-258", "000301-000-901-259",
"000301-000-901-260", "000301-000-901-261", "000301-000-901-262")
, class = "factor")), .Names = c("F_STATUS", "EVENT_ID", "PAG_NAME", "PTSIZE", "PTSIZE_U", "PT_SYM", "PATHOT",
"PATHON", "PATHOM", "RSUBJID", "RUSUBJID"), row.names = c(NA, 10L),
class = "data.frame")
Thanks.
I tried to edit the data so it didn't throw an error on input. Also created a version of that tabulation of possible combinations:
stg_tbl <- structure(list(PATHOT = structure(c(4L, 3L, 5L, 3L, 3L, 2L, 5L,
4L, 1L), .Label = c("*pT4", "*pTIS", "pT1", "pT2", "pT3"), class = "factor"),
PATHON = structure(c(2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L), .Label = c("*pN2",
"pN1"), class = "factor"), PATHOM = structure(c(2L, 2L, 2L,
2L, 1L, 2L, 2L, 1L, 2L), .Label = c("*M1", "M0"), class = "factor")), .Names = c("PATHOT",
"PATHON", "PATHOM"), class = "data.frame", row.names = c("1",
"4", "13", "161", "391", "810", "948", "1043", "1067"))
Make a vector of text-equivalents of the categories:
stg_lbls <- with(stg_tbl, paste(PATHOT, PATHON, PATHOM, sep="_") )
Then the as.numeric values of a factor created using those levels will be the desired result:
dat$stg <- with(dat, factor( paste(PATHOT, PATHON, PATHOM, sep="_"), levels=stg_lbls))
as.numeric(dat$stg)
#[1] 1 1 1 2 2 1 1 2 1 1
You can just assign those values in the usual way:
dat$sfd <- as.numeric(dat$stg)
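If you prefer to skip the intermediate factor, match() on the pasted combinations gives the same numbering (combinations absent from stg_lbls come back as NA):
dat$sfd <- match(paste(dat$PATHOT, dat$PATHON, dat$PATHOM, sep = "_"), stg_lbls)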
I made some new data that should be useful for your problem.
k <- expand.grid(data.frame(a = letters[1:3], b = letters[4:6], c = letters[7:9]))
library(dplyr)
k %>% mutate(groups = paste0(a, b, c)) -> k2
k2$groups <- as.numeric(factor(k2$groups))
k2
It's crude, and you're not picking which combination gets which numbers, so it'd take some digging afterwards, but it's quick.
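A base R variant of the same trick is interaction(), which builds the combined factor without paste0(); as above, the numbering follows the order of the combined factor levels rather than an order you pick:
k2$groups <- as.integer(interaction(k2$a, k2$b, k2$c, drop = TRUE))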

Wrapping base R reshape for ease-of-use

It is a truth universally acknowledged that R's base reshape command is speedy and powerful but has miserable syntax. I have therefore written a quick wrapper around it which I will throw into the next release of the taRifx package. Before I did that, however, I want to solicit improvements.
Here's my version, with updates from @RichieCotton:
# reshapeasy: Version of reshape with way, way better syntax
# Written with the help of the StackOverflow R community
# x is a data.frame to be reshaped
# direction is "wide" or "long"
# vars are the names of the (stubs of) the variables to be reshaped (if omitted, defaults to everything not in id or vary)
# id are the names of the variables that identify unique observations
# vary is the variable that varies. Going to wide this variable will cease to exist. Going to long it will be created.
# omit is a vector of characters which are to be omitted if found at the end of variable names (e.g. price_1 becomes price in long)
# ... are options to be passed to stats::reshape
reshapeasy <- function(data, direction,
                       id = (sapply(data, is.factor) | sapply(data, is.character)),
                       vary = sapply(data, is.numeric),
                       omit = c("_", "."), vars = NULL, ...) {
  if (direction == "wide") {
    data <- stats::reshape(data = data, direction = direction, idvar = id, timevar = vary, ...)
  }
  if (direction == "long") {
    varying <- which(!(colnames(data) %in% id))
    data <- stats::reshape(data = data, direction = direction, idvar = id,
                           varying = varying, timevar = vary, ...)
  }
  colnames(data) <- gsub(paste("[", paste(omit, collapse = "", sep = ""), "]$", sep = ""),
                         "", colnames(data))
  return(data)
}
Note that you can move from wide to long without changing the options other than the direction. To me, this is the key to usability.
I'm happy to give acknowledgement in the function help files for any substantial improvements if you chat or e-mail me your info.
Improvements might fall in the following areas:
Naming the function and its arguments
Making it more general (currently it handles a fairly specific case, which I believe to be by far the most common, but it has not yet exhausted the capabilities of stats::reshape)
Code improvements
Examples
Sample data
x.wide <- structure(list(surveyNum = 1:6, pio_1 = structure(c(2L, 2L, 1L,
2L, 1L, 1L), .Names = c("1", "2", "3", "4", "5", "6"), .Label = c("1",
"2"), class = "factor"), pio_2 = structure(c(2L, 1L, 2L, 1L,
2L, 2L), .Names = c("1", "2", "3", "4", "5", "6"), .Label = c("1",
"2"), class = "factor"), pio_3 = structure(c(2L, 2L, 1L, 1L,
2L, 1L), .Names = c("1", "2", "3", "4", "5", "6"), .Label = c("1",
"2"), class = "factor"), caremgmt_1 = structure(c(2L, 1L, 1L,
2L, 1L, 2L), .Names = c("1", "2", "3", "4", "5", "6"), .Label = c("1",
"2"), class = "factor"), caremgmt_2 = structure(c(1L, 2L, 2L,
2L, 2L, 1L), .Names = c("1", "2", "3", "4", "5", "6"), .Label = c("1",
"2"), class = "factor"), caremgmt_3 = structure(c(1L, 2L, 1L,
2L, 1L, 1L), .Names = c("1", "2", "3", "4", "5", "6"), .Label = c("1",
"2"), class = "factor"), prev_1 = structure(c(1L, 2L, 2L, 1L,
1L, 2L), .Names = c("1", "2", "3", "4", "5", "6"), .Label = c("1",
"2"), class = "factor"), prev_2 = structure(c(2L, 2L, 1L, 2L,
1L, 1L), .Names = c("1", "2", "3", "4", "5", "6"), .Label = c("1",
"2"), class = "factor"), prev_3 = structure(c(2L, 1L, 2L, 2L,
1L, 1L), .Names = c("1", "2", "3", "4", "5", "6"), .Label = c("1",
"2"), class = "factor"), price_1 = structure(c(2L, 1L, 2L, 5L,
3L, 4L), .Names = c("1", "2", "3", "4", "5", "6"), .Label = c("1",
"2", "3", "4", "5", "6"), class = "factor"), price_2 = structure(c(6L,
5L, 5L, 4L, 4L, 2L), .Names = c("1", "2", "3", "4", "5", "6"), .Label = c("1",
"2", "3", "4", "5", "6"), class = "factor"), price_3 = structure(c(3L,
5L, 2L, 5L, 4L, 5L), .Names = c("1", "2", "3", "4", "5", "6"), .Label = c("1",
"2", "3", "4", "5", "6"), class = "factor")), .Names = c("surveyNum",
"pio_1", "pio_2", "pio_3", "caremgmt_1", "caremgmt_2", "caremgmt_3",
"prev_1", "prev_2", "prev_3", "price_1", "price_2", "price_3"
), idvars = "surveyNum", rdimnames = list(structure(list(surveyNum = 1:24), .Names = "surveyNum", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24"
), class = "data.frame"), structure(list(variable = structure(c(1L,
1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L), .Label = c("pio",
"caremgmt", "prev", "price"), class = "factor"), .id = c(1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L)), .Names = c("variable",
".id"), row.names = c("pio_1", "pio_2", "pio_3", "caremgmt_1",
"caremgmt_2", "caremgmt_3", "prev_1", "prev_2", "prev_3", "price_1",
"price_2", "price_3"), class = "data.frame")), row.names = c(NA,
6L), class = c("cast_df", "data.frame"))
x.long <- structure(list(.id = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), pio = structure(c(2L,
2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L,
1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 2L,
1L, 2L, 2L, 1L, 2L, 1L, 1L), .Label = c("1", "2"), class = "factor"),
caremgmt = structure(c(2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 1L,
2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 1L,
1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 1L,
1L, 2L, 2L), .Label = c("1", "2"), class = "factor"), prev = structure(c(1L,
2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 1L,
2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L,
2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 2L), .Label = c("1",
"2"), class = "factor"), price = structure(c(2L, 1L, 2L,
5L, 3L, 4L, 1L, 5L, 4L, 3L, 1L, 2L, 6L, 6L, 5L, 4L, 6L, 3L,
5L, 6L, 3L, 1L, 2L, 4L, 3L, 5L, 2L, 5L, 4L, 5L, 6L, 6L, 4L,
6L, 4L, 1L, 2L, 3L, 1L, 2L, 2L, 5L, 1L, 6L, 1L, 3L, 4L, 3L,
6L, 5L, 5L, 4L, 4L, 2L, 2L, 2L, 6L, 3L, 1L, 4L, 4L, 5L, 1L,
3L, 6L, 1L, 3L, 5L, 1L, 3L, 6L, 2L), .Label = c("1", "2",
"3", "4", "5", "6"), class = "factor"), surveyNum = c(1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L,
15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 1L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L,
16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L,
17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L)), .Names = c(".id",
"pio", "caremgmt", "prev", "price", "surveyNum"), row.names = c(NA,
-72L), class = "data.frame")
Examples
> x.wide
surveyNum pio_1 pio_2 pio_3 caremgmt_1 caremgmt_2 caremgmt_3 prev_1 prev_2 prev_3 price_1 price_2 price_3
1 1 2 2 2 2 1 1 1 2 2 2 6 3
2 2 2 1 2 1 2 2 2 2 1 1 5 5
3 3 1 2 1 1 2 1 2 1 2 2 5 2
4 4 2 1 1 2 2 2 1 2 2 5 4 5
5 5 1 2 2 1 2 1 1 1 1 3 4 4
6 6 1 2 1 2 1 1 2 1 1 4 2 5
> reshapeasy( x.wide, "long", NULL, id="surveyNum", vary="id", sep="_" )
surveyNum id pio caremgmt prev price
1.1 1 1 2 2 1 2
2.1 2 1 2 1 2 1
3.1 3 1 1 1 2 2
4.1 4 1 2 2 1 5
5.1 5 1 1 1 1 3
6.1 6 1 1 2 2 4
1.2 1 2 2 1 2 6
2.2 2 2 1 2 2 5
3.2 3 2 2 2 1 5
4.2 4 2 1 2 2 4
5.2 5 2 2 2 1 4
6.2 6 2 2 1 1 2
1.3 1 3 2 1 2 3
2.3 2 3 2 2 1 5
3.3 3 3 1 1 2 2
4.3 4 3 1 2 2 5
5.3 5 3 2 1 1 4
6.3 6 3 1 1 1 5
> head(x.long)
.id pio caremgmt prev price surveyNum
1 1 2 2 1 2 1
2 1 2 1 2 1 2
3 1 1 1 2 2 3
4 1 2 2 1 5 4
5 1 1 1 1 3 5
6 1 1 2 2 4 6
> head(reshapeasy( x.long, direction="wide", id="surveyNum", vary=".id" ))
surveyNum pio.1 caremgmt.1 prev.1 price.1 pio.3 caremgmt.3 prev.3 price.3 pio.2 caremgmt.2 prev.2 price.2
1 1 2 2 1 2 2 1 2 3 2 1 2 6
2 2 2 1 2 1 2 2 1 5 1 2 2 5
3 3 1 1 2 2 1 1 2 2 2 2 1 5
4 4 2 2 1 5 1 2 2 5 1 2 2 4
5 5 1 1 1 3 2 1 1 4 2 2 1 4
6 6 1 2 2 4 1 1 1 5 2 1 1 2
I would also like to see an option to order the output, since that's one of the things I don't like about reshape in base R. As an example, let's use the Stata Learning Module: Reshaping data wide to long, which you are already familiar with. The example I'm looking at is the "kids height and weight at age 1 and age 2" example.
Here's what I normally do with reshape():
library(foreign)
kidshtwt = read.dta("http://www.ats.ucla.edu/stat/stata/modules/kidshtwt.dta")
kidshtwt.l = reshape(kidshtwt, direction = "long", idvar = 1:2,
                     varying = 3:6, sep = "", timevar = "age")
# The reshaped data is correct, just not in the order I want it,
# so I always have to do another step like this:
kidshtwt.l = kidshtwt.l[order(kidshtwt.l$famid, kidshtwt.l$birth), ]
Since this is an annoying step that I always have to go through when reshaping the data, I think it would be useful to add that into your function.
I also suggest at least having an option for doing the same thing with the final column order for reshaping from long to wide.
Example function for column ordering
I'm not sure of the best way to integrate this into your function, but I put this together to sort a data frame based on basic patterns for the variable names.
col.name.sort = function(data, patterns) {
  a = names(data)
  b = length(patterns)
  subs = vector("list", b)
  for (i in 1:b) {
    subs[[i]] = sort(grep(patterns[i], a, value = TRUE))
  }
  x = unlist(subs)
  data[, x]
}
It can be used in the following manner. Imagine we had saved the output of your reshapeasy long-to-wide example as a data frame named a. If we wanted it ordered by "surveyNum", "caremgmt" (1-3), "prev" (1-3), "pio" (1-3), and "price" (1-3), we could use:
col.name.sort(a, c("sur", "car", "pre", "pio", "pri"))
Some initial thoughts:
I've always thought that the direction commands "wide" and "long" were a little fuzzy. Do they mean you want to convert the data to that format, or that the data is already in that format? It is something that you need to learn or look up. You can avoid that problem by having two separate functions, reshapeToWide and reshapeToLong. As a bonus, the signature of each function has one less argument.
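One way to get both interfaces without duplicating any logic would be thin wrappers around the existing function; a sketch, using the names suggested above:
reshapeToLong <- function(data, ...) reshapeasy(data, direction = "long", ...)
reshapeToWide <- function(data, ...) reshapeasy(data, direction = "wide", ...)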
I don't think you meant to include the line
varying <- which(!(colnames(x.wide) %in% "surveyNum"))
since it refers to a specific dataset.
I prefer data to x for the first argument since it makes it clear that the input should be a data frame.
It is generally better form to have arguments without defaults first. So vars should come after id and vary.
Can you pick defaults for id and vary? reshape::melt defaults to factor and character columns for id and numeric columns for vary.
I think there might be a mistake in your example. For going from wide to long, I get the following error:
> reshapeasy( x.wide, "long", NULL, id="surveyNum", vary="id", sep="_" )
Error in gsub(paste("[", paste(omit, collapse = "", sep = ""), "]$", sep = ""), :
invalid regular expression '[]$', reason 'Missing ']''
Removing the NULL corrects the problem. Which leads me to ask, what is the intended purpose of that NULL?
I also think that the function would be improved if it generated a time variable by default, if not explicitly specified by the user (as is done in reshape()).
See, for instance, the following from base reshape():
> head(reshape(x.wide, direction="long", idvar=1, varying=2:13, sep="_"))
surveyNum time pio caremgmt prev price
1.1 1 1 2 2 1 2
2.1 2 1 2 1 2 1
3.1 3 1 1 1 2 2
4.1 4 1 2 2 1 5
5.1 5 1 1 1 1 3
6.1 6 1 1 2 2 4
If I'm familiar with this, and I see that your function takes care of "varying" for me, I might be tempted to try:
> head(reshapeasy( x.wide, "long", id="surveyNum", sep="_" ))
Error in `row.names<-.data.frame`(`*tmp*`, value = paste(d[, idvar], times[1L], :
duplicate 'row.names' are not allowed
In addition: Warning message:
non-unique value when setting 'row.names': ‘1.1’
But that's not a very useful error. Perhaps including a custom error message might be useful for your final function.
Allowing the user to set vary to NULL, as you have done in your present version of the function, also doesn't seem wise to me. This yields output like this:
> head(reshapeasy( x.wide, "long", id="surveyNum", NULL, sep="_" ))
surveyNum pio caremgmt prev price
1.1 1 2 2 1 2
2.1 2 2 1 2 1
3.1 3 1 1 2 2
4.1 4 2 2 1 5
5.1 5 1 1 1 3
6.1 6 1 2 2 4
The problem with this output is that if I needed to reshape back to wide, I can't do it easily. Thus, I think that retaining reshape's default option of generating a time variable, but letting the user override that might be a useful feature.
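A sketch of that idea against the current signature (untested; "time" is simply the default timevar name used by stats::reshape()):
# inside the direction == "long" branch of reshapeasy(), before calling stats::reshape():
if (is.null(vary)) vary <- "time"   # fall back to reshape()'s default time variable
data <- stats::reshape(data = data, direction = "long", idvar = id,
                       varying = varying, timevar = vary, ...)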
Perhaps for those who are lazy and don't like to type the variable names, you can add the following to the head of your function:
if (is.numeric(id)) {
  id = colnames(data)[id]
}
if (is.numeric(vary)) {
  vary = colnames(data)[vary]
}
Then, following with your examples, you can use the following shorthand:
reshapeasy(x.wide, direction="long", id=1, sep="_", vary="id")
reshapeasy(x.long, direction="wide", id=6, vary=1)
(I know, it might not be good practice since the code might be less readable or less easily understandable by someone later on, but it does happen frequently.)
