Related
I've used the following code to create a frequency count.
df %>% group_by(INCOME, HAPPY) %>% summarise(count=n())
Output:
INCOME HAPPY count
<int> <int> <int>
1 1 1 6
2 1 2 17
3 1 3 13
4 1 8 1
5 2 1 5
6 2 2 11
7 2 3 12
8 2 8 0
9 3 1 4
10 3 2 10
11 3 3 5
12 3 8 0
Yet, I would like to have the following frequency format.
1 2 3
1 6 5 4
2 17 11 10
3 13 12 5
8 1 0 0
Using xtabs from base R
xtabs(count ~ HAPPY + INCOME, df1)
INCOME
HAPPY 1 2 3
1 6 5 4
2 17 11 10
3 13 12 5
8 1 0 0
data
df1 <- structure(list(INCOME = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L,
3L, 3L, 3L), HAPPY = c(1L, 2L, 3L, 8L, 1L, 2L, 3L, 8L, 1L, 2L,
3L, 8L), count = c(6L, 17L, 13L, 1L, 5L, 11L, 12L, 0L, 4L, 10L,
5L, 0L)), class = "data.frame", row.names = c("1", "2", "3",
"4", "5", "6", "7", "8", "9", "10", "11", "12"))
After your code: df %>% group_by(INCOME, HAPPY) %>% summarise(count=n())
You could use this code to achieve your task:
library(dplyr)
library(tidyr)
library(tibble)
# Reshape the summarised counts (one row per INCOME/HAPPY pair) into a
# HAPPY-by-INCOME table that uses HAPPY as row names.
df %>%
  # summarise() keeps the result grouped by INCOME; drop any grouping so that
  # n() below refers to the whole table rather than to each group.
  ungroup() %>%
  # Number consecutive blocks of 4 rows 1, 2, 3, ...
  # NOTE: assumes the rows are ordered by INCOME with exactly 4 HAPPY levels
  # per INCOME value, as in the example data.
  mutate(group_id = as.integer(gl(n(), 4, n()))) %>%
  pivot_wider(
    # Name id_cols explicitly: passing it by position is deprecated in
    # tidyr 1.3.0.
    id_cols = HAPPY,
    names_from = group_id,
    values_from = count
  ) %>%
  column_to_rownames("HAPPY")
1 2 3
1 6 5 4
2 17 11 10
3 13 12 5
8 1 0 0
data:
structure(list(INCOME = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L,
3L, 3L, 3L), HAPPY = c(1L, 2L, 3L, 8L, 1L, 2L, 3L, 8L, 1L, 2L,
3L, 8L), count = c(6L, 17L, 13L, 1L, 5L, 11L, 12L, 0L, 4L, 10L,
5L, 0L)), class = "data.frame", row.names = c("1", "2", "3",
"4", "5", "6", "7", "8", "9", "10", "11", "12"))
I think this can be simplified to -
library(dplyr)
library(tidyr)
# Count every INCOME/HAPPY combination and spread INCOME across the columns.
df %>%
  count(INCOME, HAPPY) %>%
  # Combinations that never occur have no row in the count; fill them with 0
  # instead of NA so the table matches the requested frequency layout.
  pivot_wider(
    names_from = INCOME,
    values_from = n,
    values_fill = list(n = 0L)
  )
I have 2 dataframe df_1 and df_2. Now I have to select some rows randomly from df_1 and then I will merge the rest of the rows (which not selected randomly) from df_1 with df_2.
I am using this code
set.seed(9999)
df_1 <- # the whole dataset
test_dataset1 <- sample_n(df_1, 10)
train_part_1 <- df_1[which(!df_1 %in% test_dataset1)] # Not working
train_1 <- rbind(df_2, train_part_1)
But when I try to extract the rows that were not selected randomly, my code does not work: I get the same data as df_1, i.e. all 20 rows (the same dataset).
Edited: Actually, I have to make 3 test and 3 train datasets. So, how can I use the seed function to get the same dataset for reproduce purposes?
Reproducible data (only df_1):
structure(list(nodeA = structure(c(4L, 2L, 1L, 1L, 1L, 4L, 1L,
9L, 3L, 4L, 2L, 8L, 2L, 1L, 5L, 7L, 3L, 6L, 2L, 1L), .Label = c("ID00309",
"ID00361", "ID00541", "ID00570", "ID00615", "ID00696", "ID00762",
"ID01200", "ID05109"), class = "factor"), nodeB = structure(c(8L,
3L, 3L, 1L, 2L, 7L, 9L, 8L, 8L, 6L, 9L, 7L, 4L, 4L, 6L, 9L, 6L,
7L, 5L, 5L), .Label = c("ID00361", "ID00541", "ID00570", "ID00615",
"ID00696", "ID01200", "ID05109", "ID11641", "ID11691"), class = "factor"),
scr = structure(20:1, .Label = c("1.85284606048794", "1.90444166064472",
"1.90762235378507", "1.94364188077133", "1.95883206119256",
"2.08440437841349", "2.26408172709962", "2.3223132020942",
"2.46120775935034", "2.49647215035727", "2.50432367561777",
"2.57541320006514", "2.65099330092281", "2.75209155741549",
"2.93717640337986", "2.99596628688011", "3.21209741517806",
"3.21997803385465", "3.48788394772132", "3.81389707587156"
), class = "factor")), class = "data.frame", row.names = c(NA,
-20L))
Get your sample using random row numbers and then use - to get the inverse:
df_1 <- structure(list(nodeA = structure(c(4L, 2L, 1L, 1L, 1L, 4L, 1L, 9L, 3L, 4L,
2L, 8L, 2L, 1L, 5L, 7L, 3L, 6L, 2L, 1L),
.Label = c("ID00309", "ID00361", "ID00541",
"ID00570", "ID00615", "ID00696",
"ID00762", "ID01200", "ID05109"),
class = "factor"),
nodeB = structure(c(8L, 3L, 3L, 1L, 2L, 7L, 9L, 8L, 8L, 6L,
9L, 7L, 4L, 4L, 6L, 9L, 6L, 7L, 5L, 5L),
.Label = c("ID00361", "ID00541", "ID00570",
"ID00615", "ID00696", "ID01200",
"ID05109", "ID11641", "ID11691"),
class = "factor"),
scr = structure(20:1, .Label = c("1.85284606048794", "1.90444166064472",
"1.90762235378507", "1.94364188077133",
"1.95883206119256", "2.08440437841349",
"2.26408172709962", "2.3223132020942",
"2.46120775935034", "2.49647215035727",
"2.50432367561777", "2.57541320006514",
"2.65099330092281", "2.75209155741549",
"2.93717640337986", "2.99596628688011",
"3.21209741517806", "3.21997803385465",
"3.48788394772132", "3.81389707587156"
), class = "factor")),
class = "data.frame", row.names = c(NA, -20L))
# Fix the RNG seed so the same rows are drawn on every run
set.seed(9999)
# Draw 10 row numbers from 1..nrow(df_1) (without replacement by default)
Selected <- sample.int(nrow(df_1), 10)
# index with the selected row numbers; use the [row, col] pattern to select rows
test_dataset1 <- df_1[ Selected, ]
# use -index to remove rows
train_part_1 <- df_1[-Selected, ]
test_dataset1
#> nodeA nodeB scr
#> 6 ID00570 ID05109 2.93717640337986
#> 9 ID00541 ID11641 2.57541320006514
#> 19 ID00361 ID00696 1.90444166064472
#> 3 ID00309 ID00570 3.21997803385465
#> 10 ID00570 ID01200 2.50432367561777
#> 2 ID00361 ID00570 3.48788394772132
#> 20 ID00309 ID00696 1.85284606048794
#> 8 ID05109 ID11641 2.65099330092281
#> 12 ID01200 ID05109 2.46120775935034
#> 18 ID00696 ID05109 1.90762235378507
train_part_1
#> nodeA nodeB scr
#> 1 ID00570 ID11641 3.81389707587156
#> 4 ID00309 ID00361 3.21209741517806
#> 5 ID00309 ID00541 2.99596628688011
#> 7 ID00309 ID11691 2.75209155741549
#> 11 ID00361 ID11691 2.49647215035727
#> 13 ID00361 ID00615 2.3223132020942
#> 14 ID00309 ID00615 2.26408172709962
#> 15 ID00615 ID01200 2.08440437841349
#> 16 ID00762 ID11691 1.95883206119256
#> 17 ID00541 ID01200 1.94364188077133
Created on 2021-03-14 by the reprex package (v1.0.0)
I have a txt.file like this:
0003 MPARTNER SALZ S 150112 22:30:45 160304 08:38:13 2 BUY 2 BUY 12380 165426 150109 08:00:00
0003 SPROTTSE HUGHES S 140407 02:30:50 141120 13:55:06 2 BUY 2 BUY 3764 57379 140401 10:05:00
0003 SPROTTSE HUGHES S 141223 09:06:13 160715 08:42:56 3 MARKETPERFORM 3 HOLD 3764 57379 141223 08:02:00
001V MPARTNER PEARLSTEIN D 140821 02:44:05 150312 09:17:13 2 BUY 2 BUY 12380 163717 140820 08:16:00
001V MPARTNER PEARLSTEIN D 151016 15:07:40 160411 08:40:35 2 BUY 2 BUY 12380 163717 151009 08:12:00
001W CANACCOR K 140321 04:06:40 140609 23:06:44 SPECULATIVE BUY 1 STRONG BUY 406 150412 140319 23:19:00
001W CANACCOR WRIGHT K 140714 12:47:31 160228 22:57:45 BUY 1 STRONG BUY 406 150412 140714 12:38:00
001W CLARUS OFIR E 140515 11:40:00 150515 09:27:09 SPECULATIVE BUY 1 STRONG BUY 202 115944 140515 11:40:00
001W CLARUS MACKAY D 150813 09:40:45 160812 09:40:02 BUY 1 STRONG BUY 202 73763 150813 09:23:00
001W DEACON OFIR E 150119 22:03:46 170328 06:45:14 1 BUY 1 STRONG BUY 704 115944 150112 07:24:00
001W DEACON OFIR E 171115 06:48:47 171115 06:48:47 1 BUY 1 STRONG BUY 704 115944 171115 06:42:00
#70L MORGAN MARTINEZ J 100226 07:12:51 100708 04:51:16 8 EQUALWT/NO RATING 3 HOLD 1595 56947 100226 07:12:00
#70L MORGAN MARTINEZ DE O J 100708 05:09:02 100910 00:48:28 6 EQUALWT/IN-LINE 3 HOLD 1595 56947 100708 03:14:00
#70L MORGAN MARTINEZ DE O J 100910 21:16:07 101110 21:55:52 2 OVERWT/IN-LINE 2 BUY 1595 56947 100910 19:18:00
#70L MORGAN OLCOZ CERDAN J 101112 01:32:41 120618 21:04:56 2 OVERWT/IN-LINE 2 BUY 1595 56947 101111 20:03:00
#70L MORGAN OLCOZ CERDAN J 120712 03:19:26 131216 19:49:59 6 EQUALWT/IN-LINE 3 HOLD 1595 56947 120711 19:20:00
#70L MORGAN OLCOZ CERDAN J 140226 22:20:19 150417 13:07:31 2 OVERWT/IN-LINE 2 BUY 1595 56947 140226 22:20:00
#70L MORGAN J 150608 01:25:35 171106 00:16:05 1 OVERWT/ATTRACTIVE 2 BUY 1595 56947 150608 01:25:00
And I would like to produce a table in R with the same structure as in the txt file with the apparent 16 columns.
I tried to use the codes:
max(count.fields("BSP.txt", sep="")) # 18 columns
df= read.delim("BSP.txt", sep = "" ,header = FALSE,col.names = c("V1", "VS","V3", "V4", "V5","V6",
"V7", "V8", "V9", "V10",
"V11", "V12", "V13", "V14",
"V15","V16","V17","V18"))
But I received a weirdly structured table:
structure(list(V1 = structure(c(2L, 2L, 2L, 3L, 3L, 4L, 4L, 4L,
4L, 4L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("#70L", "0003",
"001V", "001W"), class = "factor"), VS = structure(c(5L, 6L,
6L, 5L, 5L, 1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L
), .Label = c("CANACCOR", "CLARUS", "DEACON", "MORGAN", "MPARTNER",
"SPROTTSE"), class = "factor"), V3 = structure(c(9L, 1L, 1L,
8L, 8L, 3L, 10L, 6L, 4L, 6L, 6L, 5L, 5L, 5L, 7L, 7L, 7L, 2L), .Label = c("HUGHES",
"J", "K", "MACKAY", "MARTINEZ", "OFIR", "OLCOZ", "PEARLSTEIN",
"SALZ", "WRIGHT"), class = "factor"), V4 = structure(c(9L, 9L,
9L, 4L, 4L, 1L, 8L, 6L, 4L, 6L, 6L, 7L, 5L, 5L, 3L, 3L, 3L, 2L
), .Label = c("140321", "150608", "CERDAN", "D", "DE", "E", "J",
"K", "S"), class = "factor"), V5 = structure(c(9L, 4L, 8L, 7L,
12L, 2L, 6L, 5L, 11L, 10L, 13L, 3L, 15L, 15L, 14L, 14L, 14L,
1L), .Label = c("01:25:35", "04:06:40", "100226", "140407", "140515",
"140714", "140821", "141223", "150112", "150119", "150813", "151016",
"171115", "J", "O"), class = "factor"), V6 = structure(c(16L,
1L, 5L, 2L, 13L, 12L, 9L, 8L, 6L, 15L, 3L, 4L, 17L, 17L, 7L,
10L, 11L, 14L), .Label = c("02:30:50", "02:44:05", "06:48:47",
"07:12:51", "09:06:13", "09:40:45", "101112", "11:40:00", "12:47:31",
"120712", "140226", "140609", "15:07:40", "171106", "22:03:46",
"22:30:45", "J"), class = "factor"), V7 = structure(c(10L, 6L,
12L, 7L, 11L, 17L, 9L, 8L, 13L, 14L, 15L, 4L, 4L, 5L, 2L, 3L,
16L, 1L), .Label = c("00:16:05", "01:32:41", "03:19:26", "100708",
"100910", "141120", "150312", "150515", "160228", "160304", "160411",
"160715", "160812", "170328", "171115", "22:20:19", "23:06:44"
), class = "factor"), V8 = structure(c(5L, 13L, 7L, 8L, 6L, 18L,
17L, 9L, 10L, 3L, 4L, 1L, 2L, 16L, 12L, 14L, 15L, 11L), .Label = c("04:51:16",
"05:09:02", "06:45:14", "06:48:47", "08:38:13", "08:40:35", "08:42:56",
"09:17:13", "09:27:09", "09:40:02", "1", "120618", "13:55:06",
"131216", "150417", "21:16:07", "22:57:45", "SPECULATIVE"), class = "factor"),
V9 = structure(c(6L, 6L, 8L, 6L, 6L, 10L, 10L, 12L, 10L,
1L, 1L, 9L, 2L, 3L, 7L, 5L, 4L, 11L), .Label = c("1", "100910",
"101110", "13:07:31", "19:49:59", "2", "21:04:56", "3", "8",
"BUY", "OVERWT/ATTRACTIVE", "SPECULATIVE"), class = "factor"),
V10 = structure(c(6L, 6L, 8L, 6L, 6L, 2L, 2L, 6L, 2L, 6L,
6L, 7L, 1L, 4L, 3L, 5L, 3L, 3L), .Label = c("00:48:28", "1",
"2", "21:55:52", "6", "BUY", "EQUALWT/NO", "MARKETPERFORM"
), class = "factor"), V11 = structure(c(2L, 2L, 3L, 2L, 2L,
9L, 9L, 1L, 9L, 1L, 1L, 8L, 4L, 2L, 7L, 6L, 7L, 5L), .Label = c("1",
"2", "3", "6", "BUY", "EQUALWT/IN-LINE", "OVERWT/IN-LINE",
"RATING", "STRONG"), class = "factor"), V12 = structure(c(4L,
4L, 6L, 4L, 4L, 4L, 4L, 8L, 4L, 8L, 8L, 3L, 5L, 7L, 2L, 3L,
2L, 1L), .Label = c("1595", "2", "3", "BUY", "EQUALWT/IN-LINE",
"HOLD", "OVERWT/IN-LINE", "STRONG"), class = "factor"), V13 = structure(c(1L,
5L, 5L, 1L, 1L, 6L, 6L, 8L, 3L, 8L, 8L, 9L, 4L, 2L, 8L, 9L,
8L, 7L), .Label = c("12380", "2", "202", "3", "3764", "406",
"56947", "BUY", "HOLD"), class = "factor"), V14 = structure(c(5L,
7L, 7L, 4L, 4L, 1L, 1L, 6L, 9L, 8L, 8L, 3L, 11L, 10L, 3L,
3L, 3L, 2L), .Label = c("150412", "150608", "1595", "163717",
"165426", "202", "57379", "704", "73763", "BUY", "HOLD"), class = "factor"),
V15 = structure(c(8L, 4L, 7L, 6L, 10L, 3L, 5L, 2L, 9L, 2L,
2L, 12L, 11L, 11L, 12L, 12L, 12L, 1L), .Label = c("01:25:00",
"115944", "140319", "140401", "140714", "140820", "141223",
"150109", "150813", "151009", "1595", "56947"), class = "factor"),
V16 = structure(c(2L, 7L, 3L, 5L, 4L, 16L, 10L, 13L, 6L,
14L, 15L, 8L, 17L, 17L, 9L, 11L, 12L, 1L), .Label = c("",
"08:00:00", "08:02:00", "08:12:00", "08:16:00", "09:23:00",
"10:05:00", "100226", "101111", "12:38:00", "120711", "140226",
"140515", "150112", "171115", "23:19:00", "56947"), class = "factor"),
V17 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 7L, 1L, 4L,
2L, 3L, 5L, 6L, 9L, 8L, 10L, 1L), .Label = c("", "06:42:00",
"07:12:00", "07:24:00", "100708", "100910", "11:40:00", "19:20:00",
"20:03:00", "22:20:00"), class = "factor"), V18 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 3L, 1L, 1L,
1L, 1L), .Label = c("", "03:14:00", "19:18:00"), class = "factor")), .Names = c("V1",
"VS", "V3", "V4", "V5", "V6", "V7", "V8", "V9", "V10", "V11",
"V12", "V13", "V14", "V15", "V16", "V17", "V18"), class = "data.frame", row.names = c(NA,
-18L))
As stated above, I would like to receive a table with 16 columns with the structure in the txt.file. Even the empty fields (e.g. in Row 6) should remain.
E.g for Row 6:
Can you help me on this?
many thanks.
One option is to use read.fwf
# Read the fixed-width file; each width is the number of characters that
# belongs to the corresponding column.
df <- read.fwf("tst.txt", widths = c(8, 10, 14, 28, 7, 10, 7, 10, 7, 29, 3,
                                     21, 9, 8, 7, 8), header = FALSE)
# The next part is to remove the leading/trailing whitespace from the text
# fields.
library(dplyr)
# Text columns are factors in R < 4.0 but character vectors in R >= 4.0
# (stringsAsFactors defaults to FALSE there), so trim both kinds.
df <- df %>%
  mutate(across(
    where(function(x) is.factor(x) || is.character(x)),
    function(x) trimws(as.character(x))
  ))
The data frame looks as:
df
# V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16
# 1 0003 MPARTNER SALZ S 150112 22:30:45 160304 08:38:13 2 BUY 2 BUY 12380 165426 150109 08:00:00
# 2 0003 SPROTTSE HUGHES S 140407 02:30:50 141120 13:55:06 2 BUY 2 BUY 3764 57379 140401 10:05:00
# 3 0003 SPROTTSE HUGHES S 141223 09:06:13 160715 08:42:56 3 MARKETPERFORM 3 HOLD 3764 57379 141223 08:02:00
# 4 001V MPARTNER PEARLSTEIN D 140821 02:44:05 150312 09:17:13 2 BUY 2 BUY 12380 163717 140820 08:16:00
# 5 001V MPARTNER PEARLSTEIN D 151016 15:07:40 160411 08:40:35 2 BUY 2 BUY 12380 163717 151009 08:12:00
# 6 001W CANACCOR K 140321 04:06:40 140609 23:06:44 NA SPECULATIVE BUY 1 STRONG BUY 406 150412 140319 23:19:00
# 7 001W CANACCOR WRIGHT K 140714 12:47:31 160228 22:57:45 NA BUY 1 STRONG BUY 406 150412 140714 12:38:00
# 8 001W CLARUS OFIR E 140515 11:40:00 150515 09:27:09 NA SPECULATIVE BUY 1 STRONG BUY 202 115944 140515 11:40:00
# 9 001W CLARUS MACKAY D 150813 09:40:45 160812 09:40:02 NA BUY 1 STRONG BUY 202 73763 150813 09:23:00
# 10 001W DEACON OFIR E 150119 22:03:46 170328 06:45:14 1 BUY 1 STRONG BUY 704 115944 150112 07:24:00
# 11 001W DEACON OFIR E 171115 06:48:47 171115 06:48:47 1 BUY 1 STRONG BUY 704 115944 171115 06:42:00
# 12 #70L MORGAN MARTINEZ J 100226 07:12:51 100708 04:51:16 8 EQUALWT/NO RATING 3 HOLD 1595 56947 100226 07:12:00
# 13 #70L MORGAN MARTINEZ DE O J 100708 05:09:02 100910 00:48:28 6 EQUALWT/IN-LINE 3 HOLD 1595 56947 100708 03:14:00
# 14 #70L MORGAN MARTINEZ DE O J 100910 21:16:07 101110 21:55:52 2 OVERWT/IN-LINE 2 BUY 1595 56947 100910 19:18:00
# 15 #70L MORGAN OLCOZ CERDAN J 101112 01:32:41 120618 21:04:56 2 OVERWT/IN-LINE 2 BUY 1595 56947 101111 20:03:00
# 16 #70L MORGAN OLCOZ CERDAN J 120712 03:19:26 131216 19:49:59 6 EQUALWT/IN-LINE 3 HOLD 1595 56947 120711 19:20:00
# 17 #70L MORGAN OLCOZ CERDAN J 140226 22:20:19 150417 13:07:31 2 OVERWT/IN-LINE 2 BUY 1595 56947 140226 22:20:00
# 18 #70L MORGAN J 150608 01:25:35 171106 00:16:05 1 OVERWT/ATTRACTIVE 2 BUY 1595 56947 150608 01:25:00
The above data.frame has 16 columns and 18 rows.
That's my data:
> head(data)
id C1 C2 C3 B1 B2 B3 Name
12 3 12 8 1 3 12 Agar
14 4 11 9 5 12 14 LB
18 7 17 6 7 14 16 YEF
20 9 15 4 3 11 17 KAN
so I used a melt function from reshape2 package to reorganize my data. Now it looks like that:
dt <- melt(data, measure.vars=2:7)
> head(dt)
n v variable value rt
1 id Name p C1 1
2 12 Agar p 3 2
3 14 LB p 4 3
4 18 YEF p 7 6
5 20 KAN p 9 3
6 id Name u C2 1
I did some calculations on my data and now there is an extra column. Let's call it "rt". I'd now like to transform my data back to its previous "state" while keeping this extra column. Do you know of any function that would be useful?
dput(dt)
structure(list(n = structure(c(5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L,
3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L,
4L, 5L, 1L, 2L, 3L, 4L), class = "factor", .Label = c("12", "14",
"18", "20", "id")), v = structure(c(4L, 1L, 3L, 5L, 2L, 4L, 1L,
3L, 5L, 2L, 4L, 1L, 3L, 5L, 2L, 4L, 1L, 3L, 5L, 2L, 4L, 1L, 3L,
5L, 2L, 4L, 1L, 3L, 5L, 2L), class = "factor", .Label = c("Agar",
"KAN", "LB", "Name", "YEF")), variable = structure(c(1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L,
4L, 4L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L), .Label = c("p",
"u", "k", "l", "t", "h"), class = "factor"), value = c("C1",
"3", "4", "7", "9", "C2", "12", "11", "17", "15", "C3", "8",
"9", "6", "4", "B1", "1", "5", "7", "3", "B2", "3", "12", "14",
"11", "B3", "12", "14", "16", "17")), .Names = c("n", "v", "variable",
"value"), row.names = c(NA, -30L), class = "data.frame")
In the "reshape2" universe, melt and *cast go hand-in-hand.
Here's an example of melting a data.frame and dcasting it back to its original form. You would need to take a similar approach with your data.
mydf <- data.frame(A = LETTERS[1:3], B = 1:3, C = 4:6)
mydf
# A B C
# 1 A 1 4
# 2 B 2 5
# 3 C 3 6
library(reshape2)
mDF <- melt(mydf, id.vars="A")
mDF
dcast(mDF, A ~ variable, value.var="value")
# A B C
# 1 A 1 4
# 2 B 2 5
# 3 C 3 6
In the dcast step, think of the items before the ~ as being the "id" variables, and those coming after as being the resulting column names. value.var should be the column from which the values will fill in the resulting "grid" created by the id variables and column names.
It is a truth universally acknowledged that R's base reshape command is speedy and powerful but has miserable syntax. I have therefore written a quick wrapper around it which I will throw into the next release of the taRifx package. Before I did that, however, I want to solicit improvements.
Here's my version, with updates from #RichieCotton:
#' reshapeasy: a version of stats::reshape with way, way better syntax
#'
#' Written with the help of the StackOverflow R community.
#'
#' @param data A data.frame to be reshaped.
#' @param direction Either "wide" or "long": the layout to convert `data` to.
#' @param id The variables that identify unique observations, given as column
#'   names, column positions, or a logical mask over the columns of `data`.
#'   Defaults to the factor and character columns.
#' @param vary The variable that varies. Going to wide this variable will
#'   cease to exist; going to long it will be created. May be a name, a column
#'   position, or a logical mask. When omitted, it defaults to "time" for
#'   direction "long" (as stats::reshape does) and to the numeric columns for
#'   direction "wide".
#' @param omit A vector of characters which are removed if found at the end of
#'   a column name of the result (e.g. price_ becomes price). NULL or an empty
#'   string leaves the column names untouched.
#' @param vars Currently unused. Reserved for the names of the (stubs of) the
#'   variables to be reshaped.
#' @param ... Options passed on to stats::reshape (e.g. `sep`).
#' @return The reshaped data.frame.
reshapeasy <- function(data, direction, id = (sapply(data, is.factor) | sapply(data, is.character)), vary = sapply(data, is.numeric), omit = c("_", "."), vars = NULL, ...) {
  # Fail loudly on an unknown direction instead of silently doing nothing.
  direction <- match.arg(direction, c("wide", "long"))

  # stats::reshape and the %in% test below need column *names*; translate
  # positions and logical masks (including the defaults) into names.
  as_col_names <- function(cols) {
    if (is.logical(cols) || is.numeric(cols)) colnames(data)[cols] else cols
  }
  id <- as_col_names(id)
  if (direction == "long" && missing(vary)) {
    # The numeric-columns default makes no sense as the name of a variable
    # that is about to be created, so fall back to reshape's own default.
    vary <- "time"
  } else {
    vary <- as_col_names(vary)
  }

  if (direction == "wide") {
    data <- stats::reshape(
      data = data, direction = direction, idvar = id, timevar = vary, ...
    )
  } else {
    # Everything that is not an id variable gets reshaped.
    varying <- which(!(colnames(data) %in% id))
    data <- stats::reshape(
      data = data, direction = direction, idvar = id, varying = varying,
      timevar = vary, ...
    )
  }

  # Strip trailing separator characters from the column names. An empty
  # `omit` would build the invalid regular expression "[]$", so skip it.
  omit_class <- paste(omit, collapse = "")
  if (nzchar(omit_class)) {
    colnames(data) <- gsub(paste0("[", omit_class, "]$"), "", colnames(data))
  }
  data
}
Note that you can move from wide to long without changing the options other than the direction. To me, this is the key to usability.
I'm happy to give acknowledgement in the function help files for any substantial improvements if you chat or e-mail me your info.
Improvements might fall in the following areas:
Naming the function and its arguments
Making it more general (currently it handles a fairly specific case, which I believe to be by far the most common, but it has not yet exhausted the capabilities of stats::reshape)
Code improvements
Examples
Sample data
x.wide <- structure(list(surveyNum = 1:6, pio_1 = structure(c(2L, 2L, 1L,
2L, 1L, 1L), .Names = c("1", "2", "3", "4", "5", "6"), .Label = c("1",
"2"), class = "factor"), pio_2 = structure(c(2L, 1L, 2L, 1L,
2L, 2L), .Names = c("1", "2", "3", "4", "5", "6"), .Label = c("1",
"2"), class = "factor"), pio_3 = structure(c(2L, 2L, 1L, 1L,
2L, 1L), .Names = c("1", "2", "3", "4", "5", "6"), .Label = c("1",
"2"), class = "factor"), caremgmt_1 = structure(c(2L, 1L, 1L,
2L, 1L, 2L), .Names = c("1", "2", "3", "4", "5", "6"), .Label = c("1",
"2"), class = "factor"), caremgmt_2 = structure(c(1L, 2L, 2L,
2L, 2L, 1L), .Names = c("1", "2", "3", "4", "5", "6"), .Label = c("1",
"2"), class = "factor"), caremgmt_3 = structure(c(1L, 2L, 1L,
2L, 1L, 1L), .Names = c("1", "2", "3", "4", "5", "6"), .Label = c("1",
"2"), class = "factor"), prev_1 = structure(c(1L, 2L, 2L, 1L,
1L, 2L), .Names = c("1", "2", "3", "4", "5", "6"), .Label = c("1",
"2"), class = "factor"), prev_2 = structure(c(2L, 2L, 1L, 2L,
1L, 1L), .Names = c("1", "2", "3", "4", "5", "6"), .Label = c("1",
"2"), class = "factor"), prev_3 = structure(c(2L, 1L, 2L, 2L,
1L, 1L), .Names = c("1", "2", "3", "4", "5", "6"), .Label = c("1",
"2"), class = "factor"), price_1 = structure(c(2L, 1L, 2L, 5L,
3L, 4L), .Names = c("1", "2", "3", "4", "5", "6"), .Label = c("1",
"2", "3", "4", "5", "6"), class = "factor"), price_2 = structure(c(6L,
5L, 5L, 4L, 4L, 2L), .Names = c("1", "2", "3", "4", "5", "6"), .Label = c("1",
"2", "3", "4", "5", "6"), class = "factor"), price_3 = structure(c(3L,
5L, 2L, 5L, 4L, 5L), .Names = c("1", "2", "3", "4", "5", "6"), .Label = c("1",
"2", "3", "4", "5", "6"), class = "factor")), .Names = c("surveyNum",
"pio_1", "pio_2", "pio_3", "caremgmt_1", "caremgmt_2", "caremgmt_3",
"prev_1", "prev_2", "prev_3", "price_1", "price_2", "price_3"
), idvars = "surveyNum", rdimnames = list(structure(list(surveyNum = 1:24), .Names = "surveyNum", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24"
), class = "data.frame"), structure(list(variable = structure(c(1L,
1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L), .Label = c("pio",
"caremgmt", "prev", "price"), class = "factor"), .id = c(1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L)), .Names = c("variable",
".id"), row.names = c("pio_1", "pio_2", "pio_3", "caremgmt_1",
"caremgmt_2", "caremgmt_3", "prev_1", "prev_2", "prev_3", "price_1",
"price_2", "price_3"), class = "data.frame")), row.names = c(NA,
6L), class = c("cast_df", "data.frame"))
x.long <- structure(list(.id = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), pio = structure(c(2L,
2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L,
1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 2L,
1L, 2L, 2L, 1L, 2L, 1L, 1L), .Label = c("1", "2"), class = "factor"),
caremgmt = structure(c(2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 1L,
2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 1L,
1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 1L,
1L, 2L, 2L), .Label = c("1", "2"), class = "factor"), prev = structure(c(1L,
2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 1L,
2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L,
2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 2L), .Label = c("1",
"2"), class = "factor"), price = structure(c(2L, 1L, 2L,
5L, 3L, 4L, 1L, 5L, 4L, 3L, 1L, 2L, 6L, 6L, 5L, 4L, 6L, 3L,
5L, 6L, 3L, 1L, 2L, 4L, 3L, 5L, 2L, 5L, 4L, 5L, 6L, 6L, 4L,
6L, 4L, 1L, 2L, 3L, 1L, 2L, 2L, 5L, 1L, 6L, 1L, 3L, 4L, 3L,
6L, 5L, 5L, 4L, 4L, 2L, 2L, 2L, 6L, 3L, 1L, 4L, 4L, 5L, 1L,
3L, 6L, 1L, 3L, 5L, 1L, 3L, 6L, 2L), .Label = c("1", "2",
"3", "4", "5", "6"), class = "factor"), surveyNum = c(1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L,
15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 1L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L,
16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L,
17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L)), .Names = c(".id",
"pio", "caremgmt", "prev", "price", "surveyNum"), row.names = c(NA,
-72L), class = "data.frame")
Examples
> x.wide
surveyNum pio_1 pio_2 pio_3 caremgmt_1 caremgmt_2 caremgmt_3 prev_1 prev_2 prev_3 price_1 price_2 price_3
1 1 2 2 2 2 1 1 1 2 2 2 6 3
2 2 2 1 2 1 2 2 2 2 1 1 5 5
3 3 1 2 1 1 2 1 2 1 2 2 5 2
4 4 2 1 1 2 2 2 1 2 2 5 4 5
5 5 1 2 2 1 2 1 1 1 1 3 4 4
6 6 1 2 1 2 1 1 2 1 1 4 2 5
> reshapeasy( x.wide, "long", NULL, id="surveyNum", vary="id", sep="_" )
surveyNum id pio caremgmt prev price
1.1 1 1 2 2 1 2
2.1 2 1 2 1 2 1
3.1 3 1 1 1 2 2
4.1 4 1 2 2 1 5
5.1 5 1 1 1 1 3
6.1 6 1 1 2 2 4
1.2 1 2 2 1 2 6
2.2 2 2 1 2 2 5
3.2 3 2 2 2 1 5
4.2 4 2 1 2 2 4
5.2 5 2 2 2 1 4
6.2 6 2 2 1 1 2
1.3 1 3 2 1 2 3
2.3 2 3 2 2 1 5
3.3 3 3 1 1 2 2
4.3 4 3 1 2 2 5
5.3 5 3 2 1 1 4
6.3 6 3 1 1 1 5
> head(x.long)
.id pio caremgmt prev price surveyNum
1 1 2 2 1 2 1
2 1 2 1 2 1 2
3 1 1 1 2 2 3
4 1 2 2 1 5 4
5 1 1 1 1 3 5
6 1 1 2 2 4 6
> head(reshapeasy( x.long, direction="wide", id="surveyNum", vary=".id" ))
surveyNum pio.1 caremgmt.1 prev.1 price.1 pio.3 caremgmt.3 prev.3 price.3 pio.2 caremgmt.2 prev.2 price.2
1 1 2 2 1 2 2 1 2 3 2 1 2 6
2 2 2 1 2 1 2 2 1 5 1 2 2 5
3 3 1 1 2 2 1 1 2 2 2 2 1 5
4 4 2 2 1 5 1 2 2 5 1 2 2 4
5 5 1 1 1 3 2 1 1 4 2 2 1 4
6 6 1 2 2 4 1 1 1 5 2 1 1 2
I would also like to see an option to order the output, since that's one of the things I don't like about reshape in base R. As an example, let's use the Stata Learning Module: Reshaping data wide to long, which you are already familiar with. The example I'm looking at is the "kids height and weight at age 1 and age 2" example.
Here's what I normally do with reshape():
# library(foreign)
kidshtwt = read.dta("http://www.ats.ucla.edu/stat/stata/modules/kidshtwt.dta")
kidshtwt.l = reshape(kidshtwt, direction="long", idvar=1:2,
varying=3:6, sep="", timevar="age")
# The reshaped data is correct, just not in the order I want it
# so I always have to do another step like this
kidshtwt.l = kidshtwt.l[order(kidshtwt.l$famid, kidshtwt.l$birth),]
Since this is an annoying step that I always have to go through when reshaping the data, I think it would be useful to add that into your function.
I also suggest at least having an option for doing the same thing with the final column order for reshaping from long to wide.
Example function for column ordering
I'm not sure of the best way to integrate this into your function, but I put this together to sort a data frame based on basic patterns for the variable names.
# Reorder (and subset) the columns of a data frame by name patterns.
#
# data:     a data frame.
# patterns: character vector of regular expressions as understood by grep().
#           Columns matching the first pattern come first (sorted by name),
#           then those matching the second pattern, and so on.
#
# Returns a data frame holding only the matched columns, in that order.
# A column matched by several patterns is kept once, at its first position.
# Columns matched by no pattern are dropped.
col.name.sort <- function(data, patterns) {
  col_names <- names(data)
  # lapply() copes with zero patterns, unlike a 1:length(patterns) loop.
  matched <- lapply(patterns, function(pattern) {
    sort(grep(pattern, col_names, value = TRUE))
  })
  ordered <- unique(unlist(matched))
  # drop = FALSE keeps a data frame even when a single column is selected.
  data[, ordered, drop = FALSE]
}
It can be used in the following manner. Imagine we had saved the output of your reshapeasy long to wide example as a data frame named a, and we wanted it ordered by "surveyNum", "caremgmt" (1-3), "prev" (1-3), "pio" (1-3), and "price" (1-3), we could use:
col.name.sort(a, c("sur", "car", "pre", "pio", "pri"))
Some initial thoughts:
I've always thought that the direction commands "wide" and "long" were a little fuzzy. Do they mean you want to convert the data to that format, or that the data is already in that format? It is something that you need to learn or look up. You can avoid that problem by having two separate functions, reshapeToWide and reshapeToLong. As a bonus, the signature of each function has one less argument.
I don't think you meant to include the line
varying <- which(!(colnames(x.wide) %in% "surveyNum"))
since it refers to a specific dataset.
I prefer data to x for the first argument since it makes it clear that the input should be a data frame.
It is generally better form to have arguments without defaults first. So vars should come after id and vary.
Can you pick defaults for id and vary? reshape::melt defaults to factor and character columns for id and numeric columns for vary.
I think there might be a mistake in your example. For going from wide to long, I get the following error:
> reshapeasy( x.wide, "long", NULL, id="surveyNum", vary="id", sep="_" )
Error in gsub(paste("[", paste(omit, collapse = "", sep = ""), "]$", sep = ""), :
invalid regular expression '[]$', reason 'Missing ']''
Removing the NULL corrects the problem. Which leads me to ask, what is the intended purpose of that NULL?
I also think that the function would be improved if it generated a time variable by default, if not explicitly specified by the user (as is done in reshape()).
See, for instance, the following from base reshape():
> head(reshape(x.wide, direction="long", idvar=1, varying=2:13, sep="_"))
surveyNum time pio caremgmt prev price
1.1 1 1 2 2 1 2
2.1 2 1 2 1 2 1
3.1 3 1 1 1 2 2
4.1 4 1 2 2 1 5
5.1 5 1 1 1 1 3
6.1 6 1 1 2 2 4
If I'm familiar with this, and I see that your function takes care of "varying" for me, I might be tempted to try:
> head(reshapeasy( x.wide, "long", id="surveyNum", sep="_" ))
Error in `row.names<-.data.frame`(`*tmp*`, value = paste(d[, idvar], times[1L], :
duplicate 'row.names' are not allowed
In addition: Warning message:
non-unique value when setting 'row.names': ‘1.1’
But that's not a very useful error. Perhaps including a custom error message might be useful for your final function.
Allowing the user to set vary to NULL, as you have done in your present version of the function, also doesn't seem wise to me. This yields output like this:
> head(reshapeasy( x.wide, "long", id="surveyNum", NULL, sep="_" ))
surveyNum pio caremgmt prev price
1.1 1 2 2 1 2
2.1 2 2 1 2 1
3.1 3 1 1 2 2
4.1 4 2 2 1 5
5.1 5 1 1 1 3
6.1 6 1 2 2 4
The problem with this output is that if I needed to reshape back to wide, I can't do it easily. Thus, I think that retaining reshape's default option of generating a time variable, but letting the user override that might be a useful feature.
Perhaps for those who are lazy and don't like to type the variable names, you can add the following to the head of your function:
# Let id and vary be given as column positions: numeric values are translated
# into the matching column names; anything else is left unchanged.
if (is.numeric(id)) {
  id <- colnames(data)[id]
}
if (is.numeric(vary)) {
  vary <- colnames(data)[vary]
}
Then, following with your examples, you can use the following shorthand:
reshapeasy(x.wide, direction="long", id=1, sep="_", vary="id")
reshapeasy(x.long, direction="wide", id=6, vary=1)
(I know, it might not be good practice since the code might be less readable or less easily understandable by someone later on, but it does happen frequently.)