Melting and converting badly labeled likert Scale R - r

In my survey I made a mistake when coding a 5-point Likert scale; the data look as follows:
dput(head(edu_data))
structure(list(Education.1. = structure(c(1L, 1L, 1L, 1L, 1L,
1L), .Label = c("", "Y"), class = "factor"), Education.2. = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("", "Y"), class = "factor"),
Education.3. = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("",
"Y"), class = "factor"), Education.4. = structure(c(1L, 1L,
1L, 2L, 2L, 1L), .Label = c("", "Y"), class = "factor"),
Education.5. = structure(c(2L, 2L, 2L, 1L, 1L, 1L), .Label = c("",
"Y"), class = "factor")), row.names = c(NA, 6L), class = "data.frame")
I would like to change this into one column with a single value such that
answer_to_ls= 1:5
The output I want is a column with a single number, which means getting rid of the letter. I do of course have a unique respondent ID.
Please tell me if I can somehow be clearer in the style of my question, as I want to be a valuable member of the community.

I think there are a lot of potential solutions available, try a search of merging or collapsing multiple binary or dichotomous columns into a single column. For example:
R - Convert various dummy/logical variables into a single categorical variable/factor from their name
In your case, you could try something like:
# Derive the 1-5 Likert answer from the five "Y"-flag columns.
# The scale value is read from each column name (its last run of digits), so
# multi-digit levels are handled as well. A respondent gets NA when no column
# is ticked or when more than one is ticked (ambiguous answer); this keeps the
# result a plain numeric vector instead of a list.
edu_data$answer_to_ls <- local({
  is_yes <- edu_data[1:5] == "Y"
  scale_value <- as.numeric(
    sub("^.*?(\\d+)\\D*$", "\\1", colnames(is_yes), perl = TRUE)
  )
  n_yes <- unname(rowSums(is_yes))
  # With exactly one TRUE per row, the matrix product picks that column's value.
  ifelse(n_yes == 1, as.vector(is_yes %*% scale_value), NA_real_)
})
This will extract the number from the column name for the Likert scale response 1 to 5, make it a numeric value, and include NA if there are no "Y" responses. edu_data[1:5] selects those columns to consider for conversion, in this case columns 1 through 5.
Education.1. Education.2. Education.3. Education.4. Education.5. answer_to_ls
1 Y 5
2 Y 5
3 Y 5
4 Y 4
5 Y 4
6 NA

# Sample data: five "Y"-flag columns, one per Likert level.
d <- structure(list(Education.1. = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", "Y"), class = "factor"),
Education.2. = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", "Y"), class = "factor"),
Education.3. = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", "Y"), class = "factor"),
Education.4. = structure(c(1L, 1L, 1L, 2L, 2L, 1L), .Label = c("", "Y"), class = "factor"),
Education.5. = structure(c(2L, 2L, 2L, 1L, 1L, 1L), .Label = c("", "Y"), class = "factor")),
row.names = c(NA, 6L), class = "data.frame")
# Columns are addressed by their full names: `d$Education.1` only resolves
# through `$` partial matching and would stop working as soon as another column
# starting with "Education.1" (e.g. "Education.10.") existed.
likert_levels <- 1:5
likert_cols <- paste0("Education.", likert_levels, ".")
# Indicator matrix times the scale values gives the weighted sum of ticked
# levels per row; a row without any "Y" comes out as 0.
d$item1 <- as.vector((d[likert_cols] == "Y") %*% likert_levels)
print(d)
leads to
> print(d)
Education.1. Education.2. Education.3. Education.4. Education.5. item1
1 Y 5
2 Y 5
3 Y 5
4 Y 4
5 Y 4
6 0

Related

Read excel file in R: problem with columns' labels and data/hour format

I have an excel file like this:
which I tried to read by using:
library(xlsx)
df <- read.xlsx("2021.xlsx", sheetIndex = 1)
However, I obtained a result that I do not like very much
> dput(df)
structure(list(Twitter = structure(c(3L, 1L, 1L, 2L, 2L), .Label = c("Jack",
"John", "User"), class = "factor"), NA. = structure(c(5L, 1L,
3L, 4L, 2L), .Label = c("Hello world", "Hello!", "I'm a text",
"I'm an example", "Tweet"), class = "factor"), NA..1 = structure(c(3L,
1L, 1L, 2L, 2L), .Label = c("44293", "44294", "Date"), class = "factor"),
NA..2 = structure(c(3L, 1L, 1L, 2L, 2L), .Label = c("0.490277777777778",
"0.552083333333333", "Hour"), class = "factor"), NA..3 = structure(c(3L,
1L, 1L, 2L, 2L), .Label = c("3", "4", "x"), class = "factor"),
NA..4 = structure(c(3L, 2L, 2L, 1L, 1L), .Label = c("6",
"7", "y"), class = "factor"), NA..5 = structure(c(3L, 2L,
2L, 1L, 2L), .Label = c("no", "yes", "z"), class = "factor")), class = "data.frame", row.names =
c(NA, -5L))
i.e.,
> df
Twitter NA. NA..1 NA..2 NA..3 NA..4 NA..5
1 User Tweet Date Hour x y z
2 Jack Hello world 44293 0.490277777777778 3 7 yes
3 Jack I'm a text 44293 0.490277777777778 3 7 yes
4 John I'm an example 44294 0.552083333333333 4 6 no
5 John Hello! 44294 0.552083333333333 4 6 yes
This is not the desired result. First, the date and the hour are wrong. Second, the column labels are strange (Twitter, NA., NA..1 and so on). The correct labels are instead in the first row of the data frame. I would like to obtain labels like, e.g., the following:
Twitter.User, Twitter.Tweet, Twitter.Date, Twitter.Hour, Twitter.x, Twitter.y, Twitter.z
Try read.xlsx("2021.xlsx", sheetIndex = 1, startRow = 2)

cut.default error in heatmap generation R

I want to generate a heatmap from a 8*6 dataframe. The last row in the dataframe has the information to annotate the columns. Structure of the dataframe is as follows:
heatmap_try <-structure(list(BGC0000041 = structure(c(1L, 2L, 1L, 1L, 1L, 3L
), .Label = c("0", "0.447458977", "a"), class = "factor"), BGC0000128 = structure(c(1L,
1L, 1L, 3L, 2L, 4L), .Label = c("0", "1.785875195", "4.093659107",
"a"), class = "factor"), BGC0000287 = structure(c(1L, 1L, 1L,
3L, 2L, 4L), .Label = c("0", "1.785875195", "4.456229186", "b"
), class = "factor"), BGC0000294 = structure(c(3L, 1L, 2L, 4L,
1L, 5L), .Label = c("0", "2.035046947", "3.230553742", "3.286304185",
"b"), class = "factor"), BGC0000295 = structure(c(1L, 1L, 1L,
2L, 1L, 3L), .Label = c("0", "2.286304185", "c"), class = "factor"),
BGC0000308 = structure(c(4L, 2L, 3L, 5L, 1L, 6L), .Label = c("6.277728291",
"6.313707588", "6.607936616", "6.622871165", "6.64385619",
"c"), class = "factor"), BGC0000323 = structure(c(1L, 2L,
1L, 1L, 1L, 3L), .Label = c("0", "0.447458977", "c"), class = "factor"),
BGC0000328 = structure(c(1L, 2L, 1L, 1L, 1L, 3L), .Label = c("0",
"0.447458977", "c"), class = "factor")), class = "data.frame", row.names = c("Gut",
"Oral", "Anterior_nares", "Retroauricular_crease", "Vagina",
"AL"))
My code for heatmap generation is as follows (I am using pheatmap library):
library(pheatmap)
heatmap_data1 <- heatmap_try[ c(1:5), c(1:8) ]
anotation_data <- as.data.frame(t(heatmap_try[6, ]))
row.names(anotation_data) <- colnames(heatmap_data1)
pheatmap(heatmap_data1, annotation_col = anotation_data, color = colorRampPalette(c("white","blue"))(n=100),cellwidth = 40,cellheight = 6,fontsize_row = 5,cluster_rows = F,cluster_cols = F)
However, I am getting the following error:
Error in cut.default(x, breaks = breaks, include.lowest = T) :
'x' must be numeric
What am I doing wrong?
Thanks!
This is because the columns of heatmap_data1 are factors, they need to be numeric. One way to convert is with:
# Convert every factor column to numeric via its label (as.character first,
# otherwise as.numeric would return the internal level codes).
# Assigning into `[]` keeps the data frame's row and column names;
# as.data.frame(lapply(...)) would reset the row names to 1..n and the heatmap
# rows would lose their labels.
heatmap_data1_num <- heatmap_data1
heatmap_data1_num[] <- lapply(
  heatmap_data1_num,
  function(x) as.numeric(as.character(x))
)
# then as before
pheatmap(
  heatmap_data1_num,
  annotation_col = anotation_data,
  color = colorRampPalette(c("white", "blue"))(n = 100),
  cellwidth = 40,
  cellheight = 6,
  fontsize_row = 5,
  cluster_rows = FALSE,
  cluster_cols = FALSE
)

Why looping in 1 to unique value returns 1

I have the following data:
Class Identifier Configuration Total_individuals Total_goals Step 1 2 3 4 5
Class Identifier Configuration Total_individuals Total_goals Step Root IFNE IFNE IFEQ IFEQ
Class Identifier Configuration Total_individuals Total_goals Step Root true false true false
JDayChooser d6978cda No_Reduction 1000 208 1 0 11 11 11 11
JDayChooser d6978cda No_Reduction 1000 208 2 0 11 11 11 11
JDayChooser d6978cda No_Reduction 1000 208 3 0 11 11 11 11
JDayChooser d6978cda No_Reduction 1000 208 4 0 11 11 11 11
JDayChooser d6978cda No_Reduction 1000 208 5 0 11 11 11 11
The first two lines give some information that will be used later, but for now I delete them. Then I need to use a loop with the limit of the number of Total_goals:
df <- read.csv("")
df <- df[-c(1:2), ] #to delete the first two lines
total_branches <- unique(df$Total_goals)
for(j in 1:total_branches){
print(j)
}
This gives the following results:
[1] 208
Levels: 208 Total_goals
[1] 1
First of all, why is it still printing the word Total_goals in Levels, although I removed the lines that contain this value? Also, why does the loop not work? It only prints 1.
Reproducible data:
structure(list(Class = structure(c(2L, 2L, 1L, 1L, 1L, 1L, 1L
), .Label = c("accessories.plugins.time.JDayChooser", "Class"
), class = "factor"), Identifier = structure(c(2L, 2L, 1L, 1L,
1L, 1L, 1L), .Label = c("d6978cda", "Identifier"), class = "factor"),
Configuration = structure(c(1L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("Configuration",
"No_Reduction"), class = "factor"), Total_individuals = structure(c(2L,
2L, 1L, 1L, 1L, 1L, 1L), .Label = c("1000", "Total_individuals"
), class = "factor"), Total_goals = structure(c(2L, 2L, 1L,
1L, 1L, 1L, 1L), .Label = c("208", "Total_goals"), class = "factor"),
Step = structure(c(6L, 6L, 1L, 2L, 3L, 4L, 5L), .Label = c("1",
"2", "3", "4", "5", "Step"), class = "factor"), X1 = structure(c(2L,
2L, 1L, 1L, 1L, 1L, 1L), .Label = c("0", "Root"), class = "factor"),
X2 = structure(c(2L, 3L, 1L, 1L, 1L, 1L, 1L), .Label = c("11",
"IFNE", "true"), class = "factor"), X3 = structure(c(3L,
2L, 1L, 1L, 1L, 1L, 1L), .Label = c("11", "false", "IFNE"
), class = "factor"), X4 = structure(c(2L, 3L, 1L, 1L, 1L,
1L, 1L), .Label = c("11", "IFEQ", "true"), class = "factor"),
X5 = structure(c(3L, 2L, 1L, 1L, 1L, 1L, 1L), .Label = c("11",
"false", "IFEQ"), class = "factor")), class = "data.frame", row.names = c(NA,
-7L))
The answer to both of your questions is that the columns are of class factor.
When you do :
df <- df[-c(1:2), ]
You remove the rows but the factor levels are still there.
levels(df$Total_goals)
#[1] "208" "Total_goals"
To get rid of that you need to use droplevels.
df <- droplevels(df[-c(1:2), ])
levels(df$Total_goals)
#[1] "208"
Now, even though you have dropped the unused level, Total_goals is still a factor. To convert it to numeric, do
df$Total_goals <- as.numeric(as.character(df$Total_goals))
and then run the for loop
total_branches <- unique(df$Total_goals)
# The loop bound has to be one plain number: a factor would be iterated by its
# internal code, and with several distinct values `1:total_branches` would use
# only the first one.
stopifnot(is.numeric(total_branches), length(total_branches) == 1)
# seq_len() runs zero times for a bound of 0, whereas 1:0 would count down.
for (j in seq_len(total_branches)) {
  print(j)
}

Comparing the position of 1's is matched in the strings in r

Suppose I am reading a .csv file from R whose columns contain strings of 0s and 1s. Suppose I need to compare the position of 1's and if matched then count as 1 per match and put that count in the third column.
Illustration:
dput(head(string_data))
structure(list(v_1 = structure(c(1L, 1L, 1L, 1L, 3L, 1L), .Label = c("",
"0,0,0,1", "0,0,1,0", "0,1,0,0", "1,1,0,0"), class = "factor"),
v_2 = structure(c(1L, 1L, 1L, 1L, 2L, 1L), .Label = c("",
"1,0,1,0"), class = "factor"), v_3 = structure(c(1L, 1L,
1L, 1L, 4L, 1L), .Label = c("", "0,0,0,1", "0,0,1,0", "1,0,0,0"
), class = "factor"), v_4 = structure(c(1L, 1L, 1L, 1L, 2L,
1L), .Label = c("", "0,0,0,1"), class = "factor"), v_5 = structure(c(1L,
5L, 1L, 1L, 1L, 2L), .Label = c("", "0,0,0,0,0", "0,0,0,1,0",
"0,0,1,0,0", "1,0,1,1,0"), class = "factor"), v_6 = structure(c(1L,
2L, 1L, 1L, 1L, 2L), .Label = c("", "1,0,1,1,0"), class = "factor"),
v_7 = structure(c(1L, 1L, 1L, 1L, 1L, 2L), .Label = c("",
"0,0,0,0", "0,0,0,1", "0,1,0,0", "1,0,0,0"), class = "factor"),
v_8 = structure(c(1L, 1L, 1L, 1L, 1L, 2L), .Label = c("",
"1,0,0,0"), class = "factor")), .Names = c("v_1", "v_2",
"v_3", "v_4", "v_5", "v_6", "v_7", "v_8"), row.names = c(NA,
6L), class = "data.frame")
Above I have pasted dput of head data.
I need to compare the position of 1's in (2*i-1) column with (2*i)th column (i =1,2,...,8) and put that in a third column. as number of matches.
e.g.
Suppose I have a string 0,0,1,1 in first column and 0,1,1,1 in second column then in the third column it should return 2.
Can anyone please help me out with this one.
EDIT
The counting in the third column should be based on the number of 1's in the second column string. In the example above, the second column string is 0,1,1,1, which implies that the count can vary from 0 to 3.
This couple of functions might be helpful as a starter:
# Counts the positions at which both comma-separated 0/1 strings hold a "1".
#
# s1, s2: single character strings such as "0,1,1,0" (convert factors with
#         as.character() before calling).
# Returns 0 when either string is empty or NA; otherwise the number of
# positions where both strings have "1". If the strings have different numbers
# of fields, only the positions present in both are compared (a position that
# is missing from one string cannot match).
f <- function(s1, s2) {
  # An NA input would make the condition itself NA and abort the `if`.
  if (is.na(s1) || is.na(s2) || s1 == "" || s2 == "") return(0)
  fields <- strsplit(c(s1, s2), ",", fixed = TRUE)
  common <- seq_len(min(lengths(fields)))
  ones_1 <- fields[[1]][common] == "1"
  ones_2 <- fields[[2]][common] == "1"
  # Logical AND instead of comparing a row mean to 1.0.
  sum(ones_1 & ones_2)
}
# Applies `f()` row by row to columns i and j of data set d.
# Both columns are coerced to character first, so factor columns are compared
# by their labels. The result carries no names and can be stored directly as a
# new column.
f.cols <- function(d, i, j) {
  left <- as.character(d[, i])
  right <- as.character(d[, j])
  mapply(f, left, right, USE.NAMES = FALSE)
}
Example of use:
d$out <- f.cols(d,1,2)

What reshaping problems can melt/cast not solve in a single step?

reshape2 is a package which allows a powerful array of data transformations, through its two-part melt/cast approach. However, like all tools it embeds assumptions which limit the cases it can handle.
What data reshaping problem can reshape2 not handle in its current form?
The ideal answer will include:
A description of the type of use cases where this data shape is typically found
Sample data
Code to accomplish the transformation (ideally using as much of the transformation with reshape2 as possible)
Example
"Wide" data is common in panel applications.
# Melts "wide" columns whose names contain `sep`, then splits each melted
# column name on `sep` into separate columns.
#
# data          data frame handed to melt()
# id.vars       identifier columns, passed to melt() as id.vars
# new.names     names for the pieces of each split column name after the first;
#               its length is checked against the separator count of the wide
#               column names
# sep           separator inside the wide column names
# variable.name name given to the first piece of each split column name
# ...           further arguments forwarded to melt()
#
# Returns the melted data with its `variable` column removed and the split-out
# name pieces bound on as extra columns.
#
# NOTE(review): `sep` is handed to sub(), grep(), str_count() and str_split()
# as a pattern, so the default "." presumably matches any character rather
# than a literal dot -- confirm, and escape it if a literal separator is meant.
# NOTE(review): str_count()/str_split() look like stringr and melt() like
# reshape2; stack.list() is not defined in the visible source -- verify where
# it is supposed to come from.
melt.wide <- function(data, id.vars, new.names, sep=".", variable.name="variable", ... ) {
# Guess number of variables currently wide
# Drop a trailing separator from every column name
colnames(data) <- sub( paste0(sep,"$"), "", colnames(data) )
# Columns whose name matches `sep` are treated as the wide measure variables
wide.vars <- colnames(data)[grep( sep, colnames(data) )]
# Separator count per wide column name
n.wide <- str_count( wide.vars, sep )
# Every wide column must contain exactly length(new.names) separators
stopifnot(length(new.names)==unique(n.wide))
# Melt
data.melt <- melt(data,id.vars=id.vars,measure.vars=wide.vars,...)
# Split each melted variable name on `sep`; stack.list() presumably binds the
# pieces into one row per name -- TODO confirm
new <- stack.list(str_split(data.melt$variable,sep))
colnames(new) <- c(variable.name,new.names)
# Remove the combined `variable` column before attaching the split pieces
data.melt <- subset(data.melt,select=c(-variable))
cbind(data.melt,new)
}
choice.vars <- colnames(res)[grep("_",colnames(res))]
melt.wide( subset(res,select=c("WorkerId",choice.vars)), id.vars="WorkerId", new.names=c("set","option"), sep="_")
The new function returns a melted object that can then be *cast.
Where the data is:
so <- structure(list(WorkerId = c(12L, 13L, 27L, 25L, 30L, 8L), pio_1_1 = structure(c(2L,
1L, 2L, 1L, 2L, 2L), .Label = c("No", "Yes"), class = "factor"),
pio_1_2 = structure(c(1L, 2L, 2L, 2L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), pio_1_3 = structure(c(1L, 1L,
1L, 1L, 2L, 1L), .Label = c("No", "Yes"), class = "factor"),
pio_1_4 = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "No", class = "factor"),
pio_2_1 = structure(c(1L, 2L, 2L, 1L, 1L, 2L), .Label = c("No",
"Yes"), class = "factor"), pio_2_2 = structure(c(1L, 1L,
1L, 2L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
pio_2_3 = structure(c(2L, 2L, 2L, 2L, 2L, 1L), .Label = c("No",
"Yes"), class = "factor"), pio_2_4 = structure(c(1L, 1L,
1L, 1L, 1L, 1L), .Label = "No", class = "factor"), pio_3_1 = structure(c(2L,
2L, 2L, 2L, 2L, 1L), .Label = c("No", "Yes"), class = "factor"),
pio_3_2 = structure(c(2L, 1L, 1L, 1L, 2L, 1L), .Label = c("No",
"Yes"), class = "factor"), pio_3_3 = structure(c(2L, 1L,
2L, 1L, 1L, 2L), .Label = c("No", "Yes"), class = "factor"),
pio_3_4 = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "No", class = "factor"),
pio_4_1 = structure(c(2L, 1L, 2L, 2L, 1L, 2L), .Label = c("No",
"Yes"), class = "factor"), pio_4_2 = structure(c(2L, 2L,
2L, 1L, 2L, 1L), .Label = c("No", "Yes"), class = "factor"),
pio_4_3 = structure(c(1L, 2L, 1L, 1L, 2L, 2L), .Label = c("No",
"Yes"), class = "factor"), pio_4_4 = structure(c(1L, 1L,
1L, 1L, 1L, 1L), .Label = "No", class = "factor"), caremgmt_1_1 = structure(c(2L,
2L, 1L, 2L, 1L, 2L), .Label = c("No", "Yes"), class = "factor"),
caremgmt_1_2 = structure(c(1L, 2L, 2L, 2L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), caremgmt_1_3 = structure(c(1L,
1L, 1L, 1L, 2L, 1L), .Label = c("No", "Yes"), class = "factor"),
caremgmt_1_4 = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "No", class = "factor"),
caremgmt_2_1 = structure(c(2L, 2L, 2L, 2L, 2L, 2L), .Label = c("No",
"Yes"), class = "factor"), caremgmt_2_2 = structure(c(1L,
2L, 1L, 2L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
caremgmt_2_3 = structure(c(2L, 1L, 2L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), caremgmt_2_4 = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = "No", class = "factor"), caremgmt_3_1 = structure(c(2L,
1L, 2L, 1L, 2L, 2L), .Label = c("No", "Yes"), class = "factor"),
caremgmt_3_2 = structure(c(2L, 1L, 2L, 2L, 2L, 1L), .Label = c("No",
"Yes"), class = "factor"), caremgmt_3_3 = structure(c(2L,
2L, 2L, 2L, 2L, 2L), .Label = c("No", "Yes"), class = "factor"),
caremgmt_3_4 = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "No", class = "factor"),
caremgmt_4_1 = structure(c(1L, 1L, 2L, 1L, 2L, 1L), .Label = c("No",
"Yes"), class = "factor"), caremgmt_4_2 = structure(c(2L,
2L, 2L, 2L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
caremgmt_4_3 = structure(c(1L, 1L, 1L, 1L, 1L, 2L), .Label = c("No",
"Yes"), class = "factor"), caremgmt_4_4 = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = "No", class = "factor"), prev_1_1 = structure(c(1L,
1L, 2L, 1L, 2L, 2L), .Label = c("No", "Yes"), class = "factor"),
prev_1_2 = structure(c(1L, 2L, 1L, 2L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), prev_1_3 = structure(c(2L, 1L,
1L, 2L, 2L, 2L), .Label = c("No", "Yes"), class = "factor"),
prev_1_4 = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "No", class = "factor"),
prev_2_1 = structure(c(1L, 1L, 2L, 1L, 2L, 2L), .Label = c("No",
"Yes"), class = "factor"), prev_2_2 = structure(c(2L, 2L,
1L, 2L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
prev_2_3 = structure(c(1L, 2L, 1L, 1L, 2L, 2L), .Label = c("No",
"Yes"), class = "factor"), prev_2_4 = structure(c(1L, 1L,
1L, 1L, 1L, 1L), .Label = "No", class = "factor"), prev_3_1 = structure(c(1L,
2L, 1L, 1L, 2L, 1L), .Label = c("No", "Yes"), class = "factor"),
prev_3_2 = structure(c(1L, 1L, 2L, 1L, 2L, 2L), .Label = c("No",
"Yes"), class = "factor"), prev_3_3 = structure(c(2L, 2L,
1L, 2L, 2L, 2L), .Label = c("No", "Yes"), class = "factor"),
prev_3_4 = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "No", class = "factor"),
prev_4_1 = structure(c(1L, 2L, 2L, 1L, 2L, 2L), .Label = c("No",
"Yes"), class = "factor"), prev_4_2 = structure(c(1L, 1L,
2L, 1L, 2L, 2L), .Label = c("No", "Yes"), class = "factor"),
prev_4_3 = structure(c(1L, 1L, 1L, 2L, 2L, 1L), .Label = c("No",
"Yes"), class = "factor"), prev_4_4 = structure(c(1L, 1L,
1L, 1L, 1L, 1L), .Label = "No", class = "factor"), price_1_1 = structure(c(30L,
12L, 1L, 16L, 28L, 17L), .Label = c("$2,500", "$2,504", "$2,507",
"$2,509", "$2,512", "$2,513", "$2,515", "$2,526", "$2,547",
"$2,548", "$2,578", "$2,588", "$2,594", "$2,605", "$2,607",
"$2,617", "$2,618", "$2,622", "$2,635", "$2,649", "$2,670",
"$2,672", "$2,679", "$2,681", "$2,698", "$2,704", "$2,721",
"$2,782", "$2,851", "$2,884", "$2,919", "$2,925", "$2,935",
"$3,022"), class = "factor"), price_1_2 = structure(c(1L,
19L, 5L, 17L, 7L, 1L), .Label = c("$2,500", "$2,501", "$2,502",
"$2,504", "$2,513", "$2,515", "$2,517", "$2,532", "$2,535",
"$2,558", "$2,564", "$2,571", "$2,575", "$2,578", "$2,608",
"$2,633", "$2,634", "$2,675", "$2,678", "$2,687", "$2,730",
"$2,806", "$2,827", "$2,848", "$2,891", "$2,901", "$2,923",
"$2,933", "$2,937", "$2,958", "$2,987"), class = "factor"),
price_1_3 = structure(c(11L, 1L, 1L, 8L, 19L, 14L), .Label = c("$2,500",
"$2,504", "$2,507", "$2,513", "$2,516", "$2,518", "$2,564",
"$2,579", "$2,580", "$2,583", "$2,584", "$2,592", "$2,604",
"$2,608", "$2,639", "$2,643", "$2,646", "$2,665", "$2,667",
"$2,695", "$2,698", "$2,709", "$2,710", "$2,713", "$2,714",
"$2,750", "$2,757", "$2,876", "$2,978", "$2,984", "$3,024",
"$3,059"), class = "factor"), price_1_4 = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = "$2,500", class = "factor"),
price_2_1 = structure(c(27L, 32L, 19L, 22L, 4L, 26L), .Label = c("$2,500",
"$2,504", "$2,505", "$2,510", "$2,511", "$2,512", "$2,515",
"$2,517", "$2,518", "$2,529", "$2,533", "$2,537", "$2,551",
"$2,553", "$2,574", "$2,593", "$2,600", "$2,605", "$2,608",
"$2,612", "$2,613", "$2,618", "$2,639", "$2,657", "$2,714",
"$2,730", "$2,747", "$2,764", "$2,771", "$2,773", "$2,813",
"$2,859", "$2,901", "$3,019", "$3,037"), class = "factor"),
price_2_2 = structure(c(12L, 2L, 1L, 27L, 1L, 7L), .Label = c("$2,500",
"$2,502", "$2,510", "$2,514", "$2,515", "$2,516", "$2,517",
"$2,518", "$2,520", "$2,521", "$2,523", "$2,536", "$2,544",
"$2,575", "$2,583", "$2,592", "$2,602", "$2,624", "$2,644",
"$2,652", "$2,662", "$2,677", "$2,720", "$2,761", "$2,765",
"$2,770", "$2,772", "$2,835", "$2,873", "$2,911", "$2,950",
"$2,962"), class = "factor"), price_2_3 = structure(c(32L,
1L, 8L, 33L, 29L, 11L), .Label = c("$2,500", "$2,506", "$2,507",
"$2,510", "$2,511", "$2,512", "$2,515", "$2,517", "$2,527",
"$2,528", "$2,540", "$2,554", "$2,562", "$2,565", "$2,568",
"$2,581", "$2,597", "$2,611", "$2,616", "$2,631", "$2,652",
"$2,663", "$2,671", "$2,672", "$2,685", "$2,727", "$2,731",
"$2,742", "$2,771", "$2,778", "$2,781", "$2,970", "$2,984",
"$2,986", "$3,030"), class = "factor"), price_2_4 = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = "$2,500", class = "factor"),
price_3_1 = structure(c(24L, 1L, 28L, 7L, 18L, 21L), .Label = c("$2,500",
"$2,501", "$2,503", "$2,505", "$2,509", "$2,512", "$2,535",
"$2,537", "$2,542", "$2,553", "$2,556", "$2,560", "$2,561",
"$2,574", "$2,584", "$2,618", "$2,624", "$2,629", "$2,637",
"$2,664", "$2,761", "$2,840", "$2,875", "$2,883", "$2,891",
"$2,933", "$2,953", "$2,978", "$3,039", "$3,043", "$3,067"
), class = "factor"), price_3_2 = structure(c(3L, 1L, 5L,
19L, 25L, 9L), .Label = c("$2,500", "$2,501", "$2,503", "$2,504",
"$2,512", "$2,517", "$2,540", "$2,543", "$2,546", "$2,560",
"$2,567", "$2,573", "$2,586", "$2,592", "$2,594", "$2,603",
"$2,604", "$2,606", "$2,628", "$2,633", "$2,635", "$2,693",
"$2,696", "$2,714", "$2,734", "$2,739", "$2,770", "$2,791",
"$2,797", "$2,936", "$2,967", "$3,021", "$3,024"), class = "factor"),
price_3_3 = structure(c(26L, 7L, 5L, 32L, 10L, 24L), .Label = c("$2,500",
"$2,501", "$2,502", "$2,505", "$2,506", "$2,507", "$2,508",
"$2,509", "$2,512", "$2,515", "$2,519", "$2,547", "$2,556",
"$2,574", "$2,587", "$2,592", "$2,608", "$2,616", "$2,621",
"$2,635", "$2,638", "$2,667", "$2,671", "$2,688", "$2,694",
"$2,700", "$2,717", "$2,759", "$2,809", "$2,864", "$2,891",
"$2,912", "$3,011", "$3,012"), class = "factor"), price_3_4 = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = "$2,500", class = "factor"),
price_4_1 = structure(c(29L, 13L, 16L, 24L, 33L, 19L), .Label = c("$2,500",
"$2,505", "$2,506", "$2,508", "$2,511", "$2,525", "$2,549",
"$2,562", "$2,577", "$2,582", "$2,586", "$2,591", "$2,621",
"$2,636", "$2,654", "$2,670", "$2,722", "$2,726", "$2,733",
"$2,744", "$2,745", "$2,755", "$2,768", "$2,805", "$2,817",
"$2,827", "$2,835", "$2,888", "$2,925", "$2,959", "$3,001",
"$3,027", "$3,061", "$3,071"), class = "factor"), price_4_2 = structure(c(33L,
31L, 21L, 16L, 25L, 13L), .Label = c("$2,500", "$2,502",
"$2,503", "$2,505", "$2,506", "$2,511", "$2,513", "$2,516",
"$2,529", "$2,539", "$2,547", "$2,554", "$2,557", "$2,562",
"$2,567", "$2,579", "$2,581", "$2,583", "$2,585", "$2,591",
"$2,612", "$2,629", "$2,640", "$2,670", "$2,695", "$2,726",
"$2,737", "$2,788", "$2,790", "$2,798", "$2,852", "$3,031",
"$3,063"), class = "factor"), price_4_3 = structure(c(4L,
30L, 4L, 19L, 1L, 27L), .Label = c("$2,500", "$2,504", "$2,507",
"$2,509", "$2,511", "$2,512", "$2,514", "$2,516", "$2,543",
"$2,552", "$2,562", "$2,575", "$2,578", "$2,581", "$2,594",
"$2,614", "$2,615", "$2,617", "$2,636", "$2,640", "$2,641",
"$2,652", "$2,749", "$2,755", "$2,805", "$2,812", "$2,867",
"$2,906", "$2,910", "$2,917", "$2,924", "$2,927", "$2,961",
"$3,028", "$3,053", "$3,054"), class = "factor"), price_4_4 = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = "$2,500", class = "factor"),
plan_1_1 = structure(c(2L, 2L, 2L, 1L, 1L, 2L), .Label = c("",
"X"), class = "factor"), plan_1_2 = structure(c(1L, 1L, 1L,
2L, 1L, 1L), .Label = c("", "X"), class = "factor"), plan_1_3 = structure(c(1L,
1L, 1L, 1L, 2L, 1L), .Label = c("", "X"), class = "factor"),
plan_1_4 = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("",
"X"), class = "factor"), plan_2_1 = structure(c(1L, 2L, 1L,
2L, 2L, 2L), .Label = c("", "X"), class = "factor"), plan_2_2 = structure(c(1L,
1L, 2L, 1L, 1L, 1L), .Label = c("", "X"), class = "factor"),
plan_2_3 = structure(c(2L, 1L, 1L, 1L, 2L, 1L), .Label = c("",
"X"), class = "factor"), plan_2_4 = structure(c(1L, 1L, 1L,
1L, 1L, 1L), .Label = c("", "X"), class = "factor"), plan_3_1 = structure(c(1L,
2L, 1L, 1L, 2L, 1L), .Label = c("", "X"), class = "factor"),
plan_3_2 = structure(c(1L, 1L, 1L, 2L, 1L, 1L), .Label = c("",
"X"), class = "factor"), plan_3_3 = structure(c(2L, 1L, 1L,
1L, 1L, 2L), .Label = c("", "X"), class = "factor"), plan_3_4 = structure(c(1L,
1L, 2L, 1L, 1L, 1L), .Label = c("", "X"), class = "factor"),
plan_4_1 = structure(c(2L, 2L, 1L, 1L, 1L, 1L), .Label = c("",
"X"), class = "factor"), plan_4_2 = structure(c(2L, 1L, 1L,
2L, 1L, 1L), .Label = c("", "X"), class = "factor"), plan_4_3 = structure(c(1L,
1L, 1L, 1L, 2L, 2L), .Label = c("", "X"), class = "factor"),
plan_4_4 = structure(c(1L, 1L, 2L, 1L, 1L, 1L), .Label = c("",
"X"), class = "factor")), .Names = c("WorkerId", "pio_1_1",
"pio_1_2", "pio_1_3", "pio_1_4", "pio_2_1", "pio_2_2", "pio_2_3",
"pio_2_4", "pio_3_1", "pio_3_2", "pio_3_3", "pio_3_4", "pio_4_1",
"pio_4_2", "pio_4_3", "pio_4_4", "caremgmt_1_1", "caremgmt_1_2",
"caremgmt_1_3", "caremgmt_1_4", "caremgmt_2_1", "caremgmt_2_2",
"caremgmt_2_3", "caremgmt_2_4", "caremgmt_3_1", "caremgmt_3_2",
"caremgmt_3_3", "caremgmt_3_4", "caremgmt_4_1", "caremgmt_4_2",
"caremgmt_4_3", "caremgmt_4_4", "prev_1_1", "prev_1_2", "prev_1_3",
"prev_1_4", "prev_2_1", "prev_2_2", "prev_2_3", "prev_2_4", "prev_3_1",
"prev_3_2", "prev_3_3", "prev_3_4", "prev_4_1", "prev_4_2", "prev_4_3",
"prev_4_4", "price_1_1", "price_1_2", "price_1_3", "price_1_4",
"price_2_1", "price_2_2", "price_2_3", "price_2_4", "price_3_1",
"price_3_2", "price_3_3", "price_3_4", "price_4_1", "price_4_2",
"price_4_3", "price_4_4", "plan_1_1", "plan_1_2", "plan_1_3",
"plan_1_4", "plan_2_1", "plan_2_2", "plan_2_3", "plan_2_4", "plan_3_1",
"plan_3_2", "plan_3_3", "plan_3_4", "plan_4_1", "plan_4_2", "plan_4_3",
"plan_4_4"), row.names = c(NA, 6L), class = "data.frame")
... almost a year later...
This came to mind the other day, and I have a sneaking suspicion that it is what you tried to show in your example, but unfortunately, your example code doesn't run!
melt sometimes takes things a bit too far for me when making my data "long". Sometimes, even though it is not what would necessarily be called "tidy data", I prefer to have a "semi-long" data.frame. This is easily achieved using base R's reshape, but requires a few extra steps with the "reshape2" package, as demonstrated below:
Prerequisite: sample data.
set.seed(1)
myDf <- data.frame(
ID.1 = sample(letters[1:5], 5, replace = TRUE),
ID.2 = 1:5,
V.1 = sample(10:14, 5, replace = TRUE),
V.2 = sample(5:9, 5, replace = TRUE),
V.3 = sample(3:14, 5, replace = TRUE),
W.1 = sample(LETTERS, 5, replace = TRUE),
W.2 = sample(LETTERS, 5, replace = TRUE),
W.3 = sample(LETTERS, 5, replace = TRUE)
)
myDf
# ID.1 ID.2 V.1 V.2 V.3 W.1 W.2 W.3
# 1 b 1 14 6 8 Y K M
# 2 b 2 14 5 11 F A P
# 3 c 3 13 8 14 Q J M
# 4 e 4 13 6 7 D W E
# 5 b 5 10 8 12 G I V
The "semi-long" output that I'm looking for. Easily achieved with base R's reshape.
reshape(myDf, direction = "long", idvar=1:2, varying = 3:ncol(myDf))
# ID.1 ID.2 time V W
# b.1.1 b 1 1 14 Y
# b.2.1 b 2 1 14 F
# c.3.1 c 3 1 13 Q
# e.4.1 e 4 1 13 D
# b.5.1 b 5 1 10 G
# b.1.2 b 1 2 6 K
# b.2.2 b 2 2 5 A
# c.3.2 c 3 2 8 J
# e.4.2 e 4 2 6 W
# b.5.2 b 5 2 8 I
# b.1.3 b 1 3 8 M
# b.2.3 b 2 3 11 P
# c.3.3 c 3 3 14 M
# e.4.3 e 4 3 7 E
# b.5.3 b 5 3 12 V
melt is great if you wanted the equivalent of stack, especially since stack discards all factor variables, which is frustrating when read.table and family defaults to stringsAsFactors = TRUE. (You can make it work, but you need to convert the relevant columns to character before you can use stack). But, it is not what I'm looking for, in particular because of how it has handled the "variable" column.
library(reshape2)
myDfL <- melt(myDf, id.vars=1:2)
head(myDfL)
# ID.1 ID.2 variable value
# 1 b 1 V.1 14
# 2 b 2 V.1 14
# 3 c 3 V.1 13
# 4 e 4 V.1 13
# 5 b 5 V.1 10
# 6 b 1 V.2 6
To fix this, one needs to first split the "variable" column, and then use dcast to get the same format of output as you would get from reshape.
myDfL <- cbind(myDfL, colsplit(myDfL$variable, "\\.", names=c("var", "time")))
dcast(myDfL, ID.1 + ID.2 + time ~ var, value.var="value")
# ID.1 ID.2 time V W
# 1 b 1 1 14 Y
# 2 b 1 2 6 K
# 3 b 1 3 8 M
# 4 b 2 1 14 F
# 5 b 2 2 5 A
# 6 b 2 3 11 P
# 7 b 5 1 10 G
# 8 b 5 2 8 I
# 9 b 5 3 12 V
# 10 c 3 1 13 Q
# 11 c 3 2 8 J
# 12 c 3 3 14 M
# 13 e 4 1 13 D
# 14 e 4 2 6 W
# 15 e 4 3 7 E

Resources