Related
This question already has answers here:
How to reshape data from long to wide format
(14 answers)
Closed 1 year ago.
I have a table which has 5 columns (ID, var, state, loc and position). The var column contains a description of a certain variant e.g. var1. Within the table there are multiple rows which include var 1 but they have a different state and position. What I want to do is make a new table where each var is included only once and the position is included in two columns based on its state.
For example, say I have four var1 rows; two with the state H and two with the state h. In the new table I need the columns to be: sample - var - loc - position if H and position if h - such that all the information for var 1 is in one row. I would need to be able to do this for every single variant in my original data set.
Current data example
structure(list(ID = c(1234L, 1234L, 1234L, 1234L, 5678L, 5678L,
NA, NA, NA, NA), var = c("var1", "var1", "var1", "var1", "var2",
"var2", NA, NA, NA, NA), state = c("H", "H", "h", "h", "H", "h",
NA, NA, NA, NA), loc = c(4L, 4L, 4L, 4L, 12L, 12L, NA, NA, NA,
NA), position = c(6000L, 6002L, 6004L, 6006L, 3002L, 3004L, NA,
NA, NA, NA)), row.names = c("1", "2", "3", "4", "5", "6", "NA",
"NA.1", "NA.2", "NA.3"), class = "data.frame")
wanted format
structure(list(V1 = c("ID", "1234", "5678", NA, NA, NA, NA, NA,
NA, NA), V2 = c("var1", "var1", "var2", NA, NA, NA, NA, NA, NA,
NA), V3 = c("loc", "4", "12", NA, NA, NA, NA, NA, NA, NA), V4 = c("state H",
"6000 6002", "3002", NA, NA, NA, NA, NA, NA, NA), V5 = c("state h",
"6004 6006", "3004", NA, NA, NA, NA, NA, NA, NA)), row.names = c("1",
"2", "3", "NA", "NA.1", "NA.2", "NA.3", "NA.4", "NA.5", "NA.6"
), class = "data.frame")
Any guidance would be appreciated.
The answer to your question most likely revolves around tidyr::pivot_wider.
I changed the example data because I believe yours was inconsistent.
Data
df<-structure(list(ID = c(1234L, 1234L, 1234L, 1234L, 5678L, 5678L
), var = c("var1", "var1", "var1", "var1", "var2", "var2"), state = c("H",
"H", "h", "h", "H", "h"), loc = c(4L, 4L, 4L, 4L, 12L, 12L),
position = c(6000L, 6002L, 6004L, 6006L, 3002L, 3004L)), row.names = c("1",
"2", "3", "4", "5", "6"), class = "data.frame")
df
ID var state loc position
1 1234 var1 H 4 6000
2 1234 var1 H 4 6002
3 1234 var1 h 4 6004
4 1234 var1 h 4 6006
5 5678 var2 H 12 3002
6 5678 var2 h 12 3004
Answer
library(tidyr)
# Spread `position` into one column per `state`; when several rows share the
# same remaining columns and state, toString() joins their positions with ", ".
pivot_wider(
  df,
  names_from = state,
  values_from = position,
  values_fn = toString
)
# A tibble: 2 × 5
ID var loc H h
<int> <chr> <int> <chr> <chr>
1 1234 var1 4 6000, 6002 6004, 6006
2 5678 var2 12 3002 3004
I am trying to automate the R code below, in which I calculate p-values. The data is in CSV format.
I have clicks and open number for each section and their version.
If someone can help with applying any loop or something.
Data I have in .csv format:
` Total Clicks
Section Version A Version B Version C Version D
Section1 1,999 2,116 2,307 2,568
Section2 3,450 1,781 3,416 1,399
Section3 1,773 915 1,744 644
Section4 0 2,255 0 1,432
Section5 588 573 721 235
Main email 7,222 7,067 7,467 6,043
Total email 7,810 7,640 8,188 6,278
`
`Version # Opens
A 9,073
B 9,150
C 9,215
D 9,153
`
Currently I am assigning the data manually in the below format:
` S1_Click_A=1,999 ####(section 1, email A)
S1_Click_B=2,116 ## (section 1, email B)
S1_Click_C=2,307
S1_Click_D=2,568
S2_Click_A=3,450
S2_Click_B=1,781
.
.
.
S5_Click_C=721
S5_Click_D=235
MainBody_Click_A=7,222
MainBody_Click_B=7,067
.
.
TotalEmail_Click_C=8,188
TotalEmail_Click_D=6,278
`
`# to test % total click is the comparable across versions`
`# section 1 test
S1ab <- prop.test(x = c(S1_Click_A,S1_Click_B), n = c(Open_A,Open_B))
S1ac <- prop.test(x = c(S1_Click_A,S1_Click_C), n = c(Open_A,Open_C))
S1ad <- prop.test(x = c(S1_Click_A,S1_Click_D), n = c(Open_A,Open_D))
S1bc <- prop.test(x = c(S1_Click_B,S1_Click_C), n = c(Open_B,Open_C))
S1bd <- prop.test(x = c(S1_Click_B,S1_Click_D), n = c(Open_B,Open_D))
S1cd <- prop.test(x = c(S1_Click_C,S1_Click_D), n = c(Open_C,Open_D))
`
`#section 2 test
S2ab <- prop.test(x = c(S2_Click_A,S2_Click_B), n = c(Open_A,Open_B))
S2ac <- prop.test(x = c(S2_Click_A,S2_Click_C), n = c(Open_A,Open_C))
```
S2cd <- prop.test(x = c(S2_Click_C,S2_Click_D), n = c(Open_C,Open_D))
`
`#section 3 test
S3ab <- prop.test(x = c(S3_Click_A,S3_Click_B), n = c(Open_A,Open_B))
S3ac <- prop.test(x = c(S3_Click_A,S3_Click_C), n = c(Open_A,Open_C))
```
S3cd <- prop.test(x = c(S3_Click_C,S3_Click_D), n = c(Open_C,Open_D))`
`#section 4 test
S4ab <- prop.test(x = c(S4_Click_A,S4_Click_B), n = c(Open_A,Open_B))
S4ac <- prop.test(x = c(S4_Click_A,S4_Click_C), n = c(Open_A,Open_C))
`#section 5 test
S5ab <- prop.test(x = c(S5_Click_A,S5_Click_B), n = c(Open_A,Open_B))
S5ac <- prop.test(x = c(S5_Click_A,S5_Click_C), n = c(Open_A,Open_C))
`#Main body test
MainBodyab <- prop.test(x = c(MainBody_Click_A,MainBody_Click_B), n = c(Open_A,Open_B))
MainBodyac <- prop.test(x = c(MainBody_Click_A,MainBody_Click_C), n = c(Open_A,Open_C))
`
```
`
`#FINAL P VALUE`
`S1ab$p.value
S1ac$p.value
S1ad$p.value
`
I expect:
1. I want to read the data in the above format. I mean reading the
section 1 version A data i.e 1,999 and assigning the same to
S1_Click_A=1,999 similarly for others.
2. a matrix with their clicks and p values in a single row.
dput()
structure(list(Section = structure(c(2L, 3L, 4L, 5L, 6L, 1L, 7L), .Label =
c("Main email body", "Section 1", "Section 2", "Section 3", "Section 4",
"Section 5", "Total email"), class = "factor"), Version.A = c(2967L, 4840L,
2508L, 2093L, 1117L, 12408L, 13525L), Version.B = c(3353L, 4522L, 2250L,
1333L, 925L, 11458L, 12383L), Version.C = c(495L, 285L, 228L, 209L, 186L,
282L, 271L), Version.D = c(559L, 266L, 205L, 133L, 154L, 260L, 248L)), class
= "data.frame", row.names = c(NA, -7L ))
dput for final format
structure(list(Section = structure(c(2L, 3L, 4L, 5L, 6L, 1L,
7L), .Label = c("Main email body", "Section 1", "Section 2",
"Section 3", "Section 4", "Section 5", "Total email"), class = "factor"),
Version.A = structure(c(3L, 4L, 2L, 1L, 5L, 6L, 7L), .Label = c("0",
"1,773", "1,999", "3,450", "588", "7,222", "7,810"), class = "factor"),
Version.B = structure(c(2L, 1L, 7L, 3L, 4L, 5L, 6L), .Label = c("1,781",
"2,116", "2,255", "573", "7,067", "7,640", "915"), class = "factor"),
Version.C = structure(c(3L, 4L, 2L, 1L, 6L, 5L, 7L), .Label = c("0",
"1,744", "2,307", "3,416", "7,467", "721", "8,188"), class = "factor"),
Version.D = structure(c(3L, 1L, 7L, 2L, 4L, 5L, 6L), .Label = c("1,399",
"1,432", "2,568", "235", "6,043", "6,278", "644"), class = "factor"),
A.vs..B = c(NA, NA, NA, NA, NA, NA, NA), A.vs..C = c(NA,
NA, NA, NA, NA, NA, NA), A.vs..D = c(NA, NA, NA, NA, NA,
NA, NA), B.vs..C = c(NA, NA, NA, NA, NA, NA, NA), B.vs..D = c(NA,
NA, NA, NA, NA, NA, NA), C.vs..D = c(NA, NA, NA, NA, NA,
NA, NA)), class = "data.frame", row.names = c(NA, -7L))
Here is a solution for first section, same principle for the others.
First generate the combinations then apply the test on them.
# Click counts: one row per section, one column per email version.
df <- structure(
  list(
    Section = structure(
      c(2L, 3L, 4L, 5L, 6L, 1L, 7L),
      .Label = c("Main email body", "Section 1", "Section 2", "Section 3",
                 "Section 4", "Section 5", "Total email"),
      class = "factor"
    ),
    Version.A = c(2967L, 4840L, 2508L, 2093L, 1117L, 12408L, 13525L),
    Version.B = c(3353L, 4522L, 2250L, 1333L, 925L, 11458L, 12383L),
    Version.C = c(495L, 285L, 228L, 209L, 186L, 282L, 271L),
    Version.D = c(559L, 266L, 205L, 133L, 154L, 260L, 248L)
  ),
  class = "data.frame",
  row.names = c(NA, -7L)
)

# Number of opens per email version.
opens <- data.frame(A = 9073, B = 9150, C = 9215, D = 9153)

# Every unordered pair of versions, once as click-column names and once as
# open-column names; both matrices list the pairs in the same order.
s1_comb <- combn(colnames(df)[-1], 2)
open_comb <- combn(colnames(opens), 2)

# Column headers of the p-value columns ("A vs B", "A vs C", ...).
pair_labels <- paste(open_comb[1, ], "vs", open_comb[2, ])

# Result table: the click counts followed by one NA-filled column per pair.
p_values <- matrix(
  NA,
  nrow = nrow(df),
  ncol = ncol(open_comb),
  dimnames = list(seq_len(nrow(df)), pair_labels)
)
res <- cbind(df, p_values)

# Only the first nrow(df) - 2 rows are tested; the last two rows keep NA.
tested_rows <- seq_len(nrow(df) - 2)
for (pair in seq_len(ncol(s1_comb))) {
  for (i in tested_rows) {
    # Two-sample proportion test: clicks of both versions against their opens.
    test <- prop.test(
      x = unlist(df[i, s1_comb[, pair]]),
      n = unlist(opens[open_comb[, pair]])
    )
    res[i, pair_labels[pair]] <- test$p.value
  }
}
res
Final output, as requested :
> res
Section Version.A Version.B Version.C Version.D A vs B A vs C A vs D B vs C B vs D
1 Section 1 2967 3353 495 559 2.452892e-08 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
2 Section 2 4840 4522 285 266 1.259231e-07 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
3 Section 3 2508 2250 228 205 2.961113e-06 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
4 Section 4 2093 1333 209 133 1.081110e-48 0.000000e+00 0.000000e+00 4.567813e-198 6.505394e-234
5 Section 5 1117 925 186 154 2.756287e-06 7.420214e-161 3.232226e-174 1.051130e-116 4.618885e-129
6 Main email body 12408 11458 282 260 NA NA NA NA NA
7 Total email 13525 12383 271 248 NA NA NA NA NA
C vs D
1 3.472031e-02
2 4.850847e-01
3 3.178608e-01
4 5.557843e-05
5 1.022220e-01
6 NA
7 NA
You should have a look into RMarkdown, which can be used to create fully reproducible reports.
You basically write a script, the script loads your data and performs analysis and creates an output document (PDF, HTML).
RStudio is a great, free IDE that can be used to write RMarkdown documents.
I would like to be able to create a new variable based on specific values in two existing variables. My dataframe looks like:
structure(list(id = structure(c(1L, 2L, 3L, NA, NA, NA), .Label = c("blue",
"red", "yellow"), class = "factor"), value = c(-4.3, -2.5, -3.6,
NA, NA, NA)), .Names = c("id", "value"), row.names = c(NA, -6L
), class = "data.frame")
I would like to create a new column that contains only those values that pertain to blue (e.g., -4.3). All other values would result in NA, like so:
structure(list(id = structure(c(1L, 2L, 3L, NA, NA, NA), .Label = c("blue",
"red", "yellow"), class = "factor"), value = c(-4.3, -2.5, -3.6,
NA, NA, NA), newvalue = c(-4.3, NA, NA, NA, NA, NA)), .Names = c("id",
"value", "newvalue"), row.names = c(NA, -6L), class = "data.frame")
I tried the following:
# Logical mask marking the rows whose id equals "blue" (NA where id is missing).
b1 <- dat$id=="blue"
# NOTE(review): dat$value[b1] returns only the selected elements rather than a
# full-length column, so the assignment is not aligned row by row with dat.
dat$newvalue <- dat$value[b1]
But that filled every cell in the new column with the same value (-4.3).
Due to the presence of NAs, it becomes tricky to assign values directly using indexing. We can use replace instead, where we replace any non-"blue" value with NA.
# Copy `value`, overwriting with NA every entry whose id is known and is not
# "blue"; rows with a missing id already hold NA in `value` in this data.
dat$newvalue <- replace(dat$value, dat$id != "blue", NA)
dat
# id value newvalue
#1 blue -4.3 -4.3
#2 red -2.5 NA
#3 yellow -3.6 NA
#4 <NA> NA NA
#5 <NA> NA NA
#6 <NA> NA NA
The equivalent ifelse statement would be :
# NA where id is not "blue", the original value otherwise; a missing id makes
# the test NA, and ifelse() returns NA for those rows as well.
dat$newvalue <- ifelse(dat$id != "blue", NA, dat$value)
I'm trying to build a grouped bar chart in R. I have pasted the dataframe below. I have been using plotly to build the chart. The problem is, the numbers on Y axis are not proper, as in they do not increase in ascending order. I've also posted an image of graph formed.
Can someone please point out, where I'm going wrong?
Dataframe
chart.supp.part.defect.matrix
Supplier PaintMarking45 Seal78 AirConditioning57 Engine34 CargoCompartment543 Insulation11
1 HJRU 8 <NA> <NA> 1 <NA> <NA>
2 DJDU <NA> 1 <NA> <NA> <NA> <NA>
3 DEF7 <NA> 3 54 <NA> <NA> <NA>
4 A23 <NA> <NA> <NA> 7 <NA> <NA>
5 A52 3 <NA> <NA> <NA> 2 <NA>
6 FJUE 65 <NA> 1 <NA> <NA> 11
7 A31 <NA> 1 5 <NA> <NA> <NA>
8 DJHD <NA> <NA> <NA> <NA> <NA> <NA>
9 A38 4 <NA> 22 <NA> <NA> <NA>
Code to build chart
# Chart title.
title <- paste( "Supplier vs Defect")
# Grouped bar chart: Supplier on the x axis and one bar trace per defect
# column, with the y axis labelled 'Defect Count'.
p3 <- plot_ly(chart.supp.part.defect.matrix, x = ~Supplier, y = ~PaintMarking45, type = 'bar', name = 'Paint/Marking-45') %>%
add_trace(y = ~Seal78,name = 'Seal-78') %>%
add_trace(y = ~AirConditioning57,name = 'Air conditioning - 57') %>%
add_trace(y = ~Engine34,name = 'Engine-34') %>%
add_trace(y = ~CargoCompartment543,name = 'Cargo compartment-543') %>%
add_trace(y = ~Insulation11 ,name = 'Insulation -11') %>%
add_trace(y = ~Insulation6,name = 'Insulation-6') %>%
add_trace(y = ~Engine11,name = 'Engine-11') %>%
add_trace(y = ~Propulsion32,name = 'Propulsion-32') %>%
layout(yaxis = list(title = 'Defect Count'), barmode = 'group') %>%
layout(title = title)
# NOTE(review): p3 was built with plot_ly(), not ggplot2 — confirm whether the
# ggplotly() conversion is needed here.
ggplotly(p3)
Chart
Edit
dput(chart.supp.part.defect.matrix)
structure(list(Supplier = structure(c(9L, 6L, 5L, 1L, 4L, 8L,
2L, 7L, 3L), .Label = c(" A23", " A31", " A38", " A52", " DEF7",
"DJDU", "DJHD", "FJUE", "HJRU"), class = "factor"), PaintMarking45 = structure(c(4L,
NA, NA, NA, 1L, 3L, NA, NA, 2L), .Label = c("3", "4", "65", "8"
), class = "factor"), Seal78 = structure(c(NA, 1L, 2L, NA, NA,
NA, 1L, NA, NA), .Label = c("1", "3"), class = "factor"), AirConditioning57 = structure(c(NA,
NA, 4L, NA, NA, 1L, 3L, NA, 2L), .Label = c("1", "22", "5", "54"
), class = "factor"), Engine34 = structure(c(1L, NA, NA, 2L,
NA, NA, NA, NA, NA), .Label = c("1", "7"), class = "factor"),
CargoCompartment543 = structure(c(NA, NA, NA, NA, 1L, NA,
NA, NA, NA), .Label = "2", class = "factor"), Insulation11 = structure(c(NA,
NA, NA, NA, NA, 1L, NA, NA, NA), .Label = "11", class = "factor"),
Insulation6 = structure(c(NA, NA, NA, NA, NA, NA, 1L, NA,
NA), .Label = "7", class = "factor"), Engine11 = structure(c(NA,
NA, NA, NA, NA, NA, 2L, 1L, NA), .Label = c("54", "8"), class = "factor"),
Propulsion32 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA,
1L), .Label = "2", class = "factor")), .Names = c("Supplier",
"PaintMarking45", "Seal78", "AirConditioning57", "Engine34",
"CargoCompartment543", "Insulation11", "Insulation6", "Engine11",
"Propulsion32"), row.names = c(NA, -9L), class = "data.frame")
In addition to Adam Spannbauer's approach you can also force Plotly to interpret the data as numbers by setting the yaxis type to linear
layout(yaxis=list(type='linear'))
As @neilfws mentioned in a comment, the issue is that your y-axis data is being built from factors. You can attempt to fix this when reading your data (as @neilfws mentioned) or coerce your data to numeric before plotting. Below is how you can do the latter.
# The count columns are factors, so as.numeric() alone would return the
# internal level codes instead of the displayed numbers (e.g. "8" -> 4).
# Going through as.character() first recovers the real counts.
chart.supp.part.defect.matrix[, 2:10] <- lapply(
  chart.supp.part.defect.matrix[, 2:10],
  function(x) as.numeric(as.character(x))
)

# Grouped bar chart: Supplier on the x axis and one bar trace per defect
# column; `title` is the chart title defined earlier in the script.
p3 <- plot_ly(chart.supp.part.defect.matrix, x = ~Supplier, y = ~PaintMarking45,
              type = 'bar', name = 'Paint/Marking-45') %>%
  add_trace(y = ~Seal78, name = 'Seal-78') %>%
  add_trace(y = ~AirConditioning57, name = 'Air conditioning - 57') %>%
  add_trace(y = ~Engine34, name = 'Engine-34') %>%
  add_trace(y = ~CargoCompartment543, name = 'Cargo compartment-543') %>%
  add_trace(y = ~Insulation11, name = 'Insulation -11') %>%
  add_trace(y = ~Insulation6, name = 'Insulation-6') %>%
  add_trace(y = ~Engine11, name = 'Engine-11') %>%
  add_trace(y = ~Propulsion32, name = 'Propulsion-32') %>%
  layout(yaxis = list(title = 'Defect Count'), barmode = 'group') %>%
  layout(title = title)
p3
Additionally, you don't need to call ggplotly in this case. That function is only needed when you want to build your plot using ggplot2 and then add plotly's interactivity to the ggplot object.
I have data that is organized like below M1 - M4, and I use the code from here to generate M_NEW:
M1 M2 M3 M4 M_NEW
1 1,2 0 1 1
3,4 3,4 1,2,3,4 4 3,4
NA NA 1 2 NA
It looks for a specified number of occurrences of each number across the four columns and reports those numbers in M_NEW. Now, I would like to append the numbers 0 and 21 to each of the observations, unless that observation is NA. However, so far, I am unable to paste 0 and 21 onto the observations without also pasting them onto the NA values. The desired output is included in df below as M_NEW1. How can this be accomplished? It appears that I am missing something with paste here.
# sample data
df <- structure(list(M1 = structure(c(3L, 4L, 2L, 2L, 1L, 5L, NA, 6L
), .Label = c("0", "1", "1,2", "1,2,3,4", "1,2,3,4,5", "3,4,5,6,7"
), class = "factor"), M2 = structure(c(3L, NA, 2L, 2L, 1L, 4L,
NA, 5L), .Label = c("0", "1,2", "1,2,3,4,5", "4,5,6", "4,5,6,7,8,9,10,11,12,13,14"
), class = "factor"), M3 = structure(c(3L, NA, 1L, 1L, 1L, 2L,
NA, 4L), .Label = c("0", "1,2,3,4", "1,2,3,4,5", "1,2,3,4,5,6,7,8"
), class = "factor"), M4 = structure(c(3L, NA, 1L, 2L, 1L, 5L,
NA, 4L), .Label = c("0", "1", "1,2,3,4,5,6", "1,2,3,4,5,6,7,8,9,10,11,12",
"4,5"), class = "factor"), M_NEW1 = structure(c(3L, NA, 1L, 2L,
1L, 5L, NA, 4L), .Label = c("0,21", "1,0,21", "1,2,3,4,5,0,21",
"3,4,5,6,7,8,0,21", "4,5,0,21"), class = "factor")), .Names = c("M1",
"M2", "M3", "M4", "M_NEW1"), class = "data.frame", row.names = c(NA,
-8L))
# function slightly modified from https://stackoverflow.com/a/23203159/1670053
# Returns the comma-separated tokens that occur at least `n` times across the
# cells of `x`, with ",0,21" appended ("0,21" alone when the result is "0").
#   x: vector of comma-separated strings (one row of the data frame).
#   n: minimum number of occurrences for a token to be kept (default 3).
f <- function(x, n=3) {
# Join all cells into one string, split it on commas and count each token.
tab <- table(strsplit(paste(x, collapse=","), ","))
# Keep the names of the tokens seen at least n times, joined by commas.
res <- paste(names(tab[which(tab >= n)]), collapse=",")
# NOTE(review): paste() turns missing values into the string "NA", so `res` is
# never NA and the is.na() branch below cannot fire; an all-NA row therefore
# comes out as "NA,0,21" instead of NA.
return(ifelse(is.na(res), NA, ifelse(res == 0, "0,21", paste(res,",0,21",sep=""))))
#return(ifelse(is.na(res), ifelse(res == 0, "0,21", NA), paste(res,",0,21",sep=""))) #https://stackoverflow.com/a/17554670/1670053
#return(ifelse(is.na(res), NA, ifelse(res == 0, "0,21", paste(na.omit(res),",0,21",sep=""))))
#return(ifelse(is.na(res), as.character(NA), ifelse(res == 0, "0,21", paste(res,",0,21",sep=""))))
}
# Apply f to each row of the four M columns and store the result as M_NEW2.
# (The original line had an unbalanced closing parenthesis and did not parse.)
df$M_NEW2 <- apply(df[, 1:4], 1, f)
You can add another if else statement - rather inelegant but gets you there.
# Collects the comma-separated tokens that occur at least `n` times across the
# cells of `x` and appends "0,21" to them.
#   x: vector of comma-separated strings (one row of the data frame).
#   n: minimum number of occurrences for a token to be kept (default 3).
# Returns a single string: "0,21" when no token (or only "0") qualifies, the
# string "NA" when the qualifying token is "NA" (missing cells are pasted as
# "NA"), otherwise the qualifying tokens followed by ",0,21".
f2 <- function(x, n=3) {
  # Flatten every cell into one vector of tokens and count each token.
  tokens <- strsplit(paste(x, collapse = ","), ",")[[1]]
  counts <- table(tokens)
  joined <- paste(names(counts)[which(counts >= n)], collapse = ",")

  # Nothing frequent enough, or only "0": the result is just the suffix.
  if (joined %in% c("0", "")) {
    joined <- "0,21"
  }

  # "NA" and the bare suffix are returned untouched; everything else gets the
  # suffix appended.
  if (joined %in% c("NA", "0,21")) {
    joined
  } else {
    paste(joined, "0,21", sep = ",")
  }
}
apply(df[1:4], 1, f2)
# "1,2,3,4,5,0,21" "NA" "0,21" "1,0,21" "0,21" "4,5,0,21" "NA"
# "3,4,5,6,7,8,0,21"