I have two data frames and I want to replace all values (in all columns) of df1 with the equivalent value from df2 (df2$value).
df1
structure(list(Cell_ID = c(7L, 2L, 3L, 10L), n_1 = c(0L, 0L,
0L, 0L), n_2 = c(9L, 1L, 4L, 1L), n_3 = c(10L, 4L, 5L, 2L), n_4 = c(NA,
5L, NA, 4L), n_5 = c(NA, 7L, NA, 6L), n_6 = c(NA, 9L, NA, 8L),
n_7 = c(NA, 10L, NA, 3L)), class = "data.frame", row.names = c(NA,
-4L))
df2
structure(list(Cell_ID = 0:10, value = c(5L, 100L, 200L, 300L,
400L, 500L, 600L, 700L, 800L, 900L, 1000L)), class = "data.frame", row.names = c(NA,
-11L))
The desired output would look like this:
So far I have tried this, as suggested in another similar post, but it is not working correctly (it seemingly randomly misses some values):
# Lookup vectors: key[i] is the Cell_ID that should be replaced by value[i].
key= df2$Cell_ID
value = df2$value
# NOTE(review): 1:8 visits only the first 8 of the 11 keys in df2, and each
# replacement is applied to the already-modified df1, so a value written by an
# earlier step (e.g. 0 -> 5) is replaced again when a later key matches it
# (5 -> 500).
lapply(1:8,FUN = function(i){df1[df1 == key[i]] <<- value[i]})
Note that in this example the numbers have simply been multiplied by 10 for ease of illustration; in the real data the numbers are all over the place, so just multiplying the data frame by 10 won't work.
One option is to match the elements of each column against the 'Cell_ID' column of the second dataset and use the result as an index to return the corresponding 'value' from 'df2':
library(dplyr)
# Replace every cell of df1 with the df2$value whose Cell_ID equals that cell;
# cells without a matching Cell_ID (including NA) come back as NA.
df1 %>%
  mutate(across(
    everything(),
    function(cell_ids) df2$value[match(cell_ids, df2$Cell_ID)]
  ))
-output
# Cell_ID n_1 n_2 n_3 n_4 n_5 n_6 n_7
#1 700 5 900 1000 NA NA NA NA
#2 200 5 100 400 500 700 900 1000
#3 300 5 400 500 NA NA NA NA
#4 1000 5 100 200 400 600 800 300
Or another option is to use a named vector to do the match
library(tibble)
# deframe(df2) turns the two-column df2 into a vector of values named by
# Cell_ID; each column of df1 is then looked up in it by name.
df1 %>%
  mutate(across(
    everything(),
    function(cell_ids) deframe(df2)[as.character(cell_ids)]
  ))
The base R equivalent is
# Assigning into df1[] overwrites every column while keeping df1 a data frame.
df1[] <- lapply(df1, function(cell_ids) {
  # Position of each cell's value within df2$Cell_ID (NA when absent).
  position <- match(cell_ids, df2$Cell_ID)
  df2$value[position]
})
I am trying to automate the R code below, in which I calculate p-values. The data is in CSV format.
I have the number of clicks and opens for each section and version.
Could someone help me automate this with a loop or something similar?
Data I have in .csv format:
` Total Clicks
Section Version A Version B Version C Version D
Section1 1,999 2,116 2,307 2,568
Section2 3,450 1,781 3,416 1,399
Section3 1,773 915 1,744 644
Section4 0 2,255 0 1,432
Section5 588 573 721 235
Main email 7,222 7,067 7,467 6,043
Total email 7,810 7,640 8,188 6,278
`
`Version # Opens
A 9,073
B 9,150
C 9,215
D 9,153
`
Currently I am assigning the data manually in the below format:
` S1_Click_A=1,999 ####(section 1, email A)
S1_Click_B=2,116 ## (section 1, email B)
S1_Click_C=2,307
S1_Click_D=2,568
S2_Click_A=3,450
S2_Click_B=1,781
.
.
.
S5_Click_C=721
S5_Click_D=235
MainBody_Click_A=7,222
MainBody_Click_B=7,067
.
.
TotalEmail_Click_C=8,188
TotalEmail_Click_D=6,278
`
`# to test % total click is the comparable across versions`
`# section 1 test
S1ab <- prop.test(x = c(S1_Click_A,S1_Click_B), n = c(Open_A,Open_B))
S1ac <- prop.test(x = c(S1_Click_A,S1_Click_C), n = c(Open_A,Open_C))
S1ad <- prop.test(x = c(S1_Click_A,S1_Click_D), n = c(Open_A,Open_D))
S1bc <- prop.test(x = c(S1_Click_B,S1_Click_C), n = c(Open_B,Open_C))
S1bd <- prop.test(x = c(S1_Click_B,S1_Click_D), n = c(Open_B,Open_D))
S1cd <- prop.test(x = c(S1_Click_C,S1_Click_D), n = c(Open_C,Open_D))
`
`#section 2 test
S2ab <- prop.test(x = c(S2_Click_A,S2_Click_B), n = c(Open_A,Open_B))
S2ac <- prop.test(x = c(S2_Click_A,S2_Click_C), n = c(Open_A,Open_C))
```
S2cd <- prop.test(x = c(S2_Click_C,S2_Click_D), n = c(Open_C,Open_D))
`
`#section 3 test
S3ab <- prop.test(x = c(S3_Click_A,S3_Click_B), n = c(Open_A,Open_B))
S3ac <- prop.test(x = c(S3_Click_A,S3_Click_C), n = c(Open_A,Open_C))
```
S3cd <- prop.test(x = c(S3_Click_C,S3_Click_D), n = c(Open_C,Open_D))`
`#section 4 test
S4ab <- prop.test(x = c(S4_Click_A,S4_Click_B), n = c(Open_A,Open_B))
S4ac <- prop.test(x = c(S4_Click_A,S4_Click_C), n = c(Open_A,Open_C))
`#section 5 test
S5ab <- prop.test(x = c(S5_Click_A,S5_Click_B), n = c(Open_A,Open_B))
S5ac <- prop.test(x = c(S5_Click_A,S5_Click_C), n = c(Open_A,Open_C))
`#Main body test
MainBodyab <- prop.test(x = c(MainBody_Click_A,MainBody_Click_B), n = c(Open_A,Open_B))
MainBodyac <- prop.test(x = c(MainBody_Click_A,MainBody_Click_C), n = c(Open_A,Open_C))
`
```
`
`#FINAL P VALUE`
`S1ab$p.value
S1ac$p.value
S1ad$p.value
`
I expect:
1. I want to read the data in the above format, i.e. read the
Section 1, Version A value (1,999) and assign it to
S1_Click_A = 1,999, and similarly for the others.
2. A matrix with the clicks and the p-values for each section in a single row.
dput()
structure(list(Section = structure(c(2L, 3L, 4L, 5L, 6L, 1L, 7L), .Label =
c("Main email body", "Section 1", "Section 2", "Section 3", "Section 4",
"Section 5", "Total email"), class = "factor"), Version.A = c(2967L, 4840L,
2508L, 2093L, 1117L, 12408L, 13525L), Version.B = c(3353L, 4522L, 2250L,
1333L, 925L, 11458L, 12383L), Version.C = c(495L, 285L, 228L, 209L, 186L,
282L, 271L), Version.D = c(559L, 266L, 205L, 133L, 154L, 260L, 248L)), class
= "data.frame", row.names = c(NA, -7L ))
dput for final format
structure(list(Section = structure(c(2L, 3L, 4L, 5L, 6L, 1L,
7L), .Label = c("Main email body", "Section 1", "Section 2",
"Section 3", "Section 4", "Section 5", "Total email"), class = "factor"),
Version.A = structure(c(3L, 4L, 2L, 1L, 5L, 6L, 7L), .Label = c("0",
"1,773", "1,999", "3,450", "588", "7,222", "7,810"), class = "factor"),
Version.B = structure(c(2L, 1L, 7L, 3L, 4L, 5L, 6L), .Label = c("1,781",
"2,116", "2,255", "573", "7,067", "7,640", "915"), class = "factor"),
Version.C = structure(c(3L, 4L, 2L, 1L, 6L, 5L, 7L), .Label = c("0",
"1,744", "2,307", "3,416", "7,467", "721", "8,188"), class = "factor"),
Version.D = structure(c(3L, 1L, 7L, 2L, 4L, 5L, 6L), .Label = c("1,399",
"1,432", "2,568", "235", "6,043", "6,278", "644"), class = "factor"),
A.vs..B = c(NA, NA, NA, NA, NA, NA, NA), A.vs..C = c(NA,
NA, NA, NA, NA, NA, NA), A.vs..D = c(NA, NA, NA, NA, NA,
NA, NA), B.vs..C = c(NA, NA, NA, NA, NA, NA, NA), B.vs..D = c(NA,
NA, NA, NA, NA, NA, NA), C.vs..D = c(NA, NA, NA, NA, NA,
NA, NA)), class = "data.frame", row.names = c(NA, -7L))
Here is a solution for first section, same principle for the others.
First generate the combinations then apply the test on them.
# Click counts per section (rows) and email version (columns).
df <- structure(list(Section = structure(c(2L, 3L, 4L, 5L, 6L, 1L, 7L), .Label =
c("Main email body", "Section 1", "Section 2", "Section 3", "Section 4",
"Section 5", "Total email"), class = "factor"), Version.A = c(2967L, 4840L,
2508L, 2093L, 1117L, 12408L, 13525L), Version.B = c(3353L, 4522L, 2250L,
1333L, 925L, 11458L, 12383L), Version.C = c(495L, 285L, 228L, 209L, 186L,
282L, 271L), Version.D = c(559L, 266L, 205L, 133L, 154L, 260L, 248L)), class
= "data.frame", row.names = c(NA, -7L ))

# Number of opens per email version, in the same order as the Version.*
# columns of df.
opens <- data.frame(A = 9073, B = 9150, C = 9215, D = 9153)

# All unordered pairs of versions: column k of s1_comb holds the two click
# columns and column k of open_comb the two matching opens columns.
s1_comb <- combn(colnames(df)[-1], 2)
open_comb <- combn(colnames(opens), 2)

# Result column names ("A vs B", ...), computed once instead of per iteration.
pair_labels <- paste(open_comb[1, ], "vs", open_comb[2, ])

# One numeric p-value column per pair, initialised to NA.
res <- cbind(
  df,
  matrix(
    NA_real_,
    nrow = nrow(df),
    ncol = ncol(open_comb),
    dimnames = list(seq_len(nrow(df)), pair_labels)
  )
)

# The last two rows ("Main email body", "Total email") are skipped and stay NA.
# NOTE(review): for versions A and B their click totals exceed the opens, which
# prop.test() does not accept as x > n.
n_tested <- max(nrow(df) - 2, 0)

# seq_len() yields an empty sequence for zero-length input, where 1:n would
# count backwards.
for (k in seq_len(ncol(s1_comb))) {
  for (o in seq_len(n_tested)) {
    res[o, pair_labels[k]] <- prop.test(
      x = unlist(df[o, s1_comb[, k]]),
      n = unlist(opens[open_comb[, k]])
    )$p.value
  }
}
res
Final output, as requested :
> res
Section Version.A Version.B Version.C Version.D A vs B A vs C A vs D B vs C B vs D
1 Section 1 2967 3353 495 559 2.452892e-08 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
2 Section 2 4840 4522 285 266 1.259231e-07 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
3 Section 3 2508 2250 228 205 2.961113e-06 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
4 Section 4 2093 1333 209 133 1.081110e-48 0.000000e+00 0.000000e+00 4.567813e-198 6.505394e-234
5 Section 5 1117 925 186 154 2.756287e-06 7.420214e-161 3.232226e-174 1.051130e-116 4.618885e-129
6 Main email body 12408 11458 282 260 NA NA NA NA NA
7 Total email 13525 12383 271 248 NA NA NA NA NA
C vs D
1 3.472031e-02
2 4.850847e-01
3 3.178608e-01
4 5.557843e-05
5 1.022220e-01
6 NA
7 NA
You should have a look into RMarkdown, which can be used to create fully reproducible reports.
You basically write a script, the script loads your data and performs analysis and creates an output document (PDF, HTML).
RStudio is a great, free IDE that can be used to write RMarkdown documents.
I am currently experiencing a problem where I have a long dataframe (i.e., multiple rows per subject) and want to remove cases that don't have any measurements (in any of the rows) on one variable. I've tried transforming the data to wide format, but this was a problem as I can't go back anymore (going from long to wide "destroys" my timeline variable). Does anyone have an idea about how to fix this problem?
Below is some code to simulate the head of my data. Specifically, I want to remove cases that don't have a measurement of extraversion on any of the measurement occasions ("time").
structure(list(id = c(1L, 1L, 2L, 3L, 3L, 3L), time = c(79L, 95L, 79L, 28L, 40L, 52L),
extraversion = c(3.2, NA, NA, 2, 2.4, NA), satisfaction = c(3L, 3L, 4L, 5L, 5L, 9L),
`self-esteem` = c(4.9, NA, NA, 6.9, 6.7, NA)), .Names = c("id", "time", "extraversion",
"satisfaction", "self-esteem"), row.names = c(NA, 6L), class = "data.frame")
Note: I realise the missing of my extraversion variable coincides with my self-esteem variable.
To drop an entire id if they don't have any measurements for extraversion you could do:
library(data.table)
# Flag every id whose extraversion is missing on all of its rows, then keep
# only the rows of ids that are not flagged.
setDT(df)
df[, drop := all(is.na(extraversion)), by = id]
df[!df$drop]
# id time extraversion satisfaction self-esteem drop
#1: 1 79 3.2 3 4.9 FALSE
#2: 1 95 NA 3 NA FALSE
#3: 3 28 2.0 5 6.9 FALSE
#4: 3 40 2.4 5 6.7 FALSE
#5: 3 52 NA 9 NA FALSE
Or you could use .I which I believe should be faster:
# Collect the row numbers (.I) of every id that has at least one non-missing
# extraversion value, then subset df by those row numbers.
setDT(df)
keep_rows <- df[, .I[!all(is.na(extraversion))], by = id]$V1
df[keep_rows]
Lastly, a base R solution could use ave (thanks to @thelatemail for the suggestion to make it shorter/more expressive):
# For each row, TRUE when every extraversion value of that row's id is missing.
all_missing <- ave(is.na(df$extraversion), df$id, FUN = all)
# Keep only the rows whose id has at least one measurement.
df[!all_missing, ]
Assuming the data frame is named mydata, use a dplyr filter:
library(dplyr)
# Keep all rows of every id that has at least one non-missing extraversion
# value, then remove the grouping again.
mydata %>%
  group_by(id) %>%
  filter(any(!is.na(extraversion))) %>%
  ungroup()
# Example data: several measurement occasions ("time") per subject ("id").
d <- structure(
  list(
    id = c(1L, 1L, 2L, 3L, 3L, 3L),
    time = c(79L, 95L, 79L, 28L, 40L, 52L),
    extraversion = c(3.2, NA, NA, 2, 2.4, NA),
    satisfaction = c(3L, 3L, 4L, 5L, 5L, 9L),
    `self-esteem` = c(4.9, NA, NA, 6.9, 6.7, NA)
  ),
  row.names = c(NA, 6L),
  class = "data.frame"
)

# TRUE for the rows that have an extraversion measurement.
has_extraversion <- !is.na(d$extraversion)

# Rows with a measurement, followed by the rows without one.
d[has_extraversion, ]
d[!has_extraversion, ]
complete.cases is great if you wanted to remove any rows with missing data: complete.cases(d)
I have a question concerning the analysis of some bioinformatics data in R.
My test data frame consists of a variable "sequence" with different letter codes as observations and three different variables representing individuals/samples (P1, P2, P3) that say how often the particular observation was counted in an individual (so P3 contains the sequence "AB" 23 times for example).
I want to create a new column now (already indicated in my data frame as dummy column X with NA) that shows for each sequence row if the sequence is overall shared between individuals (P1, P2, P3) and more importantly how many of the three individuals share it. The numbers in the new column can therefore range only from 1 to 3. For example: for sequence "ABCDE" the new column would show value 1 because it occurs only in one individual P3, for sequence "ABC" the new column would show value 2 because it occurs in both individuals P2 and P3 and finally for "ABCD" it would show 3 since all individuals contain the sequence.
My test data looks like this after dput():
structure(list(Sequence = structure(1:9, .Label = c("AB", "ABC",
"ABCD", "ABCDE", "ABCDEF", "ABCDEFG", "ABCDEFGH", "ABCDEFGHI",
"ABCDEFGHIJ"), class = "factor"), P1 = c(5L, 0L, 20L, 0L, 3L,
1L, 0L, 0L, 0L), P2 = c(6L, 2L, 3L, 0L, 2L, 0L, 56L, 10L, 3L),
P3 = c(23L, 34L, 8L, 5L, 0L, 6L, 0L, 78L, 5L), X = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA)), .Names = c("Sequence",
"P1", "P2", "P3", "X"), class = "data.frame", row.names = c(NA,
-9L))
Thank you!
You can try to sum the "P." columns with a positive count:
# Count, per row, how many of the "P..." columns have a positive count.
p_columns <- grep("^P", names(mydf))
# drop = FALSE keeps a data frame even when only one column matches, so
# rowSums() always receives a two-dimensional object.
mydf$X <- rowSums(mydf[, p_columns, drop = FALSE] > 0)
head(mydf, 4)
# Sequence P1 P2 P3 X
#1 AB 5 6 23 3
#2 ABC 0 2 34 2
#3 ABCD 20 3 8 3
#4 ABCDE 0 0 5 1
We can use Reduce with lapply
# One logical vector per sample column (columns 2 to 4): TRUE where the
# sequence was counted at least once.
is_present <- lapply(df1[2:4], function(counts) counts > 0)
# Adding the logical vectors element-wise gives the number of samples that
# contain each sequence.
df1$X <- Reduce(`+`, is_present)
df1$X
#[1] 3 2 3 1 2 2 1 2 2
Reduce can be very efficient, as shown in the benchmarks here
I want to use the aggregation function of R to aggregate a Price on several fields. However, I also have NAs in my data, which I would like to keep.
Tried:
> dput(df)
structure(list(ID = c(1L, 2L, 3L, 4L, 4L, 1L, 2L, 3L, 4L, 1L,
2L, 3L, 4L, 3L, 2L, 1L), REFERENCE = c("TEST1", "TEST2", "TEST3",
"TEST4", "TEST1", "TEST2", "TEST3", "TEST4", "TEST1", "TEST2",
"TEST3", "TEST4", "TEST1", "TEST2", "", "TEST2"), ISS = c(1234L,
1234L, 1111L, 1111L, 1234L, 1111L, 1234L, 1111L, 1234L, NA, 1234L,
1111L, 1234L, 1111L, 1234L, NA), Price = c(10L, NA, 20L, NA,
10L, 12L, NA, 99L, 100L, NA, 100L, 12L, NA, 11L, 0L, 12L)), .Names = c("ID",
"REFERENCE", "ISS", "Price"), row.names = c(NA, -16L), class = c("data.table",
"data.frame"), .internal.selfref = <pointer: 0x0000000000100788>)
>
> df <- aggregate(df$Price, by=list(ID=df$ID, REFERENCE=df$REFERENCE, ISS=df$ISS), FUN=sum)
Setting na.action = na.pass gives me:
Error in aggregate.data.frame(as.data.frame(x), ...) :
no rows to aggregate
As a result I would like to have:
Hence, I would like to keep my NA Data in my df.
Any recommendation how to implement that?
I appreciate your replies!
Instead of using aggregate on a "data.table", we can use the data.table methods. We get the sum of Price (sum(Price, na.rm=TRUE)) after grouping by "ID/REFERENCE/ISS" (by=list(ID, REFERENCE, ISS)), then order the output by "ID" and "REFERENCE" (if needed).
library(data.table)
# Sum Price within each ID/REFERENCE/ISS group, ignoring missing prices;
# the unnamed result column is called V1.
totals <- df[, sum(Price, na.rm = TRUE), by = .(ID, REFERENCE, ISS)]
# Present the groups sorted by ID, then REFERENCE.
totals[order(ID, REFERENCE)]
# ID REFERENCE ISS V1
#1: 1 TEST1 1234 10
#2: 1 TEST2 1111 12
#3: 1 TEST2 NA 12
#4: 2 1234 0
#5: 2 TEST2 1234 0
#6: 2 TEST3 1234 100
#7: 3 TEST2 1111 11
#8: 3 TEST3 1111 20
#9: 3 TEST4 1111 111
#10: 4 TEST1 1234 110
#11: 4 TEST4 1111 0