This is a followup question to:
R- merge two data frames but some values have semi colon in them
which has been addressed by contributor: agstudy.
The actual data is a bit more complex than the data discussed in the link, and I have been stuck for a while.
This is what my dataframe (df2) looks like:
myIDColumn someName somevalue
AB gsdfg 123
CD tfgsdfg 234
EF sfdgsf 365
GH gdfgb 53453
IJ sr 64564
KL sfsdv 4234234
MN ewrwe 5
OP dsfsss 3453
QR gggg 667
ST dss 7567
UV hhhhjf 55
WX dfadasad 8657
YZ ghfgh 1234
ABC gdgfg 234455
VCB hgjkk 5555667
This is what my df1 looks like:
ID someText someThing
AB ada 12
CD;EF;QR dfsdf 13
IJ fgfgd 14
KL fgdg 15
MN gh 16
OP;WX jhjhj 17
WW ghjgjhgjghj 18
YZ kkl 19
This is what i was hoping to get as an output:
I can merge the two well by using:
mm <- merge(df2,df1,by.y='ID',by.x='myIDColumn',all.y=TRUE)
but after that no idea how to proceed further.
Any help is really appreciated. Thanks.
df1:
structure(list(ID = structure(1:8, .Label = c("AB", "CD;EF;QR",
"IJ", "KL", "MN", "OP;WX", "WW", "YZ"), class = "factor"), someText = structure(c(1L,
2L, 4L, 3L, 5L, 7L, 6L, 8L), .Label = c("ada", "dfsdf", "fgdg",
"fgfgd", "gh", "ghjgjhgjghj", "jhjhj", "kkl"), class = "factor"),
someThing = 12:19), .Names = c("ID", "someText", "someThing"
), class = "data.frame", row.names = c(NA, -8L))
df2:
structure(list(myIDColumn = structure(c(1L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 11L, 12L, 14L, 15L, 2L, 13L), .Label = c("AB", "ABC",
"CD", "EF", "GH", "IJ", "KL", "MN", "OP", "QR", "ST", "UV", "VCB",
"WX", "YZ"), class = "factor"), someName = structure(c(9L, 15L,
12L, 5L, 14L, 13L, 4L, 2L, 7L, 3L, 11L, 1L, 8L, 6L, 10L), .Label = c("dfadasad",
"dsfsss", "dss", "ewrwe", "gdfgb", "gdgfg", "gggg", "ghfgh",
"gsdfg", "hgjkk", "hhhhjf", "sfdgsf", "sfsdv", "sr", "tfgsdfg"
), class = "factor"), somevalue = c(123L, 234L, 365L, 53453L,
64564L, 4234234L, 5L, 3453L, 667L, 7567L, 55L, 8657L, 1234L,
234455L, 5555667L)), .Names = c("myIDColumn", "someName", "somevalue"
), class = "data.frame", row.names = c(NA, -15L))
There are probably better ways to do it but you could create a temporary dataframe:
df1 <- structure(list(ID = c("AB", "CD;EF;QR", "IJ", "KL", "MN", "OP;WX",
"WW", "YZ"), someText = c("ada", "dfsdf", "fgfgd", "fgdg", "gh",
"jhjhj", "ghjgjhgjghj", "kkl"), someThing = 12:19), .Names = c("ID",
"someText", "someThing"), class = "data.frame", row.names = c(NA,
-8L))
df2 <- structure(list(myIDColumn = c("AB", "CD", "EF", "GH", "IJ", "KL",
"MN", "OP", "QR", "ST", "UV", "WX", "YZ", "ABC", "VCB"), someName = c("gsdfg",
"tfgsdfg", "sfdgsf", "gdfgb", "sr", "sfsdv", "ewrwe", "dsfsss",
"gggg", "dss", "hhhhjf", "dfadasad", "ghfgh", "gdgfg", "hgjkk"
), somevalue = c(123L, 234L, 365L, 53453L, 64564L, 4234234L,
5L, 3453L, 667L, 7567L, 55L, 8657L, 1234L, 234455L, 5555667L)), .Names = c("myIDColumn",
"someName", "somevalue"), class = "data.frame", row.names = c(NA,
-15L))
# Expand one group of df1 (all rows sharing the same ID, as passed by ddply)
# into one row per ';'-separated token of ID.
#
# x: data frame with columns ID, someText and someThing.
# Returns a data frame with the original three columns plus ID1, which holds
# a single token of ID per row.
f <- function(x) {
  # as.character(): strsplit() errors on factors, and ID is a factor when the
  # data frame was built with stringsAsFactors = TRUE.
  # fixed = TRUE: ';' is a literal separator, not a regular expression.
  tokens <- unlist(strsplit(as.character(x$ID), ";", fixed = TRUE))
  data.frame(
    ID = x$ID,
    someText = x$someText,
    someThing = x$someThing,
    ID1 = tokens
  )
}
library(plyr)
df3 <- ddply(df1, .(ID), f)
> df3
ID someText someThing ID1
1 AB ada 12 AB
2 CD;EF;QR dfsdf 13 CD
3 CD;EF;QR dfsdf 13 EF
4 CD;EF;QR dfsdf 13 QR
5 IJ fgfgd 14 IJ
6 KL fgdg 15 KL
7 MN gh 16 MN
8 OP;WX jhjhj 17 OP
9 OP;WX jhjhj 17 WX
10 WW ghjgjhgjghj 18 WW
11 YZ kkl 19 YZ
You could merge this with your dataframe df2 and summarize the data:
# Join df2 to the expanded table on the single-token ID (ID1).
# all.y=TRUE keeps every row of df3, so IDs without a match in df2
# (e.g. "WW") survive with NA in the df2 columns.
mm <- merge(df2,df3,by.y='ID1',by.x='myIDColumn',all.y=TRUE)
# Collapse back to one row per original df1 row: within each
# (ID, someText, someThing) group, the matched values and names are
# pasted together, separated by commas.
ddply(mm, .(ID,someText, someThing), summarize,
somevalue = paste(somevalue, collapse=','),
someName = paste(someName, collapse = ","))
ID someText someThing somevalue someName
1 AB ada 12 123 gsdfg
2 CD;EF;QR dfsdf 13 234,365,667 tfgsdfg,sfdgsf,gggg
3 IJ fgfgd 14 64564 sr
4 KL fgdg 15 4234234 sfsdv
5 MN gh 16 5 ewrwe
6 OP;WX jhjhj 17 3453,8657 dsfsss,dfadasad
7 WW ghjgjhgjghj 18 NA NA
8 YZ kkl 19 1234 ghfgh
Related
I want to add a column days to a dataset with some conditions. For each soil there should be nine rows in the days column. The first two rows (0 and 4) should be the value from the SS Period. The value for the days 10-66 should be the N in Period and the ES in Period should be the last days.
This is a very bad explanation I know, but I think perhaps it makes sense by looking at the expected_df dataset.
All help is very much appreciated!
# Input data: one ITS_1 value per soil and Period ("ES", "N", "SS").
df <- structure(list(soil = c(12L, 5L, 3L, 12L, 5L, 3L, 12L, 3L, 5L
), ITS_1 = c(290900, 16090, 12460, 0, 19700, 25000, 114.2, 39100,
25090), Period = c("ES", "ES", "ES", "N", "N", "N", "SS", "SS",
"SS")), row.names = c(NA, -9L), class = "data.frame")
This is what the data should look like:
expected_df <- structure(list(soil = c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 12L, 12L, 12L, 12L, 12L, 12L,
5L, 5L, 5L), ITS_1 = c(39100, 39100, 25000, 25000, 25000, 25000,
12460, 12460, 12460, 25090, 25090, 19700, 19700, 19700, 19700,
16090, 16090, 16090, 114.2, 114.2, 0, 0, 0, 0, 16090, 16090,
16090), Period = c("SS", "SS", "N", "N", "N", "N", "ES", "ES",
"ES", "SS", "SS", "N", "N", "N", "N", "ES", "ES", "ES", "SS",
"SS", "N", "N", "N", "N", "ES", "ES", "ES"), days = c(0L, 4L,
10L, 17L, 24L, 66L, 81L, 94L, 116L, 0L, 4L, 10L, 17L, 24L, 66L,
81L, 94L, 116L, 0L, 4L, 10L, 17L, 24L, 66L, 81L, 94L, 116L)), class = "data.frame", row.names = c(NA,
-27L))
One solution is to create a dataframe and left_join().
library(dplyr)
# Lookup table assigning day values to each Period:
# "SS" -> 0, 4; "N" -> 10, 17, 24, 66; "ES" -> 81, 94, 116.
df_join <- data.frame(days = c(0, 4, 10, 17, 24, 66, 81, 94, 116),
Period = rep(c("SS", "N", "ES"), times = c(2, 4, 3)))
# Joining on Period repeats each row of df once per day value of its Period
# (9 input rows -> 27 output rows). Rows come out in df's order, not sorted
# by soil.
df %>%
left_join(df_join, by = "Period")
# soil ITS_1 Period days
# 1 12 290900.0 ES 81
# 2 12 290900.0 ES 94
# 3 12 290900.0 ES 116
# 4 5 16090.0 ES 81
# 5 5 16090.0 ES 94
# 6 5 16090.0 ES 116
# 7 3 12460.0 ES 81
# 8 3 12460.0 ES 94
# 9 3 12460.0 ES 116
# 10 12 0.0 N 10
# 11 12 0.0 N 17
# 12 12 0.0 N 24
# 13 12 0.0 N 66
# 14 5 19700.0 N 10
# 15 5 19700.0 N 17
# 16 5 19700.0 N 24
# 17 5 19700.0 N 66
# 18 3 25000.0 N 10
# 19 3 25000.0 N 17
# 20 3 25000.0 N 24
# 21 3 25000.0 N 66
# 22 12 114.2 SS 0
# 23 12 114.2 SS 4
# 24 3 39100.0 SS 0
# 25 3 39100.0 SS 4
# 26 5 25090.0 SS 0
# 27 5 25090.0 SS 4
I have two dataframes and I want to replace all values (in all the columns) of df1 with the equivalent value from df2 (df2$value).
df1
structure(list(Cell_ID = c(7L, 2L, 3L, 10L), n_1 = c(0L, 0L,
0L, 0L), n_2 = c(9L, 1L, 4L, 1L), n_3 = c(10L, 4L, 5L, 2L), n_4 = c(NA,
5L, NA, 4L), n_5 = c(NA, 7L, NA, 6L), n_6 = c(NA, 9L, NA, 8L),
n_7 = c(NA, 10L, NA, 3L)), class = "data.frame", row.names = c(NA,
-4L))
df2
structure(list(Cell_ID = 0:10, value = c(5L, 100L, 200L, 300L,
400L, 500L, 600L, 700L, 800L, 900L, 1000L)), class = "data.frame", row.names = c(NA,
-11L))
The desired output would look like this:
So far I have tried this, as suggested in another similar post, but it's not working well (it randomly misses some values):
key= df2$Cell_ID
value = df2$value
lapply(1:8,FUN = function(i){df1[df1 == key[i]] <<- value[i]})
Note that the numbers have just been multiplied by 10 for ease in the example. The real data has numbers that are all over the place, so just multiplying the dataframe by 10 won't work.
One option is to match the elements with the 'Cell_ID' of the second dataset and use that as an index to return the corresponding 'value' from 'df2'.
library(dplyr)
# For every column of df1, find the position of each element in df2$Cell_ID
# and replace it with the df2$value at that position. Elements with no match
# (including NA) become NA.
df1 %>%
mutate(across(everything(), ~ df2$value[match(., df2$Cell_ID)]))
-output
# Cell_ID n_1 n_2 n_3 n_4 n_5 n_6 n_7
#1 700 5 900 1000 NA NA NA NA
#2 200 5 100 400 500 700 900 1000
#3 300 5 400 500 NA NA NA NA
#4 1000 5 100 200 400 600 800 300
Or another option is to use a named vector to do the match
library(tibble)
# Same replacement via a named vector built from df2: each element of every
# df1 column is converted to character and used as a name to look up its value.
# NOTE(review): assumes deframe(df2) uses Cell_ID as names and value as
# values - confirm against the tibble documentation.
df1 %>%
mutate(across(everything(), ~ deframe(df2)[as.character(.)]))
The base R equivalent is
# Replace every element of every column of df1 by the df2$value whose
# df2$Cell_ID equals that element; elements without a match become NA.
# Assigning into df1[] keeps df1 a data frame with its column names.
df1[] <- lapply(df1, function(column) {
  position <- match(column, df2$Cell_ID)
  df2$value[position]
})
I am trying to automate the R code below, in which I calculate p-values. The data is in CSV format.
I have clicks and open number for each section and their version.
If someone can help with applying any loop or something.
Data I have in .csv format:
` Total Clicks
Section Version A Version B Version C Version D
Section1 1,999 2,116 2,307 2,568
Section2 3,450 1,781 3,416 1,399
Section3 1,773 915 1,744 644
Section4 0 2,255 0 1,432
Section5 588 573 721 235
Main email 7,222 7,067 7,467 6,043
Total email 7,810 7,640 8,188 6,278
`
`Version # Opens
A 9,073
B 9,150
C 9,215
D 9,153
`
Currently I am assigning the data manually in the below format:
` S1_Click_A=1,999 ####(section 1, email A)
S1_Click_B=2,116 ## (section 1, email B)
S1_Click_C=2,307
S1_Click_D=2,568
S2_Click_A=3,450
S2_Click_B=1,781
.
.
.
S5_Click_C=721
S5_Click_D=235
MainBody_Click_A=7,222
MainBody_Click_B=7,067
.
.
TotalEmail_Click_C=8,188
TotalEmail_Click_D=6,278
`
`# to test % total click is the comparable across versions`
`# section 1 test
S1ab <- prop.test(x = c(S1_Click_A,S1_Click_B), n = c(Open_A,Open_B))
S1ac <- prop.test(x = c(S1_Click_A,S1_Click_C), n = c(Open_A,Open_C))
S1ad <- prop.test(x = c(S1_Click_A,S1_Click_D), n = c(Open_A,Open_D))
S1bc <- prop.test(x = c(S1_Click_B,S1_Click_C), n = c(Open_B,Open_C))
S1bd <- prop.test(x = c(S1_Click_B,S1_Click_D), n = c(Open_B,Open_D))
S1cd <- prop.test(x = c(S1_Click_C,S1_Click_D), n = c(Open_C,Open_D))
`
`#section 2 test
S2ab <- prop.test(x = c(S2_Click_A,S2_Click_B), n = c(Open_A,Open_B))
S2ac <- prop.test(x = c(S2_Click_A,S2_Click_C), n = c(Open_A,Open_C))
```
S2cd <- prop.test(x = c(S2_Click_C,S2_Click_D), n = c(Open_C,Open_D))
`
`#section 3 test
S3ab <- prop.test(x = c(S3_Click_A,S3_Click_B), n = c(Open_A,Open_B))
S3ac <- prop.test(x = c(S3_Click_A,S3_Click_C), n = c(Open_A,Open_C))
```
S3cd <- prop.test(x = c(S3_Click_C,S3_Click_D), n = c(Open_C,Open_D))`
`#section 4 test
S4ab <- prop.test(x = c(S4_Click_A,S4_Click_B), n = c(Open_A,Open_B))
S4ac <- prop.test(x = c(S4_Click_A,S4_Click_C), n = c(Open_A,Open_C))
`#section 5 test
S5ab <- prop.test(x = c(S5_Click_A,S5_Click_B), n = c(Open_A,Open_B))
S5ac <- prop.test(x = c(S5_Click_A,S5_Click_C), n = c(Open_A,Open_C))
`#Main body test
MainBodyab <- prop.test(x = c(MainBody_Click_A,MainBody_Click_B), n = c(Open_A,Open_B))
MainBodyac <- prop.test(x = c(MainBody_Click_A,MainBody_Click_C), n = c(Open_A,Open_C))
`
```
`
`#FINAL P VALUE`
`S1ab$p.value
S1ac$p.value
S1ad$p.value
`
I expect:
1. I want to read the data in the above format. I mean reading the
section 1 version A data i.e 1,999 and assigning the same to
S1_Click_A=1,999 similarly for others.
2. a matrix with their clicks and p values in a single row.
dput()
structure(list(Section = structure(c(2L, 3L, 4L, 5L, 6L, 1L, 7L), .Label =
c("Main email body", "Section 1", "Section 2", "Section 3", "Section 4",
"Section 5", "Total email"), class = "factor"), Version.A = c(2967L, 4840L,
2508L, 2093L, 1117L, 12408L, 13525L), Version.B = c(3353L, 4522L, 2250L,
1333L, 925L, 11458L, 12383L), Version.C = c(495L, 285L, 228L, 209L, 186L,
282L, 271L), Version.D = c(559L, 266L, 205L, 133L, 154L, 260L, 248L)), class
= "data.frame", row.names = c(NA, -7L ))
dput for final format
structure(list(Section = structure(c(2L, 3L, 4L, 5L, 6L, 1L,
7L), .Label = c("Main email body", "Section 1", "Section 2",
"Section 3", "Section 4", "Section 5", "Total email"), class = "factor"),
Version.A = structure(c(3L, 4L, 2L, 1L, 5L, 6L, 7L), .Label = c("0",
"1,773", "1,999", "3,450", "588", "7,222", "7,810"), class = "factor"),
Version.B = structure(c(2L, 1L, 7L, 3L, 4L, 5L, 6L), .Label = c("1,781",
"2,116", "2,255", "573", "7,067", "7,640", "915"), class = "factor"),
Version.C = structure(c(3L, 4L, 2L, 1L, 6L, 5L, 7L), .Label = c("0",
"1,744", "2,307", "3,416", "7,467", "721", "8,188"), class = "factor"),
Version.D = structure(c(3L, 1L, 7L, 2L, 4L, 5L, 6L), .Label = c("1,399",
"1,432", "2,568", "235", "6,043", "6,278", "644"), class = "factor"),
A.vs..B = c(NA, NA, NA, NA, NA, NA, NA), A.vs..C = c(NA,
NA, NA, NA, NA, NA, NA), A.vs..D = c(NA, NA, NA, NA, NA,
NA, NA), B.vs..C = c(NA, NA, NA, NA, NA, NA, NA), B.vs..D = c(NA,
NA, NA, NA, NA, NA, NA), C.vs..D = c(NA, NA, NA, NA, NA,
NA, NA)), class = "data.frame", row.names = c(NA, -7L))
Here is a solution for first section, same principle for the others.
First generate the combinations then apply the test on them.
# Click counts per section (rows) and email version (columns).
df <- structure(list(Section = structure(c(2L, 3L, 4L, 5L, 6L, 1L, 7L), .Label =
c("Main email body", "Section 1", "Section 2", "Section 3", "Section 4",
"Section 5", "Total email"), class = "factor"), Version.A = c(2967L, 4840L,
2508L, 2093L, 1117L, 12408L, 13525L), Version.B = c(3353L, 4522L, 2250L,
1333L, 925L, 11458L, 12383L), Version.C = c(495L, 285L, 228L, 209L, 186L,
282L, 271L), Version.D = c(559L, 266L, 205L, 133L, 154L, 260L, 248L)), class
= "data.frame", row.names = c(NA, -7L ))
# Number of opens per email version; used as `n` in each proportion test.
opens <- data.frame(A = 9073, B = 9150, C = 9215, D = 9153)

# All unordered pairs of versions. Both calls enumerate pairs in the same
# order, so column k of s1_comb (click columns of df) corresponds to column k
# of open_comb (columns of opens). This relies on df's version columns and
# opens' columns being listed in the same order.
s1_comb <- combn(colnames(df)[-1], 2)
open_comb <- combn(colnames(opens), 2)

# Result column names, e.g. "A vs B"; computed once and reused in the loop.
pair_labels <- paste(open_comb[1, ], "vs", open_comb[2, ])

# Original data plus one NA-filled p-value column per pair of versions.
res <- cbind(
  df,
  matrix(
    NA,
    nrow = nrow(df),
    ncol = ncol(open_comb),
    dimnames = list(seq_len(nrow(df)), pair_labels)
  )
)

# Only the leading section rows are tested; the last two rows
# ("Main email body" and "Total email") are skipped and stay NA.
# max(..., 0) keeps seq_len() valid when df has fewer than two rows.
n_tested <- max(nrow(df) - 2, 0)

for (k in seq_len(ncol(s1_comb))) {
  for (o in seq_len(n_tested)) {
    # Two-sample test of equal proportions: clicks / opens for the two
    # versions of pair k in section o.
    res[o, pair_labels[k]] <- prop.test(
      x = unlist(df[o, s1_comb[, k]]),
      n = unlist(opens[open_comb[, k]])
    )$p.value
  }
}
res
Final output, as requested :
> res
Section Version.A Version.B Version.C Version.D A vs B A vs C A vs D B vs C B vs D
1 Section 1 2967 3353 495 559 2.452892e-08 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
2 Section 2 4840 4522 285 266 1.259231e-07 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
3 Section 3 2508 2250 228 205 2.961113e-06 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
4 Section 4 2093 1333 209 133 1.081110e-48 0.000000e+00 0.000000e+00 4.567813e-198 6.505394e-234
5 Section 5 1117 925 186 154 2.756287e-06 7.420214e-161 3.232226e-174 1.051130e-116 4.618885e-129
6 Main email body 12408 11458 282 260 NA NA NA NA NA
7 Total email 13525 12383 271 248 NA NA NA NA NA
C vs D
1 3.472031e-02
2 4.850847e-01
3 3.178608e-01
4 5.557843e-05
5 1.022220e-01
6 NA
7 NA
You should have a look into RMarkdown, which can be used to create fully reproducible reports.
You basically write a script, the script loads your data and performs analysis and creates an output document (PDF, HTML).
RStudio is a great, free IDE that can be used to write RMarkdown documents.
I have a data that looks something like this:
structure(list(ID = structure(c(1L, 2L, 2L, 3L, 4L, 5L, 6L, 6L,
6L), .Label = c("a", "b", "c", "d", "e", "f"), class = "factor"),
Value = c(10L, 13L, 12L, 43L, 23L, 66L, 78L, 42L, 19L)), .Names = c("ID",
"Value"), class = "data.frame", row.names = c(NA, -9L))
I would like to divide this dataset into multiple datasets on the basis of the ID values, i.e. one dataset that contains only ID = a, another that contains only ID = b, and so on.
How do I do this subsetting automatically in R? I understand that if the number of distinct values in ID is small, we could just do it manually, but when there are a lot of values under ID, there has to be a smarter way of doing this.
You can use the split function.
df <- structure(list(ID = structure(c(1L, 2L, 2L, 3L, 4L, 5L, 6L, 6L,
6L), .Label = c("a", "b", "c", "d", "e", "f"), class = "factor"),
Value = c(10L, 13L, 12L, 43L, 23L, 66L, 78L, 42L, 19L)), .Names = c("ID",
"Value"), class = "data.frame", row.names = c(NA, -9L))
> df
ID Value
1 a 10
2 b 13
3 b 12
4 c 43
5 d 23
6 e 66
7 f 78
8 f 42
9 f 19
listed_df <- split(df, df$ID)
> listed_df
$a
ID Value
1 a 10
$b
ID Value
2 b 13
3 b 12
$c
ID Value
4 c 43
$d
ID Value
5 d 23
$e
ID Value
6 e 66
$f
ID Value
7 f 78
8 f 42
9 f 19
To access one of these, just index the list with $.
sum(listed_df$f$Value)
You can also lapply a function across each of the dataframes within the list. If you wanted to sum up each Value or something you could do..
# Sum Value within each per-ID data frame of the list created by split().
lapply(listed_df, function(x) sum(x$Value))
You can also do this just by grouping the original dataframe by ID and then perform summarise operations on it from there.
This should be pretty easy.
exampleb <- subset(df, ID == 'b')
exampleb
ID Value
2 b 13
3 b 12
Also, take a look at these links.
https://www.r-bloggers.com/5-ways-to-subset-a-data-frame-in-r/
https://www.statmethods.net/management/subset.html
I want to use the aggregation function of R to aggregate a Price on several fields. However, I also have NAs in my data, which I would like to keep.
Tried:
> dput(df)
structure(list(ID = c(1L, 2L, 3L, 4L, 4L, 1L, 2L, 3L, 4L, 1L,
2L, 3L, 4L, 3L, 2L, 1L), REFERENCE = c("TEST1", "TEST2", "TEST3",
"TEST4", "TEST1", "TEST2", "TEST3", "TEST4", "TEST1", "TEST2",
"TEST3", "TEST4", "TEST1", "TEST2", "", "TEST2"), ISS = c(1234L,
1234L, 1111L, 1111L, 1234L, 1111L, 1234L, 1111L, 1234L, NA, 1234L,
1111L, 1234L, 1111L, 1234L, NA), Price = c(10L, NA, 20L, NA,
10L, 12L, NA, 99L, 100L, NA, 100L, 12L, NA, 11L, 0L, 12L)), .Names = c("ID",
"REFERENCE", "ISS", "Price"), row.names = c(NA, -16L), class = c("data.table",
"data.frame"), .internal.selfref = <pointer: 0x0000000000100788>)
>
> df <- aggregate(df$Price, by=list(ID=df$ID, REFERENCE=df$REFERENCE, ISS=df$ISS), FUN=sum)
Setting na.action = na.pass gives me:
Error in aggregate.data.frame(as.data.frame(x), ...) :
no rows to aggregate
As a result I would like to have:
Hence, I would like to keep my NA Data in my df.
Any recommendation how to implement that?
I appreciate your replies!
Instead of using aggregate on a "data.table", we can use the data.table methods. We get the sum of Price (sum(Price, na.rm=TRUE)) after grouping by "ID/REFERENCE/ISS" (by = list(ID, REFERENCE, ISS)), and then order the output by "ID" and "REFERENCE" (if needed).
library(data.table)
# Sum Price within each ID/REFERENCE/ISS group. na.rm=TRUE drops NA prices,
# so a group whose prices are all NA sums to 0 rather than NA. Rows with
# ISS = NA are kept and form their own group.
df[, sum(Price, na.rm=TRUE), by = list(ID, REFERENCE, ISS)][
# Sort the grouped result by ID, then REFERENCE.
order(ID, REFERENCE)]
# ID REFERENCE ISS V1
#1: 1 TEST1 1234 10
#2: 1 TEST2 1111 12
#3: 1 TEST2 NA 12
#4: 2 1234 0
#5: 2 TEST2 1234 0
#6: 2 TEST3 1234 100
#7: 3 TEST2 1111 11
#8: 3 TEST3 1111 20
#9: 3 TEST4 1111 111
#10: 4 TEST1 1234 110
#11: 4 TEST4 1111 0