R dividing all elements on the same row - r

Hello i have a dataFrame with 15167 rows like this:
> head(datExpr)
F02234_L1_S1_L001 F02235_L1_S2_L001
ENSG00000182199 1546 1710
ENSG00000157870 3491 2951
ENSG00000077092 35 22
ENSG00000047936 49 12
ENSG00000156282 0 0
ENSG00000183856 265 365
F02236_L1_S3_L001 F02237_L1_S4_L001
ENSG00000182199 3530 3582
ENSG00000157870 2372 2360
ENSG00000077092 111 34
ENSG00000047936 43 6
ENSG00000156282 0 0
ENSG00000183856 768 1489
and a list with 15167 elements like this:
> distances
[[1]]
end_position
1 12883
2 100722
3 119629
4 57174
5 78317
6 453372
7 21772
8 30102
9 116734
..........
I want to divide all in the dataframe by equivalent values of my list. For example 1546/12883,1710/12883, 3530/12883 ... and 3491/100722, 2951/100722...
How can i do that with R?

We could directly divide the list element after extracting as it gets recycled
datExpr/distances[[1]]$end_position
-output
F02234_L1_S1_L001 F02235_L1_S2_L001 F02236_L1_S3_L001 F02237_L1_S4_L001
ENSG00000182199 0.1200031049 0.1327330591 0.2740045021 0.2780408290
ENSG00000157870 0.0346597566 0.0292984651 0.0235499692 0.0234308294
ENSG00000077092 0.0002925712 0.0001839019 0.0009278687 0.0002842120
ENSG00000047936 0.0008570329 0.0002098856 0.0007520901 0.0001049428
ENSG00000156282 0.0000000000 0.0000000000 0.0000000000 0.0000000000
ENSG00000183856 0.0005845090 0.0008050784 0.0016939732 0.0032842787
data
datExpr <- structure(list(F02234_L1_S1_L001 = c(1546L, 3491L, 35L, 49L,
0L, 265L), F02235_L1_S2_L001 = c(1710L, 2951L, 22L, 12L, 0L,
365L), F02236_L1_S3_L001 = c(3530L, 2372L, 111L, 43L, 0L, 768L
), F02237_L1_S4_L001 = c(3582L, 2360L, 34L, 6L, 0L, 1489L)), class = "data.frame", row.names = c("ENSG00000182199",
"ENSG00000157870", "ENSG00000077092", "ENSG00000047936", "ENSG00000156282",
"ENSG00000183856"))
distances <- list(structure(list(end_position = c(12883L, 100722L, 119629L,
57174L, 78317L, 453372L)), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6")))

Related

apply/create formula to manipulate two dataframes

I have two dataframes in R
df1
chr start end strand bam1 bam2 bam3 bam4 bam5 bam6 bam7 bam8
1 chr1 3531569 3531966 - 2 2 1 4 8 36 21 1
2 chr1 3670538 3672624 - 251 50 170 165 294 259 665 86
3 chr1 4491645 4493854 - 220 46 179 167 275 332 414 77
4 chr1 4496542 4497750 - 115 41 100 67 114 69 42 63
5 chr1 4571267 4572265 - 64 32 77 44 76 130 179 27
6 chr1 4688213 4688719 - 39 10 20 20 14 23 25 17
7 chr1 4688800 4688919 - 20 30 10 20 14 55 17 20
8 chr1 4688800 4688919 - 2 4 6 8 10 12 14 16
9 chr1 4688800 4688919 - 1 2 3 4 5 6 7 8
and
df2
bam_file r1 r2
1 bam1 2 1
2 bam2 9 3
3 bam3 1 4
4 bam4 1 5
5 bam5 1 1
6 bam6 8 6
7 bam7 3 7
8 bam8 3 2
I want to apply following formula (let say X), So that column of df2 X row of df1
((df2[1,2]-df1[1,5])ˆ2 + (df2[2,2]-df1[1,6])ˆ2 + (df2[3,2]-df1[1,7])ˆ2 + (df2[4,2]-df1[1,8])ˆ2 + (df2[5,2]-df1[1,9])ˆ2 + (df2[6,2]-df1[1,10])ˆ2 + (df2[7,2]-df1[1,11])ˆ2 +
(df2[8,2]-df1[1,12])ˆ2)/(ncol(df1)-4)
So the desired output will be
output
r1 r2
1 152.375 144.75
2 89140.25 88467.875
3 57822.75 57413.125
4 6195.125 6148
5 8007.375 7858.75
6 395.75 372.625
7 508.75 543.125
8 60.75 47.125
9 15.5 6.875
I apologize if this appears to be a repetitive question, but I tried and was unable to resolve it (as I am beginner and learning). It would be great to find a solution. Thank you in advance and looking for a positive response.
We could create a sequence column ('rn'), reshape to 'long' format with pivot_longer on the first data, join with the second data ('df2') and do a group by calculation on the 'r1', 'r2' columns in reframe
library(dplyr) # version >= 1.1.0
library(tidyr)
df1 %>%
mutate(rn = row_number()) %>%
pivot_longer(cols= starts_with("bam"), names_to = "bam_file") %>%
left_join(df2) %>%
reframe(across(r1:r2, ~ sum((value - .x)^2)/n()), .by = "rn")
-output
# A tibble: 9 × 3
rn r1 r2
<int> <dbl> <dbl>
1 1 152. 145.
2 2 89140. 88468.
3 3 57823. 57413.
4 4 6195. 6148
5 5 8007. 7859.
6 6 396. 373.
7 7 509. 543.
8 8 60.8 47.1
9 9 15.5 6.88
data
df1 <- structure(list(chr = c("chr1", "chr1", "chr1", "chr1", "chr1",
"chr1", "chr1", "chr1", "chr1"), start = c(3531569L, 3670538L,
4491645L, 4496542L, 4571267L, 4688213L, 4688800L, 4688800L, 4688800L
), end = c(3531966L, 3672624L, 4493854L, 4497750L, 4572265L,
4688719L, 4688919L, 4688919L, 4688919L), strand = c("-", "-",
"-", "-", "-", "-", "-", "-", "-"), bam1 = c(2L, 251L, 220L,
115L, 64L, 39L, 20L, 2L, 1L), bam2 = c(2L, 50L, 46L, 41L, 32L,
10L, 30L, 4L, 2L), bam3 = c(1L, 170L, 179L, 100L, 77L, 20L, 10L,
6L, 3L), bam4 = c(4L, 165L, 167L, 67L, 44L, 20L, 20L, 8L, 4L),
bam5 = c(8L, 294L, 275L, 114L, 76L, 14L, 14L, 10L, 5L), bam6 = c(36L,
259L, 332L, 69L, 130L, 23L, 55L, 12L, 6L), bam7 = c(21L,
665L, 414L, 42L, 179L, 25L, 17L, 14L, 7L), bam8 = c(1L, 86L,
77L, 63L, 27L, 17L, 20L, 16L, 8L)), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9"))
df2 <- structure(list(bam_file = c("bam1", "bam2", "bam3", "bam4", "bam5",
"bam6", "bam7", "bam8"), r1 = c(2L, 9L, 1L, 1L, 1L, 8L, 3L, 3L
), r2 = c(1L, 3L, 4L, 5L, 1L, 6L, 7L, 2L)), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8"))

Filter a dataframe based on a shorter dataframe

I have the following dataframes I've simplified for practical purposes:
head(coords_int)
seqnames start end
1 chr2 181529780 181533313
2 chr2 98396674 98396940
3 chr5 64919375 64919395
4 chr9 2795948 2797647
5 chr7 138873527 138873574
6 chr4 154736072 154736138
7 chr6 10762723 10769212
8 chr10 93614727 93614773
9 chr17 76539181 76539300
10 chr6 99608741 99608872
11 chr10 47330654 47330828
12 chr10 47331176 47331237
13 chr10 93612154 93612575
14 chr10 84248910 84249043
15 chr17 28547999 28548101
16 chr17 28548592 28548705
17 chr11 46701067 46701141
18 chr16 49847678 49847918
19 chr16 49822670 49822738
head(pdoms_protein)
tx_id seqnames start end width strand exon_id exon_rank cds_ok
1 ENST00000339098 2 181573753 181573876 124 - ENSE00003634697 3 TRUE
2 ENST00000339098 2 181573753 181573876 124 - ENSE00003634697 3 TRUE
3 ENST00000339098 2 181566058 181566121 64 - ENSE00003523731 4 TRUE
4 ENST00000393504 2 98395844 98396397 554 + ENSE00000963920 8 TRUE
5 ENST00000393504 2 98395844 98396397 554 + ENSE00000963920 8 TRUE
6 ENST00000393504 2 98396674 98396940 267 + ENSE00000963920 8 TRUE
7 ENST00000381070 5 64774694 64774787 94 + ENSE00003522928 2 TRUE
8 ENST00000381070 5 64774694 64774787 94 + ENSE00003522928 2 TRUE
9 ENST00000381070 5 64774694 64774787 94 + ENSE00003522928 2 TRUE
10 ENST00000381070 5 64781921 64782033 113 + ENSE00003582136 3 TRUE
11 ENST00000381070 5 64781921 64782033 113 + ENSE00003582136 3 TRUE
12 ENST00000382082 9 2718229 2718276 48 + ENSE00001490869 1 TRUE
13 ENST00000382082 9 2718229 2718276 48 + ENSE00001490869 1 TRUE
14 ENST00000422774 7 138881388 138881584 197 - ENSE00001088065 11 TRUE
15 ENST00000422774 7 138879538 138879653 116 - ENSE00001088074 12 TRUE
16 ENST00000422774 7 138871157 138871362 206 - ENSE00001088067 13 TRUE
17 ENST00000336356 4 154744456 154744845 390 + ENSE00001344788 2 TRUE
18 ENST00000502525 4 154744456 154744530 75 + ENSE00002048458 4 FALSE
19 ENST00000507827 4 154744456 154744845 390 + ENSE00001344788 2 TRUE
20 ENST00000313243 6 10830548 10830639 92 - ENSE00003696993 2 TRUE
21 ENST00000313243 6 10830548 10830639 92 - ENSE00003696993 2 TRUE
22 ENST00000313243 6 10830548 10830639 92 - ENSE00003696993 2 TRUE
23 ENST00000313243 6 10830548 10830639 92 - ENSE00003696993 2 TRUE
protein_start protein_end protein_domain_id protein_domain_source interpro_accession
1 164 339 PS50146 pfscan IPR001206
2 164 339 PF00781 pfam IPR001206
3 164 339 PS50146 pfscan IPR001206
4 171 409 PF16526 pfam IPR032406
5 171 409 SM00100 smart IPR000595
6 502 590 PS50042 pfscan IPR000595
7 16 166 PR00153 prints IPR002130
8 16 166 PR00153 prints IPR002130
9 16 166 PR00153 prints IPR002130
10 16 166 PS50072 pfscan IPR002130
11 16 166 PS00170 scanprosite IPR020892
12 164 179 PR01494 prints IPR003971
13 164 179 PR01491 prints IPR003968
14 1039 1702 PF12877 pfam IPR024606
15 1039 1702 PF12877 pfam IPR024606
16 1039 1702 PF12877 pfam IPR024606
17 44 173 PF04970 pfam IPR007053
18 44 68 PF04970 pfam IPR007053
19 44 173 PF04970 pfam IPR007053
20 4 284 PS50011 pfscan IPR000719
21 4 284 PS00107 scanprosite IPR017441
22 4 284 PS00108 scanprosite IPR008271
23 4 284 SSF56112 superfamily IPR011009
prot_dom_start prot_dom_end gene_name
1 164 339 CERKL
2 170 334 CERKL
3 164 339 CERKL
4 598 668 CNGA3
5 482 606 CNGA3
6 482 596 CNGA3
7 125 140 CWC27
8 97 112 CWC27
9 112 124 CWC27
10 19 166 CWC27
11 49 66 CWC27
12 187 199 KCNV2
13 410 424 KCNV2
14 1039 1702 KIAA1549
15 1039 1702 KIAA1549
16 1039 1702 KIAA1549
17 44 173 LRAT
18 44 68 LRAT
19 44 173 LRAT
20 4 284 MAK
21 10 33 MAK
22 121 133 MAK
23 1 285 MAK
I would like to know if any of the coords_int$start are part of the pdoms_protein$start / pdoms_protein$end range and the same for the coords_int$end and then filter only the data that falls in this category.
I'd tried
library(tidyverse)
pdoms_protein %>%
mutate(dom.ok = 98396674>= start & 98396674<= end) %>%
filter(dom.ok == "TRUE")
And it works but only for one value at a time. Is there a more practical way to do it all at once?
We could do it with fuzzyjoin:
library(fuzzyjoin)
library(dplyr)
long_coords_int <- coords_int %>%
pivot_longer(-seqnames)
fuzzy_left_join(long_coords_int, pdoms_protein[3:4], by = c("value" = "start", "value" = "end"),
match_fun =list(`>=`, `<=`)) %>%
mutate(found = c(NA, "YES")[(!is.na(start)) + 1])
seqnames name value start end found
<chr> <chr> <int> <int> <int> <chr>
1 chr2 start 181529780 NA NA NA
2 chr2 end 181533313 NA NA NA
3 chr2 start 98396674 98396674 98396940 YES
4 chr2 end 98396940 98396674 98396940 YES
5 chr5 start 64919375 NA NA NA
6 chr5 end 64919395 NA NA NA
7 chr9 start 2795948 NA NA NA
8 chr9 end 2797647 NA NA NA
9 chr7 start 138873527 NA NA NA
10 chr7 end 138873574 NA NA NA
# ... with 28 more rows
coords_int <- structure(list(seqnames = c("chr2", "chr2", "chr5", "chr9", "chr7",
"chr4", "chr6", "chr10", "chr17", "chr6", "chr10", "chr10", "chr10",
"chr10", "chr17", "chr17", "chr11", "chr16", "chr16"), start = c(181529780L,
98396674L, 64919375L, 2795948L, 138873527L, 154736072L, 10762723L,
93614727L, 76539181L, 99608741L, 47330654L, 47331176L, 93612154L,
84248910L, 28547999L, 28548592L, 46701067L, 49847678L, 49822670L
), end = c(181533313L, 98396940L, 64919395L, 2797647L, 138873574L,
154736138L, 10769212L, 93614773L, 76539300L, 99608872L, 47330828L,
47331237L, 93612575L, 84249043L, 28548101L, 28548705L, 46701141L,
49847918L, 49822738L)), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15", "16", "17", "18", "19"))
pdoms_protein <- structure(list(tx_id = c("ENST00000339098", "ENST00000339098",
"ENST00000339098", "ENST00000393504", "ENST00000393504", "ENST00000393504",
"ENST00000381070", "ENST00000381070", "ENST00000381070", "ENST00000381070",
"ENST00000381070", "ENST00000382082", "ENST00000382082", "ENST00000422774",
"ENST00000422774", "ENST00000422774", "ENST00000336356", "ENST00000502525",
"ENST00000507827", "ENST00000313243", "ENST00000313243", "ENST00000313243",
"ENST00000313243"), seqnames = c(2L, 2L, 2L, 2L, 2L, 2L, 5L,
5L, 5L, 5L, 5L, 9L, 9L, 7L, 7L, 7L, 4L, 4L, 4L, 6L, 6L, 6L, 6L
), start = c(181573753L, 181573753L, 181566058L, 98395844L, 98395844L,
98396674L, 64774694L, 64774694L, 64774694L, 64781921L, 64781921L,
2718229L, 2718229L, 138881388L, 138879538L, 138871157L, 154744456L,
154744456L, 154744456L, 10830548L, 10830548L, 10830548L, 10830548L
), end = c(181573876L, 181573876L, 181566121L, 98396397L, 98396397L,
98396940L, 64774787L, 64774787L, 64774787L, 64782033L, 64782033L,
2718276L, 2718276L, 138881584L, 138879653L, 138871362L, 154744845L,
154744530L, 154744845L, 10830639L, 10830639L, 10830639L, 10830639L
), width = c(124L, 124L, 64L, 554L, 554L, 267L, 94L, 94L, 94L,
113L, 113L, 48L, 48L, 197L, 116L, 206L, 390L, 75L, 390L, 92L,
92L, 92L, 92L), strand = c("-", "-", "-", "+", "+", "+", "+",
"+", "+", "+", "+", "+", "+", "-", "-", "-", "+", "+", "+", "-",
"-", "-", "-"), exon_id = c("ENSE00003634697", "ENSE00003634697",
"ENSE00003523731", "ENSE00000963920", "ENSE00000963920", "ENSE00000963920",
"ENSE00003522928", "ENSE00003522928", "ENSE00003522928", "ENSE00003582136",
"ENSE00003582136", "ENSE00001490869", "ENSE00001490869", "ENSE00001088065",
"ENSE00001088074", "ENSE00001088067", "ENSE00001344788", "ENSE00002048458",
"ENSE00001344788", "ENSE00003696993", "ENSE00003696993", "ENSE00003696993",
"ENSE00003696993"), exon_rank = c(3L, 3L, 4L, 8L, 8L, 8L, 2L,
2L, 2L, 3L, 3L, 1L, 1L, 11L, 12L, 13L, 2L, 4L, 2L, 2L, 2L, 2L,
2L), cds_ok = c(TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE,
TRUE, TRUE, TRUE, TRUE, TRUE), protein_start = c(164L, 164L,
164L, 171L, 171L, 502L, 16L, 16L, 16L, 16L, 16L, 164L, 164L,
1039L, 1039L, 1039L, 44L, 44L, 44L, 4L, 4L, 4L, 4L), protein_end = c(339L,
339L, 339L, 409L, 409L, 590L, 166L, 166L, 166L, 166L, 166L, 179L,
179L, 1702L, 1702L, 1702L, 173L, 68L, 173L, 284L, 284L, 284L,
284L), protein_domain_id = c("PS50146", "PF00781", "PS50146",
"PF16526", "SM00100", "PS50042", "PR00153", "PR00153", "PR00153",
"PS50072", "PS00170", "PR01494", "PR01491", "PF12877", "PF12877",
"PF12877", "PF04970", "PF04970", "PF04970", "PS50011", "PS00107",
"PS00108", "SSF56112"), protein_domain_source = c("pfscan", "pfam",
"pfscan", "pfam", "smart", "pfscan", "prints", "prints", "prints",
"pfscan", "scanprosite", "prints", "prints", "pfam", "pfam",
"pfam", "pfam", "pfam", "pfam", "pfscan", "scanprosite", "scanprosite",
"superfamily"), interpro_accession = c("IPR001206", "IPR001206",
"IPR001206", "IPR032406", "IPR000595", "IPR000595", "IPR002130",
"IPR002130", "IPR002130", "IPR002130", "IPR020892", "IPR003971",
"IPR003968", "IPR024606", "IPR024606", "IPR024606", "IPR007053",
"IPR007053", "IPR007053", "IPR000719", "IPR017441", "IPR008271",
"IPR011009"), prot_dom_start = c(164L, 170L, 164L, 598L, 482L,
482L, 125L, 97L, 112L, 19L, 49L, 187L, 410L, 1039L, 1039L, 1039L,
44L, 44L, 44L, 4L, 10L, 121L, 1L), prot_dom_end = c(339L, 334L,
339L, 668L, 606L, 596L, 140L, 112L, 124L, 166L, 66L, 199L, 424L,
1702L, 1702L, 1702L, 173L, 68L, 173L, 284L, 33L, 133L, 285L),
gene_name = c("CERKL", "CERKL", "CERKL", "CNGA3", "CNGA3",
"CNGA3", "CWC27", "CWC27", "CWC27", "CWC27", "CWC27", "KCNV2",
"KCNV2", "KIAA1549", "KIAA1549", "KIAA1549", "LRAT", "LRAT",
"LRAT", "MAK", "MAK", "MAK", "MAK")), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15", "16", "17", "18", "19", "20", "21", "22", "23"))
You could use data.table::foverlaps(), like this:
library(data.table)
setDT(coords_int)
setDT(pdoms_protein)
setkey(coords_int,start,end)
foverlaps(pdoms_protein,coords_int)
Also see package IRanges
Not tested, but I think you could make a function like this
pdoms_protein %>%
mutate(dom.ok = isok(start,end)) %>%
filter(dom.ok == "TRUE")
isok <- function(local_start, local_end) {
df <- coords_int %>%
filter(start >= local_start & end <= local_end)
return count(df) > 0
}

How to Create Values based on Start-Stop Info in Separate Column

I have a very messy dataset created by a research device. This data shows a physiological measure ("Physio") for every few milliseconds ("Time"). The output lists several user messages, such as when a trial starts ("START_TRIAL n"), when a trial ends ("STOP_TRIAL"), and other random things that may be of interest to the researcher. Some times the "START_TRIAL n" message is repeated consecutively, and sometimes when there is no message, a simple "0" is left in what would otherwise be a blank cell.
I am hoping to create a new column that will signify which trial the current case belongs to. (See example data below).
Is there a way to do this with dplyr and mutate? I am wondering if I may need to do an if-then statement that changes the values of a new column for every case, but surely there's a more elegant solution? (Thank you in advance for helping out this newbie!)
Time Physio Cond
1 34 START_TRIAL 1
2 33 0
3 25 RANDOM_MSG
4 43 STOP_TRIAL
5 27 START_TRIAL 2
6 54 START_TRIAL 2
7 32 0
8 54 RANDOM_MSG
9 23 STOP_TRIAL
structure(list(Time = 1:9, Physio = c(34L, 33L, 25L, 43L, 27L,
54L, 32L, 54L, 23L), Cond = structure(c(4L, 2L, 3L, 6L, 5L, 5L,
2L, 3L, 6L), .Label = c("", "0", "RANDOM_MSG", "START_TRIAL 1",
"START_TRIAL 2", "STOP_TRIAL"), class = "factor")), .Names = c("Time",
"Physio", "Cond"), row.names = c(NA, 9L), class = "data.frame")
into
Time Physio Trial Cond
1 34 1 START_TRIAL 1
2 33 1 0
3 25 1 RANDOM_MSG
4 43 1 STOP_TRIAL
5 27 2 START_TRIAL 2
6 54 2 START_TRIAL 2
7 32 2 0
8 54 2 RANDOM_MSG
9 23 2 STOP_TRIAL
structure(list(Time = 1:9, Physio = c(34L, 33L, 25L, 43L, 27L,
54L, 32L, 54L, 23L), Trial = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
2L), Cond = structure(c(4L, 2L, 3L, 6L, 5L, 5L, 2L, 3L, 6L), .Label = c("",
"0", "RANDOM_MSG", "START_TRIAL 1", "START_TRIAL 2", "STOP_TRIAL"
), class = "factor")), .Names = c("Time", "Physio", "Trial",
"Cond"), row.names = c(NA, 9L), class = "data.frame")
One option would be to identify the 'START_TRIAL' with grep, do a match to get the index and fill the NA elements with the previous non-NA adjacent element
library(dplyr)
library(tidyr)
df1 %>%
mutate(Trial = match(PhysioCond, unique(grep("START_TRIAL",
PhysioCond, value = TRUE)))) %>%
fill(Trial)
# Time PhysioCond Trial
#1 34 START_TRIAL 1 1
#2 33 0 1
#3 25 RANDOM_MSG 1
#4 43 STOP_TRIAL 1
#5 27 START_TRIAL 2 2
#6 54 START_TRIAL 2 2
#7 32 0 2
#8 54 RANDOM_MSG 2
#9 23 STOP_TRIAL 2
NOTE: Not clear about the column name, but the logic should work well
data
df1 <- structure(list(Time = c(34L, 33L, 25L, 43L, 27L, 54L, 32L, 54L,
23L), PhysioCond = c("START_TRIAL 1", "0", "RANDOM_MSG", "STOP_TRIAL",
"START_TRIAL 2", "START_TRIAL 2", "0", "RANDOM_MSG", "STOP_TRIAL"
)), class = "data.frame", row.names = c("1", "2", "3", "4", "5",
"6", "7", "8", "9"))

Converting a list of vectors and numbers (from replicate) into a data frame

After running the replicate() function [a close relative of lapply()] on some data I ended up with an output that looks like this
myList <- structure(list(c(55L, 13L, 61L, 38L, 24L), 6.6435972422341, c(37L, 1L, 57L, 8L, 40L), 5.68336098665417, c(19L, 10L, 23L, 52L, 60L ),
5.80430476680636, c(39L, 47L, 60L, 14L, 3L), 6.67554407822367,
c(57L, 8L, 53L, 6L, 2L), 5.67149520387856, c(40L, 8L, 21L,
17L, 13L), 5.88446015238962, c(52L, 21L, 22L, 55L, 54L),
6.01685181395007, c(12L, 7L, 1L, 2L, 14L), 6.66299948053721,
c(41L, 46L, 21L, 30L, 6L), 6.67239635545512, c(46L, 31L,
11L, 44L, 32L), 6.44174324641076), .Dim = c(2L, 10L), .Dimnames = list(
c("reps", "score"), NULL))
In this case the vectors of integers are indexes that went into a function that I won't get into and the scalar-floats are scores.
I'd like a data frame that looks like
Index 1 Index 2 Index 3 Index 4 Index 5 Score
55 13 61 38 24 6.64
37 1 57 8 40 5.68
19 10 23 52 60 5.80
and so on.
Alternatively, a matrix of the indexes and an array of the values would be fine too.
Things that haven't worked for me.
data.frame(t(random.out)) # just gives a data frame with a column of vectors and another of scalars
cbind(t(random.out)) # same as above
do.call(rbind, random.out) # intersperses vectors and scalars
I realize other people have similar problems,
eg. Convert list of vectors to data frame
but I can't quite find an example with this particular kind of vectors and scalars together.
myList[1,] is a list of vectors, so you can combine them into a matrix with do.call and rbind. myList[2,] is a list of single scores, so you can combine them into a vector with unlist:
cbind(as.data.frame(do.call(rbind, myList[1,])), Score=unlist(myList[2,]))
# V1 V2 V3 V4 V5 Score
# 1 55 13 61 38 24 6.643597
# 2 37 1 57 8 40 5.683361
# 3 19 10 23 52 60 5.804305
# 4 39 47 60 14 3 6.675544
# 5 57 8 53 6 2 5.671495
# 6 40 8 21 17 13 5.884460
# 7 52 21 22 55 54 6.016852
# 8 12 7 1 2 14 6.662999
# 9 41 46 21 30 6 6.672396
# 10 46 31 11 44 32 6.441743

Facing difficulty in convert a data.frame to time series object in R?

I am a novice in R language. I am having text file separated by tab available with sales data for each day. The format will be like product-id, day0, day1, day2, day3 and so on. The part of the input file given below
productid 0 1 2 3 4 5 6
1 53 40 37 45 69 105 62
4 0 0 2 4 0 8 0
5 57 133 60 126 90 87 107
6 108 130 143 92 88 101 66
10 0 0 2 0 4 0 36
11 17 22 16 15 45 32 36
I used code below to read a file
pdInfo <- read.csv("products.txt",header = TRUE, sep="\t")
This allows to read the entire file and variable x is a data frame. I would like to change data.frame x to time series object in order for the further processing.On a stationary test, Dickey–Fuller test (ADF) it shows an error. I tried the below code
x <- ts(data.matrix(pdInfo),frequency = 1)
adf <- adf.test(x)
error: Error in adf.test(x) : x is not a vector or univariate time series
Thanks in advance for the suggestions
In R, time series are usually in the form "one row per date", where your data is in the form "one column per date". You probably need to transpose the data before you convert to a ts object.
First transpose it:
y= t(pdInfo)
Then make the top row (being the product id's) into the row titles
colnames(y) = y[1,]
y= y[-1,] # to drop the first row
This should work:
x = ts(y, frequency = 1)
library(purrr)
library(dplyr)
library(tidyr)
library(tseries)
# create the data
df <- structure(list(productid = c(1L, 4L, 5L, 6L, 10L, 11L),
X0 = c(53L, 0L, 57L, 108L, 0L, 17L),
X1 = c(40L, 0L, 133L, 130L, 0L, 22L),
X2 = c(37L, 2L, 60L, 143L, 2L, 16L),
X3 = c(45L, 4L, 126L, 92L, 0L, 15L),
X4 = c(69L, 0L, 90L, 88L, 4L, 45L),
X5 = c(105L, 8L, 87L, 101L, 0L, 32L),
X6 = c(62L, 0L, 107L, 66L, 36L, 36L)),
.Names = c("productid", "0", "1", "2", "3", "4", "5", "6"),
class = "data.frame", row.names = c(NA, -6L))
# apply adf.test to each productid and return p.value
adfTest <- df %>% gather(key = day, value = sales, -productid) %>%
arrange(productid, day) %>%
group_by(productid) %>%
nest() %>%
mutate(adf = data %>% map(., ~adf.test(as.ts(.$sales)))
,adf.p.value = adf %>% map_dbl(., "p.value")) %>%
select(productid, adf.p.value)

Resources